From 381b0b0e678d12940b0a8004573dacae31931b9a Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Tue, 5 Mar 2024 23:23:36 +0800 Subject: [PATCH 001/114] [PIR] support wrap_type_interface. (#62422) --- .../pir/dialect/distributed/ir/dist_type.cc | 4 +-- .../pir/dialect/distributed/ir/dist_type.h | 14 +++++--- .../pir/dialect/distributed/ir/type_storage.h | 14 ++++---- paddle/pir/include/core/builtin_type.h | 9 +++++ .../include/core/builtin_type_interfaces.h | 25 ++++++++++++++ .../include/core/storage_manager_support.h | 2 +- paddle/pir/src/core/builtin_type.cc | 19 +++++++++++ .../pir/src/core/builtin_type_interfaces.cc | 1 + test/cpp/pir/distributed/dist_dialect_test.cc | 34 +++++++++++++++++++ 9 files changed, 107 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc index 94a2d85fbcdd7..5044fb5b0b5c2 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc @@ -26,8 +26,8 @@ TensorDistAttribute DistDenseTensorType::tensor_dist_attr() const { return storage()->tensor_dist_attr; } -const common::DDim& DistDenseTensorType::global_ddim() const { - return storage()->global_ddim; +const common::DDim& DistDenseTensorType::local_ddim() const { + return storage()->local_ddim; } DistDenseTensorType DistDenseTensorType::get( diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_type.h b/paddle/fluid/pir/dialect/distributed/ir/dist_type.h index bfcd92d30cb37..7b35c52c7ea58 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_type.h +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_type.h @@ -24,18 +24,22 @@ namespace dialect { class DistDenseTensorTypeStorage; class DistDenseTensorType - : public pir::Type:: - TypeBase { + : public pir::Type::TypeBase { public: using Base::Base; pir::DenseTensorType dense_tensor_type() const; TensorDistAttribute tensor_dist_attr() const; - const common::DDim& global_ddim() const; - const common::DDim& local_ddim() const { return dense_tensor_type().dims(); } + const common::DDim& global_ddim() const { return dense_tensor_type().dims(); } + const common::DDim& local_ddim() const; Type dtype() const { return dense_tensor_type().dtype(); } DataLayout data_layout() const { return dense_tensor_type().data_layout(); } + Type prim_type() { return dense_tensor_type(); } + ProcessMeshAttribute process_mesh_attr() const { return tensor_dist_attr().process_mesh_attr(); } @@ -52,7 +56,7 @@ class DistDenseTensorType static DistDenseTensorType get(pir::IrContext* ctx, pir::DenseTensorType dense_tensor_type, TensorDistAttribute tensor_dist_attr, - const common::DDim& global_ddim); + const common::DDim& local_ddim); }; } // namespace dialect diff --git a/paddle/fluid/pir/dialect/distributed/ir/type_storage.h b/paddle/fluid/pir/dialect/distributed/ir/type_storage.h index 1f18573d3e162..05b09aa3ab4de 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/type_storage.h +++ b/paddle/fluid/pir/dialect/distributed/ir/type_storage.h @@ -33,10 +33,10 @@ struct DistDenseTensorTypeStorage : public pir::TypeStorage { DistDenseTensorTypeStorage(pir::DenseTensorType dense_tensor_type, TensorDistAttribute tensor_dist_attr, - const common::DDim& global_ddim) + const common::DDim& local_ddim) : dense_tensor_type(dense_tensor_type), tensor_dist_attr(tensor_dist_attr), - global_ddim(global_ddim) {} + local_ddim(local_ddim) {} /// /// \brief Each derived TypeStorage must define a 
Construct method, which @@ -53,10 +53,10 @@ struct DistDenseTensorTypeStorage : public pir::TypeStorage { static std::size_t HashValue(const ParamKey& key) { auto dense_tensor_type_hash = std::hash()(std::get<0>(key)); auto tensor_dist_attr_hash = std::hash()(std::get<1>(key)); - auto global_ddim_hash = std::hash()(std::get<2>(key)); + auto local_ddim_hash = std::hash()(std::get<2>(key)); auto value = pir::detail::hash_combine(dense_tensor_type_hash, tensor_dist_attr_hash); - return pir::detail::hash_combine(value, global_ddim_hash); + return pir::detail::hash_combine(value, local_ddim_hash); } /// @@ -65,16 +65,16 @@ struct DistDenseTensorTypeStorage : public pir::TypeStorage { bool operator==(const ParamKey& key) const { return dense_tensor_type == std::get<0>(key) && tensor_dist_attr == std::get<1>(key) && - global_ddim == std::get<2>(key); + local_ddim == std::get<2>(key); } /// /// \brief DistDenseTensorTypeStorage include three parameters: - /// dense_tensor_type, tensor_dist_attr and global_ddim; + /// dense_tensor_type, tensor_dist_attr and local_ddim; /// pir::DenseTensorType dense_tensor_type; TensorDistAttribute tensor_dist_attr; - common::DDim global_ddim; + common::DDim local_ddim; }; } // namespace dialect diff --git a/paddle/pir/include/core/builtin_type.h b/paddle/pir/include/core/builtin_type.h index 3218707277a7a..144b62bb9753e 100644 --- a/paddle/pir/include/core/builtin_type.h +++ b/paddle/pir/include/core/builtin_type.h @@ -66,6 +66,15 @@ class IR_API DenseTensorType : public Type::TypeBase { + public: + struct Concept { + /// Defined these methods with the interface. + explicit Concept(Type (*prim_type)(Type)) : prim_type(prim_type) {} + Type (*prim_type)(Type); + }; + + template + struct Model : public Concept { + static Type prim_type(Type type) { + return pir::cast(type).prim_type(); + } + Model() : Concept(prim_type) {} + }; + + WrapTypeInterface(Type type, Concept *impl) + : TypeInterfaceBase(type), impl_(impl) {} + + Type prim_type() { return impl_->prim_type(*this); } + + private: + Concept *impl_; +}; } // namespace pir IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::ShapedTypeInterface) +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::WrapTypeInterface) diff --git a/paddle/pir/include/core/storage_manager_support.h b/paddle/pir/include/core/storage_manager_support.h index b729a4480ac35..614f3938c54e2 100644 --- a/paddle/pir/include/core/storage_manager_support.h +++ b/paddle/pir/include/core/storage_manager_support.h @@ -90,7 +90,7 @@ class StorageHelperBase : public BaseT { /// template static bool classof(T val) { - return val.type_id() == type_id(); + return val && val.type_id() == type_id(); } /// diff --git a/paddle/pir/src/core/builtin_type.cc b/paddle/pir/src/core/builtin_type.cc index 0da20a6b83bd1..96b83c8f6fe58 100644 --- a/paddle/pir/src/core/builtin_type.cc +++ b/paddle/pir/src/core/builtin_type.cc @@ -30,6 +30,25 @@ const DenseTensorType::LoD& DenseTensorType::lod() const { } size_t DenseTensorType::offset() const { return storage()->offset_; } +bool DenseTensorType::classof(Type type) { + if (type) { + if (type.type_id() == type_id()) return true; + if (auto wrap_type = type.dyn_cast()) { + return classof(wrap_type.prim_type()); + } + } + return false; +} +DenseTensorType DenseTensorType::dyn_cast_impl(Type type) { + if (type) { + if (type.type_id() == type_id()) return DenseTensorType(type.storage()); + if (auto wrap_type = type.dyn_cast()) { + return dyn_cast_impl(wrap_type.prim_type()); + } + } + return nullptr; +} + } // namespace pir 
 IR_DEFINE_EXPLICIT_TYPE_ID(pir::UInt8Type)
diff --git a/paddle/pir/src/core/builtin_type_interfaces.cc b/paddle/pir/src/core/builtin_type_interfaces.cc
index 5b8d14b74175a..25ec38c709bef 100644
--- a/paddle/pir/src/core/builtin_type_interfaces.cc
+++ b/paddle/pir/src/core/builtin_type_interfaces.cc
@@ -27,3 +27,4 @@ pir::DDim ShapedTypeInterface::GetShape() const {
 }  // namespace pir
 
 IR_DEFINE_EXPLICIT_TYPE_ID(pir::ShapedTypeInterface)
+IR_DEFINE_EXPLICIT_TYPE_ID(pir::WrapTypeInterface)
diff --git a/test/cpp/pir/distributed/dist_dialect_test.cc b/test/cpp/pir/distributed/dist_dialect_test.cc
index 4969a25c5cfd3..31bf69ea77030 100644
--- a/test/cpp/pir/distributed/dist_dialect_test.cc
+++ b/test/cpp/pir/distributed/dist_dialect_test.cc
@@ -128,6 +128,40 @@ TEST(dist_dense_tensor_type_test, base) {
   EXPECT_EQ(dist_densor_type.local_ddim(), dims);
 }
 
+TEST(dist_dense_tensor_type_test, wrap_type_interface) {
+  pir::IrContext* ctx = pir::IrContext::Instance();
+  ctx->GetOrRegisterDialect<DistDialect>();
+  ctx->GetOrRegisterDialect<pir::BuiltinDialect>();
+  std::vector<int64_t> mesh_shape = {2, 3};
+  std::vector<int64_t> process_ids = {0, 1, 2, 3, 4, 5};
+  std::vector<std::string> dim_names = {"x", "y"};
+  phi::distributed::ProcessMesh process_mesh(
+      mesh_shape, process_ids, dim_names);
+  auto mesh_attr = ProcessMeshAttribute::get(ctx, process_mesh);
+
+  std::vector<int64_t> dims_mapping = {0, -1};
+  paddle::flat_hash_map<int64_t, phi::ReduceType> partial_status{
+      {1, phi::ReduceType::kRedSum}};
+  // construct a TensorDistAttribute.
+  auto tensor_dist_attr =
+      TensorDistAttribute::get(ctx, mesh_attr, dims_mapping, partial_status);
+
+  pir::Type fp32_dtype = pir::Float32Type::get(ctx);
+  common::DDim dims = {2, 2};
+  common::DataLayout data_layout = common::DataLayout::NCHW;
+  pir::LoD lod = {{0, 1, 2}};
+  size_t offset = 0;
+  pir::DenseTensorType dense_tensor_type = pir::DenseTensorType::get(
+      ctx, fp32_dtype, dims, data_layout, lod, offset);
+
+  pir::Type dist_densor_type =
+      DistDenseTensorType::get(ctx, dense_tensor_type, tensor_dist_attr, dims);
+
+  EXPECT_TRUE(dist_densor_type.isa<pir::DenseTensorType>());
+  EXPECT_EQ(dist_densor_type.dyn_cast<pir::DenseTensorType>(),
+            dense_tensor_type);
+}
+
 TEST(operation_dist_attr_test, base) {
   pir::IrContext* ctx = pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<DistDialect>();

From ca0a28580a50b29b16251fa21085375289652bcc Mon Sep 17 00:00:00 2001
From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com>
Date: Tue, 5 Mar 2024 23:24:42 +0800
Subject: [PATCH 002/114] [PIR] [DyShape] Fix cinn_reshape with case shape
 including 0 (#62415)

* fix cinn_reshape

* bugfix
---
 .../infer_symbolic_shape/cinn_op_infer_sym.cc | 59 +++++++++++++++++--
 1 file changed, 55 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc
index 932012bf0622f..34dd2821d3fc4 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc
@@ -125,10 +125,61 @@ bool ReshapeOpInferSymbolicShape(
   std::vector<int> shape =
       paddle::dialect::details::GetVectorAttr<int>(op, "shape");
 
-  std::vector<symbol::DimExpr> out_dims;
-  for (int dim : shape) {
-    out_dims.emplace_back(static_cast<std::int64_t>(dim));
-  }
+  const auto &GetProduct = [&](const auto &dim_exprs, const auto &Filter) {
+    symbol::DimExpr product{1};
+    for (const auto &dim_expr : dim_exprs) {
+      if (Filter(dim_expr)) {
+        product = product * dim_expr;
+      }
+    }
+    return product;
+  };
+
+  const auto &IsNotMinusOne = [&](const symbol::DimExpr &dim_expr) {
+    if (dim_expr.isa<std::int64_t>()) {
+      return dim_expr.dyn_cast<std::int64_t>() !=
+             static_cast<std::int64_t>(-1);
+    }
+    return true;
+  };
+
+  const auto &IsZero = [&](const symbol::DimExpr &dim_expr) {
+    if (dim_expr.isa<std::int64_t>()) {
+      return dim_expr.dyn_cast<std::int64_t>() == static_cast<std::int64_t>(0);
+    }
+    return false;
+  };
+
+  const auto &target_shape = [&] {
+    std::vector<symbol::DimExpr> target_shape;
+    for (int dim : shape) {
+      target_shape.emplace_back(static_cast<std::int64_t>(dim));
+    }
+    return target_shape;
+  }();
+
+  const auto &original_shape =
+      shape_analysis->GetShapeOrDataForValue(op->operand_source(0)).shape();
+
+  const auto &out_dims = [&] {
+    const auto &numel =
+        GetProduct(original_shape, [](const auto &) { return true; });
+
+    const auto &product_exclude_minus_one =
+        GetProduct(target_shape, IsNotMinusOne);
+
+    std::vector<symbol::DimExpr> out_dims;
+    out_dims.reserve(target_shape.size());
+    for (size_t i = 0; i < target_shape.size(); ++i) {
+      auto out_dim_expr = IsNotMinusOne(target_shape[i])
+                              ? target_shape[i]
+                              : (numel / product_exclude_minus_one);
+      out_dim_expr = IsZero(target_shape[i]) ? original_shape[i] : out_dim_expr;
+      out_dims.emplace_back(out_dim_expr);
+    }
+
+    return out_dims;
+  }();
+
   symbol::ShapeOrDataDimExprs shape_data{
       symbol::TensorShapeOrDataDimExprs(out_dims)};
   shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data);

From 5a7828bdd9f82489eb493dcb435bd7465a3654b4 Mon Sep 17 00:00:00 2001
From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com>
Date: Tue, 5 Mar 2024 23:25:46 +0800
Subject: [PATCH 003/114] llama group: add llama group (#62325)

* add llama log softmax subgraph

* add swiglu test case

* fix code

* fix code
---
 test/ir/pir/cinn/symbolic/CMakeLists.txt      |   2 +-
 .../symbolic/test_llama_group_log_softmax.py  | 120 ++++++++++++++++++
 .../cinn/symbolic/test_llama_group_swiglu.py  |  84 ++++++++++++
 3 files changed, 205 insertions(+), 1 deletion(-)
 create mode 100644 test/ir/pir/cinn/symbolic/test_llama_group_log_softmax.py
 create mode 100644 test/ir/pir/cinn/symbolic/test_llama_group_swiglu.py

diff --git a/test/ir/pir/cinn/symbolic/CMakeLists.txt b/test/ir/pir/cinn/symbolic/CMakeLists.txt
index 3349cddf6c34d..97d918e0832b1 100644
--- a/test/ir/pir/cinn/symbolic/CMakeLists.txt
+++ b/test/ir/pir/cinn/symbolic/CMakeLists.txt
@@ -32,7 +32,7 @@ if(WITH_GPU)
         ${CMAKE_COMMAND} -E env
         PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH}
         FLAGS_enable_pir_api=1 FLAGS_cinn_bucket_compile=True
-        FLAGS_pir_apply_shape_optimization_pass=1
+        FLAGS_prim_enable_dynamic=true FLAGS_pir_apply_shape_optimization_pass=1
         FLAGS_group_schedule_tiling_first=1 FLAGS_cinn_new_group_scheduler=1
         ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/${cinn_pir_test_name}.py
diff --git a/test/ir/pir/cinn/symbolic/test_llama_group_log_softmax.py b/test/ir/pir/cinn/symbolic/test_llama_group_log_softmax.py
new file mode 100644
index 0000000000000..a99808951389e
--- /dev/null
+++ b/test/ir/pir/cinn/symbolic/test_llama_group_log_softmax.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+from os.path import dirname
+
+import numpy as np
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+from paddle.base import core
+from paddle.static import InputSpec
+
+sys.path.append(dirname(dirname(__file__)))
+sys.path.append("../")
+import utils
+
+
+def update_scores_for_generation(
+    scores, next_scores, length, unfinished_flag=None
+):
+    # update scores
+
+    unfinished_scores = (scores * length + next_scores) / (length + 1)
+    return unfinished_scores
+
+
+def tmp(logits, scores, next_tokens, length):
+    origin_probs = F.log_softmax(logits)  # [-1,32000], f16
+
+    # compute next_tokens
+    # logits = logits / temperature
+    # top_ps_tensor = paddle.full(shape=[paddle.shape(probs)[0], 1], fill_value=top_p, dtype=probs.dtype)
+    # _, next_tokens = paddle.tensor.top_p_sampling(probs, top_ps_tensor)
+
+    next_scores = paddle.index_sample(
+        origin_probs, next_tokens
+    )  # (pd_op.tensor<-1x32000xf16>, pd_op.tensor<-1x1xi64>) -> pd_op.tensor<-1x1xf16>
+    scores = update_scores_for_generation(scores, next_scores, length)
+    return scores
+
+
+class TestGroupOpNet(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, scores, next_tokens, length):
+        out = tmp(x, scores, next_tokens, length)
+        return out
+
+
+class TestGroupOp(unittest.TestCase):
+    def setUp(self):
+        paddle.seed(2024)
+        self.prepare_data()
+
+    def prepare_data(self):
+        self.shape1 = [1, 32000]
+        self.x = paddle.randn(self.shape1, dtype="float16")
+        self.x.stop_gradient = False
+        self.score_s = [1, 1]
+        self.score = paddle.randn(self.score_s, dtype="float16")
+        self.score.stop_gradient = False
+
+        self.shape2 = [1, 1]
+        self.y = paddle.full(self.shape2, 1, dtype="int64")
+        self.y.stop_gradient = False
+        self.shape3 = [1]
+        self.z = paddle.full(self.shape3, 1, dtype="int64")
+        self.z.stop_gradient = False
+
+    def check_jit_kernel_info(self, static_fn):
+        utils.check_jit_kernel_number(static_fn, 1)
+        utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1})
+
+    def eval(self, use_cinn=False, mode="jit"):
+        net = TestGroupOpNet()
+        if mode == "eager":
+            out = net(self.x, self.score, self.y, self.z)
+        else:
+            input_spec = [
+                InputSpec(shape=[None, 32000], dtype="float16"),
+                InputSpec(shape=[None, 1], dtype="float16"),
+                InputSpec(shape=[None, 1], dtype="int64"),
+                InputSpec(shape=[1], dtype="int64"),
+            ]
+            net = utils.apply_to_static(net, use_cinn, input_spec)
+            net.eval()
+            out = net(self.x, self.score, self.y, self.z)
+        if use_cinn:
+            self.check_jit_kernel_info(net.forward)
+        return out
+
+    def test_eval(self):
+        dy_out = self.eval(mode="eager")
+        core._set_prim_all_enabled(True)
+        # cinn_out = self.eval(use_cinn=utils.unittest_use_cinn())
+        cinn_out = self.eval(use_cinn=False)
+        np.testing.assert_allclose(
+            cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6
+        )
+        core._set_prim_all_enabled(False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/ir/pir/cinn/symbolic/test_llama_group_swiglu.py b/test/ir/pir/cinn/symbolic/test_llama_group_swiglu.py
new file mode 100644
index 0000000000000..ebb09be9cadb0
--- /dev/null
+++ b/test/ir/pir/cinn/symbolic/test_llama_group_swiglu.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+from os.path import dirname
+
+import numpy as np
+
+import paddle
+from paddle import nn
+from paddle.base import core
+from paddle.static import InputSpec
+
+sys.path.append(dirname(dirname(__file__)))
+sys.path.append("../")
+
+
+import utils
+
+
+class TransposeReshapeNet(nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, y):
+        out = paddle.incubate.nn.functional.swiglu(x, y)
+
+        return out
+
+
+class TestTransposeReshape(unittest.TestCase):
+    def setUp(self):
+        paddle.seed(2024)
+        self.prepare_data()
+
+    def prepare_data(self):
+        self.x = paddle.randn([4, 32, 11008], dtype="float16")
+        self.y = paddle.randn([4, 32, 11008], dtype="float16")
+
+    def check_jit_kernel_info(self, static_fn):
+        utils.check_jit_kernel_number(static_fn, 1)
+        utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1})
+
+    def eval(self, use_cinn=False, mode="jit"):
+        net = TransposeReshapeNet()
+        if mode == "eager":
+            out = net(self.x, self.y)
+        else:
+            input_spec = [
+                InputSpec(shape=[None, None, 11008], dtype="float16"),
+                InputSpec(shape=[None, None, 11008], dtype="float16"),
+            ]
+            net = utils.apply_to_static(net, use_cinn, input_spec)
+            net.eval()
+            out = net(self.x, self.y)
+        if use_cinn:
+            self.check_jit_kernel_info(net.forward)
+        return out
+
+    def test_eval(self):
+        dy_out = self.eval(mode="eager")
+        core._set_prim_all_enabled(True)
+        # cinn_out = self.eval(use_cinn=utils.unittest_use_cinn())
+        cinn_out = self.eval(use_cinn=False)
+        np.testing.assert_allclose(
+            cinn_out.numpy(), dy_out.numpy(), atol=1e-2, rtol=1e-2
+        )
+        core._set_prim_all_enabled(False)
+
+
+if __name__ == '__main__':
+    unittest.main()

From fa07d311a7c4e91b5ba62257440be1e5ef578e35 Mon Sep 17 00:00:00 2001
From: risemeup1 <62429225+risemeup1@users.noreply.github.com>
Date: Wed, 6 Mar 2024 09:40:44 +0800
Subject: [PATCH 004/114] fix JetPack_bug (#62426)

---
 python/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index fcd93656b30b3..375e8308e5d0a 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -192,6 +192,7 @@ add_custom_target(paddle_python ALL
 if(BUILD_WHL_PACKAGE AND NOT WITH_SETUP_INSTALL)
   add_custom_target(paddle_copy ALL
                     DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp_wheel)
+  add_dependencies(paddle_copy paddle_python)
 endif()
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)

From 6bb3ae51ce5370687f3f798cf4711bec238a7732 Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Wed, 6 Mar 2024 09:46:48 +0800
Subject: [PATCH 005/114] support pd slice op 0D to 1D (#62442)

---
 .../group_merge/convert_0d_to_1d_pass.cc      | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_0d_to_1d_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_0d_to_1d_pass.cc
index de8383bd107f1..588312cc80114 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_0d_to_1d_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_0d_to_1d_pass.cc @@ -61,6 +61,27 @@ class FullOpPattern : public pir::OpRewritePattern { } }; +class SliceOpPattern : public pir::OpRewritePattern { + public: + using pir::OpRewritePattern::OpRewritePattern; + + bool Match(paddle::dialect::SliceOp op) const override { + const auto& tensor_type = + op.result(0).type().dyn_cast(); + + return tensor_type.dims().size() == 0; + } + + void Rewrite(paddle::dialect::SliceOp op, + pir::PatternRewriter& rewriter) const override { + std::vector vec_dims; + pir::Attribute attr_dims = + pir::ArrayAttribute::get(pir::IrContext::Instance(), vec_dims); + + op->set_attribute("decrease_axis", attr_dims); + } +}; + class SumOpPattern : public pir::OpRewritePattern { public: using pir::OpRewritePattern::OpRewritePattern; @@ -188,6 +209,7 @@ class Convert0DTo1DPass : public pir::Pass { ps.Add(context); ps.Add(context); ps.Add(context); + ps.Add(context); patterns_ = pir::FrozenRewritePatternSet(std::move(ps)); return true; } From 0d98d15fd5289bccce5eb47d8551676ffa78fcfc Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Wed, 6 Mar 2024 10:09:48 +0800 Subject: [PATCH 006/114] [SOT] Always generate `false_fn` when `POP_JUMP_*` breakgraph (#62424) --- .../opcode_translator/executor/opcode_executor.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py index 0d832c3b5cf85..40a4c3ae62460 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@ -1791,8 +1791,13 @@ def _break_graph_when_if(self, result: TensorVariable, instr: Instruction): stack_size_after_if = len(self.stack) - 1 # 2. create true_fn and false_fn - def create_if_branch_fn(start_idx, input_var_names): - if self._instructions[start_idx].opname == "RETURN_VALUE": + def create_if_branch_fn(start_idx, input_var_names, is_pop_jump_branch): + # JUMP_IF_* maybe jump to the RETURN_VALUE, we should skip this case + # We shouldn't skip POP_JUMP_* case, because it will cause the stack size to be incorrect + if ( + self._instructions[start_idx].opname == "RETURN_VALUE" + and not is_pop_jump_branch + ): return None pycode_gen = PyCodeGen(self._frame) origin_instrs = get_instructions(pycode_gen._origin_code) @@ -1815,6 +1820,7 @@ def create_if_branch_fn(start_idx, input_var_names): true_fn = create_if_branch_fn( start_idx=true_fn_start_index, input_var_names=true_fn_input_var_names, + is_pop_jump_branch=False, ) false_fn_read_names, _ = analysis_used_names( @@ -1827,6 +1833,7 @@ def create_if_branch_fn(start_idx, input_var_names): false_fn = create_if_branch_fn( start_idx=false_fn_start_index, input_var_names=false_fn_input_var_names, + is_pop_jump_branch=instr.opname.startswith("POP_JUMP"), ) # 4. 
setup vars which is created in loop as Undefind @@ -1881,6 +1888,7 @@ def create_if_branch_fn(start_idx, input_var_names): else: false_start_code = self._graph.pycode_gen.gen_return() + # Replace the jump instruction with the new if structure if_code.jump_to = false_start_code self.new_code = self._graph.pycode_gen.gen_pycode() From f4b6eeabb56d5cee8ed74f0b2f53b50ba0eb680a Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Wed, 6 Mar 2024 10:15:05 +0800 Subject: [PATCH 007/114] add cinn mode check (#62418) --- python/paddle/base/framework.py | 15 ++++++++++++++- test/ir/pir/test_pir_executor_flag.py | 13 ++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 5d3801dcddf2e..a306004bca62a 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -340,7 +340,7 @@ def in_dynamic_or_pir_mode(): def in_pir_executor_mode(): """ - This API checks whether paddle runs iin pir executor mode. + This API checks whether paddle runs in pir executor mode. Returns: bool: Whether paddle runs in pir executor mode. @@ -350,6 +350,19 @@ def in_pir_executor_mode(): return flag in ("true", "1") +def in_cinn_mode(): + """ + + This API checks whether paddle runs in cinn mode. + + Returns: + bool: Whether paddle runs in cinn mode. + + """ + flag = str(os.environ.get("FLAGS_use_cinn")).lower() + return flag in ("true", "1") + + global_ipu_index = -1 global_ipu_stage = -1 ipu_index_attr_name = 'ipu_index' diff --git a/test/ir/pir/test_pir_executor_flag.py b/test/ir/pir/test_pir_executor_flag.py index b8fd5e09700bc..7a79a68302f79 100644 --- a/test/ir/pir/test_pir_executor_flag.py +++ b/test/ir/pir/test_pir_executor_flag.py @@ -15,15 +15,22 @@ import os import unittest -from paddle.base.framework import in_pir_executor_mode +from paddle.base.framework import in_cinn_mode, in_pir_executor_mode -class TestPrimFlags(unittest.TestCase): - def test_prim_flags(self): +class TestPIRModeFlags(unittest.TestCase): + def test_pir_mode_flags(self): self.assertTrue(in_pir_executor_mode()) os.environ["FLAGS_enable_pir_in_executor"] = "false" self.assertFalse(in_pir_executor_mode()) +class TestCinnModeFlags(unittest.TestCase): + def test_cinn_mode_flags(self): + self.assertFalse(in_cinn_mode()) + os.environ["FLAGS_use_cinn"] = "true" + self.assertTrue(in_cinn_mode()) + + if __name__ == '__main__': unittest.main() From 68bfa8691bc259df68d7360ca33ea999c31bb389 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 6 Mar 2024 10:36:30 +0800 Subject: [PATCH 008/114] [PIR+CINN]Add Llama2 subgraph for backend test (#62313) * [PIR+CINN]Add Llama2 subgraph for backend test * add 2 subgraph * add more UT * add more UT * add more UT * fix zip * disable --- .../symbolic/test_llama_concat_slice_scale.py | 83 ++++++++++++ .../pir/cinn/symbolic/test_llama_multi_add.py | 91 +++++++++++++ .../symbolic/test_llama_pow_sum_divide.py | 93 +++++++++++++ .../cinn/symbolic/test_llama_slice_concat.py | 126 ++++++++++++++++++ .../symbolic/test_llama_transpose_reshape.py | 125 +++++++++++++++++ .../symbolic/test_llama_unsqueeze_expand.py | 84 ++++++++++++ .../cinn/symbolic/test_reshape_zero_shape.py | 76 +++++++++++ 7 files changed, 678 insertions(+) create mode 100644 test/ir/pir/cinn/symbolic/test_llama_concat_slice_scale.py create mode 100644 test/ir/pir/cinn/symbolic/test_llama_multi_add.py create mode 100644 test/ir/pir/cinn/symbolic/test_llama_pow_sum_divide.py create mode 100644 
test/ir/pir/cinn/symbolic/test_llama_slice_concat.py create mode 100644 test/ir/pir/cinn/symbolic/test_llama_transpose_reshape.py create mode 100644 test/ir/pir/cinn/symbolic/test_llama_unsqueeze_expand.py create mode 100644 test/ir/pir/cinn/symbolic/test_reshape_zero_shape.py diff --git a/test/ir/pir/cinn/symbolic/test_llama_concat_slice_scale.py b/test/ir/pir/cinn/symbolic/test_llama_concat_slice_scale.py new file mode 100644 index 0000000000000..f50500ff2a35f --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_llama_concat_slice_scale.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import unittest +from os.path import dirname + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class ConcatSliceScaleNet(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, y): + x_shape = paddle.shape(x) + # Use 'y' to generate 'cond' and 'right' to avoid + # usless operations in paddle.where api. + cond = y.cast(dtype="bool") + right = y + + z = paddle.where(cond, y, right) + out0 = paddle.concat([x, z], axis=1) + out1 = out0[x_shape[1] :] + out2 = out1 * 1 + return out2 + + +class TestConcatSliceScale(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.x = paddle.randint(0, 100, [32, 128], dtype="int64") + self.y = paddle.randint(0, 100, [32, 1], dtype="int64") + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = ConcatSliceScaleNet() + input_spec = [ + InputSpec(shape=[None, None], dtype="int64"), + InputSpec(shape=[None, 1], dtype="int64"), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x, self.y) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + pass + # dy_out = self.eval(use_cinn=False) + # if utils.unittest_use_cinn(): + # cinn_out = self.eval(use_cinn=True) + # np.testing.assert_allclose( + # cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + # ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_llama_multi_add.py b/test/ir/pir/cinn/symbolic/test_llama_multi_add.py new file mode 100644 index 0000000000000..655eb11f89f88 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_llama_multi_add.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class MultiAddNet(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + shape = paddle.shape(x) + mask = paddle.full(shape, 0, dtype="bool") + + x1 = paddle.full([1], 0, dtype="float64") + x2 = paddle.full([1], -65504, dtype="float64") + x3 = paddle.full([1], 0, dtype="float64") + x4 = paddle.full([1], 0, dtype="float64") + + y = mask.cast("float64") + z = x.cast("float64") + + s0 = x3 + x4 + s1 = s0 + y + s2 = x1 + s1 + s3 = x2 + s1 + s4 = (z + s1).cast("bool") + + return s2, s3, s4 + + +class TestMultiAdd(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.x = paddle.randint(0, 1, [64, 1, 32, 128], dtype="int64").astype( + "bool" + ) + self.x.stop_gradient = False + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = MultiAddNet() + input_spec = [InputSpec(shape=[None, 1, None, None], dtype="bool")] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_outs = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_outs = self.eval(use_cinn=True) + for dy_out, cinn_out in zip(dy_outs, cinn_outs): + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_llama_pow_sum_divide.py b/test/ir/pir/cinn/symbolic/test_llama_pow_sum_divide.py new file mode 100644 index 0000000000000..8817eadf74835 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_llama_pow_sum_divide.py @@ -0,0 +1,93 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class PowSumDivideNet(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, y, z, w): + s0 = paddle.shape(y) + s1 = paddle.shape(x)[1].reshape([1]) + + shape = paddle.concat([s0, s1]) + out0 = paddle.reshape(z, shape).cast("float32") + + out1 = out0.pow(2) + out2 = out1.sum(axis=2, keepdim=True) + factor = paddle.full([1], 4096, dtype="float32") + out3 = out2.divide(factor) + out4 = out3 + 1e-6 + out5 = out4.pow(-0.5) + out6 = out5.multiply(out0).cast("float16") + out7 = out6.multiply(w) + + return out7 + + +class TestPowSumDivide(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.x = paddle.randn([64, 4096], dtype="float16") + self.y = paddle.randint(0, 100, [64, 2], dtype="int64") + self.z = paddle.randn([64, 8192], dtype="float16") + self.w = paddle.randn([4096], dtype="float16") + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = PowSumDivideNet() + input_spec = [ + InputSpec(shape=[None, 4096], dtype="float16"), + InputSpec(shape=[None, None], dtype="int64"), + InputSpec(shape=[None, 4096], dtype="float16"), + InputSpec(shape=[4096], dtype="float16"), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x, self.y, self.z, self.w) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_out = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_llama_slice_concat.py b/test/ir/pir/cinn/symbolic/test_llama_slice_concat.py new file mode 100644 index 0000000000000..595a406304bd3 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_llama_slice_concat.py @@ -0,0 +1,126 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class SliceMultiConcatNet(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + x0 = paddle.shape(x)[0].reshape([1]) + x1 = paddle.full([1], 1, dtype="int32") + out0 = paddle.concat([x0, x1]) + + y = paddle.full([1], 1, dtype="int32") + out1 = paddle.concat([x0, y]) + return out0, out1 + + +class TestSliceMultiConcat(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.shape = [64, 128] + self.x = paddle.randint(0, 100, self.shape, dtype="int64") + self.x.stop_gradient = False + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = SliceMultiConcatNet() + input_spec = [ + InputSpec(shape=[None, None], dtype="int64"), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_outs = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_outs = self.eval(use_cinn=True) + for dy_out, cinn_out in zip(dy_outs, cinn_outs): + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +class SliceConcatNet(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + x0 = paddle.shape(x)[0].reshape([1]) + x1 = paddle.full([1], 1, dtype="int32") + out = paddle.concat([x0, x1]) + return out + + +class TestSliceConcat(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.x = paddle.randn([1, 32000], dtype="float16") + self.x.stop_gradient = False + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = SliceConcatNet() + input_spec = [ + InputSpec(shape=[None, 32000], dtype="float16"), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_out = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_llama_transpose_reshape.py b/test/ir/pir/cinn/symbolic/test_llama_transpose_reshape.py new file mode 100644 index 0000000000000..4bcedd5625c39 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_llama_transpose_reshape.py @@ -0,0 +1,125 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class TransposeReshapeNet(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, y): + y_shape = paddle.shape(y) + s0 = y_shape[0] + s1 = y_shape[1] + s2 = 4096 + y = paddle.transpose(x, [0, 2, 1, 3]) + out = paddle.reshape(y, [s0, s1, s2]) + + return out + + +class TestTransposeReshape(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.x = paddle.randn([4, 32, 128, 128], dtype="float16") + self.y = paddle.randn([4, 128, 32, 128], dtype="float16") + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = TransposeReshapeNet() + input_spec = [ + InputSpec(shape=[None, 32, None, None], dtype="float16"), + InputSpec(shape=[None, None, 32, 128], dtype="float16"), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x, self.y) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_out = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +class ReshapeTransposeNet(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + y = paddle.reshape(x, [0, 0, 32, 128]) + out = paddle.transpose(y, [0, 2, 1, 3]) + + return out + + +class TestReshapeTranspose(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.x = paddle.randn([4, 16, 4096], dtype="float16") + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = ReshapeTransposeNet() + input_spec = [ + InputSpec(shape=[None, None, 4096], dtype="float16"), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_out = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_llama_unsqueeze_expand.py b/test/ir/pir/cinn/symbolic/test_llama_unsqueeze_expand.py new file mode 100644 index 0000000000000..819aedcd871c9 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_llama_unsqueeze_expand.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class UnsqueezeExpandNet(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, y): + s0 = paddle.shape(x)[0] + s1 = 1 + s2 = paddle.shape(y)[0] + s3 = paddle.shape(x)[1] + + z = x.unsqueeze([1, 2]).cast(bool) + z.stop_gradient = True + out = paddle.expand(z, [s0, s1, s2, s3]) + return out + + +class TestUnsqueezeExpand(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.x = paddle.randint(0, 100, [64, 128], dtype="int64") + self.x.stop_gradient = False + self.y = paddle.randint(0, 100, [64, 32], dtype="int64") + self.y.stop_gradient = False + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = UnsqueezeExpandNet() + input_spec = [ + InputSpec(shape=[None, None], dtype="int64"), + InputSpec(shape=[None, None], dtype="int64"), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x, self.y) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_out = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_reshape_zero_shape.py b/test/ir/pir/cinn/symbolic/test_reshape_zero_shape.py new file mode 100644 index 0000000000000..be99e8b1b69e6 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_reshape_zero_shape.py @@ -0,0 +1,76 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class ReshapeZeroShapeNet(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + # "O" represents COPY semantics. 
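+        # With x of shape [4, 4, 4096] (see prepare_data), the two zeros copy
+        # dims 0 and 1, giving an output of [4, 4, 32, 128] (4096 == 32 * 128).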
+ out = paddle.reshape(x, shape=[0, 0, 32, 128]) + return out + + +class TestReshapeZeroShape(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.shape = [4, 4, 4096] + self.x = paddle.randn(self.shape, dtype="float32") + self.x.stop_gradient = False + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = ReshapeZeroShapeNet() + input_spec = [ + InputSpec(shape=[None, None, 4096], dtype="float32"), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_out = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() From 2e1899e1f8023c062674f0482305719b2f8811fa Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Wed, 6 Mar 2024 11:26:12 +0800 Subject: [PATCH 009/114] sharding supports reduce_avg communication (#62147) --- .../framework/distributed_strategy.proto | 1 + paddle/phi/core/distributed/nccl_tools.cc | 13 +++--- .../distributed/communication/all_reduce.py | 19 +++++++- .../distributed/communication/reduce.py | 20 ++++++++- .../communication/reduce_scatter.py | 19 +++++++- .../dygraph_sharding_optimizer.py | 19 +++++++- .../fleet/utils/tensor_fusion_helper.py | 29 +++++++++++-- .../dygraph_group_sharded_stage1_fp16.py | 43 +++++++++++++++++++ 8 files changed, 150 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 58460fcf9064b..6cc52fba01236 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -91,6 +91,7 @@ message DygraphShardingConfig { optional bool comm_overlap = 3 [ default = false ]; optional bool split_param = 4 [ default = false ]; optional bool fuse_optimizer = 5 [ default = true ]; + optional bool use_reduce_avg = 6 [ default = true ]; } message HybridConfig { diff --git a/paddle/phi/core/distributed/nccl_tools.cc b/paddle/phi/core/distributed/nccl_tools.cc index a5388796d1f45..d79466922976a 100644 --- a/paddle/phi/core/distributed/nccl_tools.cc +++ b/paddle/phi/core/distributed/nccl_tools.cc @@ -29,17 +29,20 @@ namespace distributed { ncclRedOp_t ToNCCLRedType(ReduceOp reduction) { static const std::unordered_map red_type = { - {ReduceOp::MIN, ncclMin}, - {ReduceOp::MAX, ncclMax}, - {ReduceOp::SUM, ncclSum}, - {ReduceOp::PRODUCT, ncclProd}, + {ReduceOp::MIN, ncclMin}, + {ReduceOp::MAX, ncclMax}, + {ReduceOp::SUM, ncclSum}, + {ReduceOp::PRODUCT, ncclProd}, +#if NCCL_VERSION_CODE >= 21000 + {ReduceOp::AVG, ncclAvg}, +#endif }; auto it = red_type.find(reduction); PADDLE_ENFORCE_EQ(it != red_type.end(), true, phi::errors::InvalidArgument( "Invalid nccl reduction. 
Must be ncclMin | ncclMax | " - "ncclProd | ncclSum")); + "ncclProd | ncclSum | ncclAvg.")); return it->second; } diff --git a/python/paddle/distributed/communication/all_reduce.py b/python/paddle/distributed/communication/all_reduce.py index 1ed26315a5d28..bef362a43cb7c 100644 --- a/python/paddle/distributed/communication/all_reduce.py +++ b/python/paddle/distributed/communication/all_reduce.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle from paddle.distributed.communication import stream from paddle.distributed.communication.reduce import ReduceOp @@ -32,7 +33,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True): Args: tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. - op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD|ReduceOp.AVG, optional): The operation used. Default value is ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Wether this op is a sync op. Default value is True. @@ -55,6 +56,22 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True): >>> print(data) >>> # [[5, 7, 9], [5, 7, 9]] (2 GPUs) """ + # AVG is only supported when nccl >= 2.10 + if op == ReduceOp.AVG and paddle.base.core.nccl_version() < 21000: + group = ( + paddle.distributed.collective._get_global_group() + if group is None + else group + ) + tensor.scale_(1.0 / group.nranks) + return stream.all_reduce( + tensor, + op=ReduceOp.SUM, + group=group, + sync_op=sync_op, + use_calc_stream=False, + ) + return stream.all_reduce( tensor, op=op, group=group, sync_op=sync_op, use_calc_stream=False ) diff --git a/python/paddle/distributed/communication/reduce.py b/python/paddle/distributed/communication/reduce.py index e3c8d9bc13aa4..5ddffbda4c73b 100644 --- a/python/paddle/distributed/communication/reduce.py +++ b/python/paddle/distributed/communication/reduce.py @@ -65,6 +65,8 @@ def _get_reduce_op(reduce_op, func_name): return framework.core.ReduceOp.MIN elif reduce_op == ReduceOp.PROD: return framework.core.ReduceOp.PRODUCT + elif reduce_op == ReduceOp.AVG: + return framework.core.ReduceOp.AVG else: if reduce_op == ReduceOp.SUM: return f'c_{func_name}_sum' @@ -96,7 +98,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. - op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD|ReduceOp.AVG, optional): The operation used. Default value is ReduceOp.SUM. group (Group, optional): The group instance return by new_group or None for global default group. sync_op (bool, optional): Whether this op is a sync op. The default value is True. 
@@ -120,6 +122,22 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): >>> # [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0) >>> # [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1) """ + # AVG is only supported when nccl >= 2.10 + if op == ReduceOp.AVG and paddle.base.core.nccl_version() < 21000: + group = ( + paddle.distributed.collective._get_global_group() + if group is None + else group + ) + tensor.scale_(1.0 / group.nranks) + return stream.reduce( + tensor, + dst=dst, + op=ReduceOp.SUM, + group=group, + sync_op=sync_op, + use_calc_stream=False, + ) return stream.reduce( tensor, dst=dst, diff --git a/python/paddle/distributed/communication/reduce_scatter.py b/python/paddle/distributed/communication/reduce_scatter.py index 0265e0a0b52c6..8513d79f8c7fa 100644 --- a/python/paddle/distributed/communication/reduce_scatter.py +++ b/python/paddle/distributed/communication/reduce_scatter.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle from paddle.distributed.communication import stream from paddle.distributed.communication.reduce import ReduceOp from paddle.distributed.communication.stream.reduce_scatter import ( @@ -30,7 +31,7 @@ def reduce_scatter( float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. tensor_list (List[Tensor]]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. - op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD|ReduceOp.AVG, optional): The reduction used. If none is given, use ReduceOp.SUM as default. group (Group, optional): Communicate in which group. If none is given, use the global group as default. sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. 
@@ -61,6 +62,22 @@ def reduce_scatter( >>> # [8, 10] (2 GPUs, out for rank 1) """ + # AVG is only supported when nccl >= 2.10 + if op == ReduceOp.AVG and paddle.base.core.nccl_version() < 21000: + group = ( + paddle.distributed.collective._get_global_group() + if group is None + else group + ) + tensor.scale_(1.0 / group.nranks) + return stream.reduce_scatter( + tensor, + tensor_list, + op=ReduceOp.SUM, + group=group, + sync_op=sync_op, + use_calc_stream=False, + ) return stream.reduce_scatter( tensor, tensor_list, diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index fef3f878c2e97..eb09eb66ae353 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -23,6 +23,7 @@ from paddle.base.dygraph import base as imperative_base from paddle.base.framework import EagerParamBase from paddle.distributed import fleet +from paddle.distributed.communication.reduce import ReduceOp from ...utils.log_util import logger from ...utils.tensor_fusion_helper import ( @@ -97,6 +98,16 @@ def __init__(self, optimizer, hcg): self.fuse_optimizer = strategy.hybrid_configs[ 'sharding_configs' ].fuse_optimizer + self.use_reduce_avg = strategy.hybrid_configs[ + 'sharding_configs' + ].use_reduce_avg + if self.use_reduce_avg and paddle.base.core.nccl_version() < 21000: + self.use_reduce_avg = False + warnings.warn( + "nccl reduce_avg requires nccl>=2.10.0, but current version is %s" + % paddle.base.core.nccl_version() + ) + pp_overlap = strategy.hybrid_configs['pp_configs'].sharding_comm_overlap if self.tensor_fusion or self.comm_overlap: assert ( @@ -207,6 +218,7 @@ def _tensor_fusion(self): acc_step=self.accumulate_steps, scale_after_comm=False, apply_decay_param_fun=self.origin_decay_param_fun, + use_reduce_avg=self.use_reduce_avg, ) if self.comm_overlap: self._comm_buffers += all_buffer @@ -281,7 +293,6 @@ def reduce_gradients(self, parameter_list, hcg): buffer.scale_grads() return with framework.no_grad(): - sharding_nrank = hcg.get_sharding_parallel_group().nranks for param in parameter_list: g_var = None if param.trainable and (param._grad_ivar() is not None): @@ -292,11 +303,14 @@ def reduce_gradients(self, parameter_list, hcg): ), "param.grad should be None when using main_grad" g_var = param.main_grad if g_var is not None: - g_var.scale_(1.0 / sharding_nrank) + reduce_op = ( + ReduceOp.AVG if self.use_reduce_avg else ReduceOp.SUM + ) param_rank = self._param2rank[param.name] if not g_shard_use_reduce: paddle.distributed.all_reduce( g_var, + op=reduce_op, group=hcg.get_sharding_parallel_group(), sync_op=True, ) @@ -307,6 +321,7 @@ def reduce_gradients(self, parameter_list, hcg): dst=hcg.get_sharding_parallel_group().ranks[ param_rank ], + op=reduce_op, group=hcg.get_sharding_parallel_group(), sync_op=True, ) diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py index 4be5a5d2d27ee..82bf2ce38b2e4 100644 --- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py +++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py @@ -352,6 +352,7 @@ def __init__( fuse_param=False, scale_after_comm=True, release_grads=False, + use_reduce_avg=False, ): self._id = id self._params = params @@ -360,6 +361,7 @@ def 
__init__(
         self._scale_after_comm = scale_after_comm
         self._fuse_param = fuse_param
         self._release_grads = release_grads
+        self._use_reduce_avg = use_reduce_avg
 
         assert not (
             self._fuse_param and self._release_grads
@@ -573,19 +575,29 @@ def comm_grads(self):
 
     @imperative_base.no_grad
     def _comm_grads(self):
-        if not self._scale_after_comm:
+        reduce_op = (
+            paddle.distributed.ReduceOp.AVG
+            if self._use_reduce_avg
+            else paddle.distributed.ReduceOp.SUM
+        )
+        # scale will be skipped when the reduce_avg comm operation is enabled.
+        if not self._scale_after_comm and not self._use_reduce_avg:
             scale_factor = 1.0 / self._comm_group.nranks
             self.grad_storage.scale_(scale_factor)
 
         if self._act == HOOK_ACTION.ALL_REDUCE:
             task = paddle.distributed.all_reduce(
-                self.grad_storage, group=self._comm_group, sync_op=False
+                self.grad_storage,
+                op=reduce_op,
+                group=self._comm_group,
+                sync_op=False,
             )
 
         elif self._act == HOOK_ACTION.REDUCE:
             task = paddle.distributed.reduce(
                 self.grad_storage,
                 dst=self._dst,
+                op=reduce_op,
                 group=self._comm_group,
                 sync_op=False,
             )
@@ -598,6 +610,7 @@ def _comm_grads(self):
             task = paddle.distributed.reduce_scatter(
                 reduce_scattered,
                 self.grad_storage,
+                op=reduce_op,
                 group=self._comm_group,
                 sync_op=False,
             )
@@ -608,7 +621,8 @@ def scale_grads(self):
         assert self._task is not None, "Task is not initialized."
         self._task.wait()
 
-        if self._scale_after_comm:
+        # scale will be skipped when the reduce_avg comm operation is used
+        if self._scale_after_comm and not self._use_reduce_avg:
             scale_factor = 1.0 / self._comm_group.nranks
             self.grad_storage.scale_(scale_factor)
 
@@ -636,6 +650,7 @@ def obtain_storage(
     dst=-1,
     acc_steps=1,
     scale_after_comm=False,
+    use_reduce_avg=False,
 ):
     if len(parameters) < 1:
         return [], []
@@ -654,6 +669,7 @@ def obtain_storage(
             use_main_grad=use_main_grad,
             fuse_param=fuse_param,
             scale_after_comm=scale_after_comm,
+            use_reduce_avg=use_reduce_avg,
         )
         if fuse_param:
             param_buffer = comm_buffer.param_storage
@@ -714,6 +730,7 @@ def _fused_parameters_impl(
     acc_step=1,
     scale_after_comm=False,
     apply_decay_param_fun=None,
+    use_reduce_avg=False,
 ):
     param_groups = []
     attrs = []
@@ -764,6 +781,7 @@ def _fused_parameters_impl(
             dst=dst,
             acc_steps=acc_step,
             scale_after_comm=scale_after_comm,
+            use_reduce_avg=use_reduce_avg,
         )
         other, other_buffers = obtain_storage(
             other_params,
@@ -777,6 +795,7 @@ def _fused_parameters_impl(
             dst=dst,
             acc_steps=acc_step,
             scale_after_comm=scale_after_comm,
+            use_reduce_avg=use_reduce_avg,
         )
         decay_fused += decay
         all_fused += decay
@@ -799,6 +818,7 @@ def fused_parameters(
     scale_after_comm=False,
     group_params=False,
     apply_decay_param_fun=None,
+    use_reduce_avg=False,
 ):
     """
     Fuse gradients. Fuse parameters if enabled. Prepare for comm overlap if enabled.
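The three paths above are mutually exclusive: either the backend averages (reduce_avg), or the fused buffer is scaled by 1/nranks before the SUM collective, or after it. A minimal single-process sketch of that dispatch, assuming a callable `allreduce(grads, op)` in place of the real collective (all names here are illustrative, not the FusedCommBuffer API):

    def sync_grads(grads, nranks, scale_after_comm, use_reduce_avg, allreduce):
        if use_reduce_avg:                      # backend divides by nranks itself
            return allreduce(grads, "avg")
        if not scale_after_comm:                # pre-scale, then plain SUM
            return allreduce([g / nranks for g in grads], "sum")
        return [g / nranks for g in allreduce(grads, "sum")]  # SUM, then scale

    def fake_allreduce(grads, op):              # models two ranks holding equal grads
        return [g * (1 if op == "avg" else 2) for g in grads]

    for scale_after, use_avg in [(False, True), (True, False), (False, False)]:
        assert sync_grads([4.0], 2, scale_after, use_avg, fake_allreduce) == [4.0]

All three flag combinations must produce the same averaged gradient, which is what the reduce_avg test added below verifies end to end.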
@@ -813,6 +833,7 @@ def fused_parameters( :param scale_after_comm: if enable comm overlap, specify the location of grad scale :param group_params: the format of the input parameters is param group :param apply_decay_param_fun: the function to filter decay param + :param use_reduce_avg: use reduce_avg comm operation instead of scale and reduce_sum :return: param storage if fused, comm buffers if comm overlap, param groups if use group params """ if act is None: @@ -859,6 +880,7 @@ def fused_parameters( acc_step=acc_step, scale_after_comm=scale_after_comm, apply_decay_param_fun=apply_decay_param_fun, + use_reduce_avg=use_reduce_avg, ) if comm_overlap: comm_buffers.extend(group_all_buffers) @@ -879,6 +901,7 @@ def fused_parameters( acc_step=acc_step, scale_after_comm=scale_after_comm, apply_decay_param_fun=apply_decay_param_fun, + use_reduce_avg=use_reduce_avg, ) return decay_fused, all_fused, all_buffers diff --git a/test/collective/fleet/dygraph_group_sharded_stage1_fp16.py b/test/collective/fleet/dygraph_group_sharded_stage1_fp16.py index 93e163b9facca..e1de31cbc543a 100644 --- a/test/collective/fleet/dygraph_group_sharded_stage1_fp16.py +++ b/test/collective/fleet/dygraph_group_sharded_stage1_fp16.py @@ -83,6 +83,9 @@ def train_mlp( accumulate_grad=False, use_main_grad=False, test_scaler=False, + sharding_use_reduce_avg=False, + comm_overlap=False, + tensor_fusion=False, ): scaler = None scale_loss = 1024 @@ -120,6 +123,13 @@ def train_mlp( "sharding_degree": 2, } strategy.hybrid_configs = hybrid_configs + strategy.hybrid_configs[ + "sharding_configs" + ].use_reduce_avg = sharding_use_reduce_avg + strategy.hybrid_configs["sharding_configs"].comm_overlap = comm_overlap + strategy.hybrid_configs[ + "sharding_configs" + ].tensor_fusion = tensor_fusion fleet.init(is_collective=True, strategy=strategy) model = fleet.distributed_model(model) @@ -251,6 +261,39 @@ def test_stage1_fp16(): ).detach() np.testing.assert_array_equal(o2_loss_grad_acc, o1_loss_grad_acc) + # nccl reduce_avg test + mlp7 = MLP() + mlp8 = MLP() + mlp7.set_state_dict(state_dict) + mlp8.set_state_dict(state_dict) + losses_reduce_avg = train_mlp( + mlp7, + sharding_stage=1, + use_pure_fp16=True, + use_main_grad=True, + sharding_use_reduce_avg=True, + ) + losses_reduce_avg_commoverlap = train_mlp( + mlp8, + sharding_stage=1, + use_pure_fp16=True, + use_main_grad=True, + sharding_use_reduce_avg=True, + comm_overlap=True, + tensor_fusion=True, + ) + for i in range(len(o2_losses)): + loss_reduce_avg = paddle.cast( + losses_reduce_avg[i], dtype='float32' + ).detach() + loss_reduce_avg_commoverlap = paddle.cast( + losses_reduce_avg_commoverlap[i], dtype='float32' + ).detach() + loss = paddle.cast(o2_losses[i], dtype='float32').detach() + + np.testing.assert_array_equal(loss_reduce_avg, loss) + np.testing.assert_array_equal(loss_reduce_avg_commoverlap, loss) + return From c3229dd405de87211a4af93555c3b5b625cf22fa Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 6 Mar 2024 11:36:28 +0800 Subject: [PATCH 010/114] fix some bug of while test (#62440) --- paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc index 6b311820fc81a..ec7191e171937 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc @@ -101,7 +101,6 @@ void ApplyCinnPreprocessPass( 
pass_manager->AddPass( cinn::dialect::ir::CreateFuseShapeOpsIntoGenerateShapeOpPass()); pass_manager->AddPass(pir::CreateDeadCodeEliminationPass()); - pass_manager->AddPass(pir::CreateShapeOptimizationPass()); } pass_manager->AddPass(cinn::dialect::ir::CreateRemoveUnchangedReshapePass()); @@ -115,6 +114,7 @@ void ApplyBuildGroupOpPass( std::shared_ptr pass_manager = CreatePassManager(); pass_manager->AddPass(pir::CreateBuildCinnPass()); if (HasDynamicShape(*program)) { + pass_manager->AddPass(pir::CreateShapeOptimizationPass()); pass_manager->AddPass(cinn::dialect::ir::CreateInsertBroadcastPass()); } pass_manager->Run(program); From 4bf4895211988d2e802d93adf493f65541b80098 Mon Sep 17 00:00:00 2001 From: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Date: Wed, 6 Mar 2024 12:42:56 +0800 Subject: [PATCH 011/114] [PIR][DynamicShape] Fix bug in cinn_op.slice (#62320) * Fix bug in cinn_op.slice * bug fix * fix cinn slice * support symbol in `starts` and `ends` * support TensorListShapeOrDataDimExprs --- .../infer_symbolic_shape/cinn_op_infer_sym.cc | 69 +++---- .../infer_sym_slice_utils.h | 191 ++++++++++++++++++ .../infer_symbolic_shape/infer_sym_utils.cc | 10 + .../infer_symbolic_shape/infer_sym_utils.h | 2 + .../paddle_op_infer_sym.cc | 180 ++--------------- .../pir/transforms/shape_optimization_pass.cc | 3 +- .../cinn/symbolic/test_op_infer_sym_shape.py | 17 +- 7 files changed, 252 insertions(+), 220 deletions(-) create mode 100644 paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc index 34dd2821d3fc4..d52270e5b3b66 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h" +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h" namespace cinn::dialect { @@ -189,52 +190,30 @@ bool ReshapeOpInferSymbolicShape( bool SliceOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - // TODO(zhangbopd): Not implemented yet, different from the one in paddle - // dialect. And Currently only support start/end/axis with single value. 
- pir::AttributeMap attributes = op->attributes(); - - auto GetAttrInt64Value = [&](const std::string &name) -> int64_t { - std::vector attr = - attributes[name].dyn_cast().AsVector(); - PADDLE_ENFORCE_GT( - attr.size(), - 0, - phi::errors::PreconditionNotMet( - "Only Support [%s] op len(%s) == 1 , but received %d.", - op->name(), - name, - attr.size())); - return attr[0].dyn_cast().data(); - }; - - const int64_t start = GetAttrInt64Value("starts"); - const int64_t end = GetAttrInt64Value("ends"); - const int64_t axis = GetAttrInt64Value("axes"); - - const pir::Value operand_source = op->operand_source(0); - const auto &operand_shape_or_data = - shape_analysis->GetShapeOrDataForValue(operand_source); + const std::vector starts_raw = + paddle::dialect::details::GetVectorAttr(op, "starts"); + const std::vector ends_raw = + paddle::dialect::details::GetVectorAttr(op, "ends"); + const std::vector axes_raw = + paddle::dialect::details::GetVectorAttr(op, "axes"); + const std::vector infer_flags_raw = + paddle::dialect::details::GetVectorAttr(op, "infer_flags"); + const std::vector decrease_axis_raw = + paddle::dialect::details::GetVectorAttr(op, "decrease_axis"); + + const ExprVec starts = paddle::dialect::details::VecInt642Expr(starts_raw); + const ExprVec ends = paddle::dialect::details::VecInt642Expr(ends_raw); + + shape_analysis->SetShapeOrDataForValue( + op->result(0), + paddle::dialect::slice_uitls::SliceRawInferSymbolicShape( + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)), + starts, + ends, + axes_raw, + infer_flags_raw, + decrease_axis_raw)); - const auto GetOutDimExprs = [&]() -> symbol::TensorShapeOrDataDimExprs { - std::vector out_sym_shape = operand_shape_or_data.shape(); - if (end == std::numeric_limits::max()) { - out_sym_shape[axis] = out_sym_shape[axis] - start; - } else { - out_sym_shape[axis] = end - start; - } - symbol::TensorShapeOrDataDimExprs shape_dim_expr(out_sym_shape); - if (operand_shape_or_data.data().has_value()) { - std::vector out_data; - for (int64_t i = start; i < end; i++) { - out_data.push_back(operand_shape_or_data.data().value()[i]); - } - shape_dim_expr.SetData(out_data); - } - return shape_dim_expr; - }; - symbol::ShapeOrDataDimExprs shape_data{GetOutDimExprs()}; - - shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); return true; } diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h new file mode 100644 index 0000000000000..4e6a026748196 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h @@ -0,0 +1,191 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
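The shared helper introduced in this new header centralizes the slice rules that were previously duplicated, and simplified, in the CINN dialect. A concrete-shape model of those rules (negative starts/ends wrap around the dimension, an end of INT64_MAX clamps to it, and axes named in decrease_axis are dropped) using plain Python ints; this is illustrative only, the real code operates on symbolic DimExprs:

    INT64_MAX = 2**63 - 1

    def slice_out_shape(in_dims, axes, starts, ends, decrease_axis=()):
        dims = list(in_dims)
        for ax, s, e in zip(axes, starts, ends):
            e = dims[ax] if e == INT64_MAX else e   # "slice to the end"
            s = s + dims[ax] if s < 0 else s        # wrap negative indices
            e = e + dims[ax] if e < 0 else e
            dims[ax] = e - s
        return [d for i, d in enumerate(dims) if i not in set(decrease_axis)]

    assert slice_out_shape([8, 4, 6], [0, 2], [-3, 2], [INT64_MAX, 4]) == [3, 4, 2]
    assert slice_out_shape([8, 4, 6], [1], [0], [1], decrease_axis=[1]) == [8, 6]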
+ +#pragma once + +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h" + +namespace paddle::dialect::slice_uitls { + +inline ExprVec GetExprVecFromData(const ShapeOrData &shapeordata) { + if (shapeordata.isa()) { + ExprVec result; + TensorListExprs list = + shapeordata.dyn_cast(); + for (size_t i = 0; i < list.size(); i++) { + for (auto expr : list[i].data().value()) { + result.emplace_back(expr); + } + } + return result; + } else { + return shapeordata.data().value(); + } +} + +inline void CheckAndUpdateSliceAttrs( + const ExprVec &in_dims, + const std::vector &axes, + ExprVec *starts_p, + ExprVec *ends_p, + std::vector *infer_flags = nullptr) { + ExprVec &starts = *starts_p; + ExprVec &ends = *ends_p; + auto IsMaxInt = [](const symbol::DimExpr &expr) { + return expr.isa() && + expr.Get() == + static_cast(std::numeric_limits::max()); + }; + + for (size_t i = 0; i < axes.size(); ++i) { + int64_t axis = axes[i]; + int64_t start_i = 0; + if (starts[i].isa()) { + start_i = starts[i].Get(); + } + int64_t end_i = 0; + if (ends[i].isa()) { + end_i = ends[i].Get(); + } + + // For both start and end can be negative or positive, we need to handle the + // following different arrangements. + ends[i] = IsMaxInt(ends[i]) ? in_dims[axis] : ends[i]; + + bool both_negative_or_positive = + (start_i >= 0 && end_i >= 0) || (start_i <= 0 && end_i <= 0); + bool start_negative_end_positive = start_i <= 0 && end_i >= 0; + bool start_positive_end_negative = start_i >= 0 && end_i <= 0; + + if (both_negative_or_positive) { + continue; + } else if (start_negative_end_positive) { + starts[i] = starts[i] + in_dims[axis]; + } else if (start_positive_end_negative) { + starts[i] = starts[i] - in_dims[axis]; + } else { + LOG(FATAL) << "Dead code"; + } + } +} + +inline ExprVec GetSliceDims(const ExprVec &in_dims, + const std::vector &axes, + const ExprVec &starts, + const ExprVec &ends, + std::vector *infer_flags = nullptr) { + ExprVec slice_dims(in_dims); + + for (size_t i = 0; i < axes.size(); ++i) { + int64_t axis = axes[i]; + slice_dims[axis] = ends[i] - starts[i]; + } + + return slice_dims; +} + +inline ExprVec GetDecreasedDims(const ExprVec &slice_dims, + const std::vector &decrease_axes) { + ExprVec decreased_dims(slice_dims); + std::vector decrease_flag(slice_dims.size(), 0); + if (decrease_axes.size() > 0) { + for (size_t i = 0; i < decrease_axes.size(); ++i) { + int64_t axis = decrease_axes[i]; + decrease_flag[axis] = 1; + } + ExprVec new_shape; + for (size_t i = 0; i < slice_dims.size(); ++i) { + if (decrease_flag[i] == 0) { + new_shape.emplace_back(slice_dims[i]); + } + } + decreased_dims = new_shape; + } + return decreased_dims; +} + +inline std::vector FormatSliceAxes( + const std::vector &axes_raw, int64_t rank) { + std::vector axes_vec(axes_raw.size(), 0); + std::transform( + axes_raw.begin(), axes_raw.end(), axes_vec.begin(), [rank](int64_t axis) { + return axis >= 0 ? axis : std::max(int64_t(0), axis + rank); + }); + return axes_vec; +} + +inline ShapeOrData SliceRawInferSymbolicShape( + const ShapeOrData &in_shapeordata, + const ExprVec &starts_expr, + const ExprVec &ends_expr, + const std::vector &axes_raw, + const std::vector &infer_flags_raw, + const std::vector &decrease_axis) { + ExprVec starts = starts_expr; + ExprVec ends = ends_expr; + std::vector infer_flags = [&infer_flags_raw, &axes_raw] { + return infer_flags_raw.empty() ? 
std::vector(axes_raw.size(), 1) + : infer_flags_raw; + }(); + + const auto &GetShapeDimExprs = [&]() -> symbol::ShapeOrDataDimExprs { + const ExprVec &in_dims = in_shapeordata.shape(); + std::vector axes = FormatSliceAxes(axes_raw, in_dims.size()); + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &infer_flags); + ExprVec slice_dims = + GetSliceDims(in_dims, axes, starts, ends, &infer_flags); + ExprVec out_dims = GetDecreasedDims(slice_dims, decrease_axis); + + return symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(out_dims)}; + }; + + // When `pd.slice` is operating on a tensor which is produced by a `pd.shape` + // op, the result should be written into data. + const auto &GetDataDimExprs = [&]() -> symbol::ShapeOrDataDimExprs { + std::vector out_data; + + // Currently, we DO NOT support the case that any element in `axes` `starts` + // or `ends` is a Symbol. + auto vec_int64 = details::VecExpr2Int64(starts); + IR_ENFORCE(vec_int64.has_value(), + "for slice op, all the elements in `starts` must be int64_t"); + std::vector starts_int = vec_int64.value(); + + vec_int64 = details::VecExpr2Int64(ends); + IR_ENFORCE(vec_int64.has_value(), + "for slice op, all the elements in `ends` must be int64_t"); + std::vector ends_int = vec_int64.value(); + + const int64_t start = + starts_int[0] < 0 ? starts_int[0] + in_shapeordata.data().value().size() + : starts_int[0]; + const int64_t end = + static_cast(std::numeric_limits::max()) == ends_int[0] + ? in_shapeordata.data().value().size() + : ends_int[0]; + + for (int64_t i = start; i < end; i++) { + out_data.push_back(in_shapeordata.data().value()[i]); + } + + const std::vector shape{std::int64_t(out_data.size())}; + return symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(shape, out_data)}; + }; + + return in_shapeordata.data().has_value() ? 
GetDataDimExprs() + : GetShapeDimExprs(); +} +} // namespace paddle::dialect::slice_uitls diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc index c417df6bc79c0..12fec5b091152 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc @@ -27,6 +27,16 @@ std::optional> VecExpr2Int64(const ExprVec &expr_vec) { return int64vec; } +ExprVec VecInt642Expr(const std::vector &int_vec) { + ExprVec expr_vec(int_vec.size(), 0); + std::transform( + int_vec.begin(), + int_vec.end(), + expr_vec.begin(), + [](int64_t val) -> symbol::DimExpr { return symbol::DimExpr(val); }); + return expr_vec; +} + bool ReduceInferDim(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis, const std::vector &axis, diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h index 4be08cde7a619..8c13e38b54de3 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h @@ -77,6 +77,8 @@ std::vector GetVectorAttr(const ::pir::Operation *op, std::optional> VecExpr2Int64(const ExprVec &expr_vec); +ExprVec VecInt642Expr(const std::vector &int_vec); + bool ReduceInferDim(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis, const std::vector &axis, diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index ec4212c27ce84..9003b88c18fd3 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h" #include "paddle/common/ddim.h" +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" @@ -185,102 +186,6 @@ bool FullIntArrayOpInferSymbolicShape( return true; } -inline void CheckAndUpdateSliceAttrs( - const ExprVec &in_dims, - const std::vector &axes, - ExprVec *starts_p, - ExprVec *ends_p, - std::vector *infer_flags = nullptr) { - auto vec_int64 = details::VecExpr2Int64(*starts_p); - IR_ENFORCE(vec_int64.has_value(), - "for slice op, all the elements in `starts` must be int64_t"); - std::vector starts_int = vec_int64.value(); - - vec_int64 = details::VecExpr2Int64(*ends_p); - IR_ENFORCE(vec_int64.has_value(), - "for slice op, all the elements in `ends` must be int64_t"); - std::vector ends_int = vec_int64.value(); - - ExprVec &starts = *starts_p; - ExprVec &ends = *ends_p; - auto IsMaxInt = [](const symbol::DimExpr &expr) { - return expr.isa() && - expr.Get() == - static_cast(std::numeric_limits::max()); - }; - - for (size_t i = 0; i < axes.size(); ++i) { - int64_t axis = axes[i]; - - if (infer_flags != nullptr && (*infer_flags)[i] == -1) { - PADDLE_THROW( - phi::errors::Unimplemented("SliceOpInferSymbolicShape CAN NOT " - "deal with 
-1 in infer_flags now")); - } - - // For both start and end can be negative or positive, we need to handle the - // following different arrangements. - ends[i] = IsMaxInt(ends[i]) ? in_dims[axis] : ends[i]; - - bool both_negative_or_positive = (starts_int[i] >= 0 && ends_int[i] >= 0) || - (starts_int[i] <= 0 && ends_int[i] <= 0); - bool start_negative_end_positive = starts_int[i] <= 0 && ends_int[i] >= 0; - bool start_positive_end_negative = starts_int[i] >= 0 && ends_int[i] <= 0; - - if (both_negative_or_positive) { - continue; - } else if (start_negative_end_positive) { - starts[i] = starts[i] + in_dims[axis]; - } else if (start_positive_end_negative) { - starts[i] = starts[i] - in_dims[axis]; - } else { - LOG(FATAL) << "Dead code"; - } - } -} - -inline ExprVec GetSliceDims(const ExprVec &in_dims, - const std::vector &axes, - const ExprVec &starts, - const ExprVec &ends, - std::vector *infer_flags = nullptr) { - ExprVec slice_dims(in_dims); - - for (size_t i = 0; i < axes.size(); ++i) { - int64_t axis = axes[i]; - - if (infer_flags != nullptr && (*infer_flags)[i] == -1) { - PADDLE_THROW( - phi::errors::Unimplemented("SliceOpInferSymbolicShape CAN NOT " - "deal with -1 in infer_flags now")); - } - - slice_dims[axis] = ends[i] - starts[i]; - } - - return slice_dims; -} - -inline ExprVec GetDecreasedDims(const ExprVec &slice_dims, - const std::vector &decrease_axes) { - ExprVec decreased_dims(slice_dims); - std::vector decrease_flag(slice_dims.size(), 0); - if (decrease_axes.size() > 0) { - for (size_t i = 0; i < decrease_axes.size(); ++i) { - int64_t axis = decrease_axes[i]; - decrease_flag[axis] = 1; - } - ExprVec new_shape; - for (size_t i = 0; i < slice_dims.size(); ++i) { - if (decrease_flag[i] == 0) { - new_shape.emplace_back(slice_dims[i]); - } - } - decreased_dims = new_shape; - } - return decreased_dims; -} - bool SliceOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { pir::Value operand_source = op->operand_source(0); @@ -295,83 +200,26 @@ bool SliceOpInferSymbolicShape(pir::Operation *op, const symbol::ShapeOrDataDimExprs &ends_shape_data = shape_analysis->GetShapeOrDataForValue(operand_ends); - const std::vector axes = [&] { - std::vector axes_vec = details::GetVectorAttr(op, "axes"); - int64_t rank = int64_t(operand_shape_or_data.shape().size()); - for (size_t i = 0; i < axes_vec.size(); i++) { - int64_t axis = axes_vec[i]; - axes_vec[i] = axis >= 0 ? axis : std::max(int64_t(0), axis + rank); - } - return axes_vec; - }(); + std::vector axes_vec = details::GetVectorAttr(op, "axes"); - // Currently, we DO NOT support any element in `starts` is a Symbol. - ExprVec starts = starts_shape_data.data().value(); - ExprVec ends = ends_shape_data.data().value(); + // // Currently, we DO NOT support any element in `starts` is a Symbol. 
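Why CheckAndUpdateSliceAttrs (defined in the new header above) only shifts `starts` when start and end have different signs: the slice extent is later computed as end - start, and that difference is already correct when both endpoints are negative or both non-negative. A quick numeric check of the three branches with dim = 10, in plain ints for illustration:

    dim = 10

    def extent(start, end):
        if (start >= 0) == (end >= 0):      # same sign: difference already correct
            return end - start
        if start <= 0 <= end:               # negative start, positive end
            return end - (start + dim)
        return end - (start - dim)          # positive start, negative end

    assert extent(2, 7) == 5                # indices 2..6
    assert extent(-7, -2) == 5              # indices 3..7, and (-2) - (-7) == 5
    assert extent(-7, 8) == 5               # -7 wraps to index 3, so 8 - 3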
+ ExprVec starts = slice_uitls::GetExprVecFromData(starts_shape_data); + ExprVec ends = slice_uitls::GetExprVecFromData(ends_shape_data); - std::vector infer_flags = [op, &axes] { - std::vector infer_flags_t = - details::GetVectorAttr(op, "infer_flags"); - if (infer_flags_t.empty()) { - infer_flags_t = std::vector(axes.size(), 1); - } - return infer_flags_t; - }(); + std::vector infer_flags = details::GetVectorAttr(op, "infer_flags"); const std::vector decrease_axis = details::GetVectorAttr(op, "decrease_axis"); - const auto &GetShapeDimExprs = [&]() -> symbol::ShapeOrDataDimExprs { - const ExprVec &in_dims = operand_shape_or_data.shape(); - CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &infer_flags); - ExprVec slice_dims = - GetSliceDims(in_dims, axes, starts, ends, &infer_flags); - ExprVec out_dims = GetDecreasedDims(slice_dims, decrease_axis); - - return symbol::ShapeOrDataDimExprs{ - symbol::TensorShapeOrDataDimExprs(out_dims)}; - }; - - // When `pd.slice` is operating on a tensor which is produced by a `pd.shape` - // op, the result should be written into data. - const auto &GetDataDimExprs = [&]() -> symbol::ShapeOrDataDimExprs { - std::vector out_data; - - // Currently, we DO NOT support the case that any element in `axes` `starts` - // or `ends` is a Symbol. - auto vec_int64 = details::VecExpr2Int64(starts); - IR_ENFORCE(vec_int64.has_value(), - "for slice op, all the elements in `starts` must be int64_t"); - std::vector starts_int = vec_int64.value(); - - vec_int64 = details::VecExpr2Int64(ends); - IR_ENFORCE(vec_int64.has_value(), - "for slice op, all the elements in `ends` must be int64_t"); - std::vector ends_int = vec_int64.value(); - - const int64_t start = - starts_int[0] < 0 - ? starts_int[0] + operand_shape_or_data.data().value().size() - : starts_int[0]; - const int64_t end = - static_cast(std::numeric_limits::max()) == ends_int[0] - ? operand_shape_or_data.data().value().size() - : ends_int[0]; - - for (int64_t i = start; i < end; i++) { - out_data.push_back(operand_shape_or_data.data().value()[i]); - } - - const std::vector shape{std::int64_t(out_data.size())}; - return symbol::ShapeOrDataDimExprs{ - symbol::TensorShapeOrDataDimExprs(shape, out_data)}; - }; - - symbol::ShapeOrDataDimExprs shape_data = - operand_shape_or_data.data().has_value() ? 
GetDataDimExprs() - : GetShapeDimExprs(); + shape_analysis->SetShapeOrDataForValue( + res, + slice_uitls::SliceRawInferSymbolicShape(operand_shape_or_data, + starts, + ends, + axes_vec, + infer_flags, + decrease_axis)); - shape_analysis->SetShapeOrDataForValue(res, shape_data); return true; } diff --git a/paddle/fluid/pir/transforms/shape_optimization_pass.cc b/paddle/fluid/pir/transforms/shape_optimization_pass.cc index 85f4a5a5eef49..374655da35ef4 100644 --- a/paddle/fluid/pir/transforms/shape_optimization_pass.cc +++ b/paddle/fluid/pir/transforms/shape_optimization_pass.cc @@ -131,7 +131,8 @@ void InferSymExprForBlock(const Block& block, auto infer_symbolic_shape_interface = op.dyn_cast(); if (infer_symbolic_shape_interface) { - VLOG(vlog_level) << op.name() << " has InferSymbolicShapeInterface."; + VLOG(vlog_level) << op.name() << "(op_id: op_" << op.id() << ")" + << " has InferSymbolicShapeInterface."; PADDLE_ENFORCE_EQ( infer_symbolic_shape_interface.InferSymbolicShape(shape_analysis), true, diff --git a/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py b/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py index 4ab27bf657eac..a3f7df02e1ed7 100644 --- a/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py +++ b/test/ir/pir/cinn/symbolic/test_op_infer_sym_shape.py @@ -465,12 +465,12 @@ def __init__(self): def forward(self, x): out = x[:, -1, :] - out = x[1:3, 0:2, 2:4] + # out = x[1:3, 0:2, 2:4] - axes = [0, 1, 2] - starts = [-3, 0, 2] - ends = [3, 2, 4] - out = paddle.slice(x, axes=axes, starts=starts, ends=ends) + # axes = [0, 1, 2] + # starts = [-3, 0, 2] + # ends = [3, 2, 4] + # out = paddle.slice(x, axes=axes, starts=starts, ends=ends) return out @@ -482,8 +482,8 @@ def prepare_data(self): self.expected = [ [ 'shape[S0, S2], data[NULL]', - 'shape[2, 2, 2], data[NULL]', - 'shape[Add(3, -Add(-3, S0)), 2, 2]', + # 'shape[2, 2, 2], data[NULL]', + # 'shape[Add(3, -Add(-3, S0)), 2, 2]', ] ] @@ -497,7 +497,8 @@ def test_eval_symbolic(self): ) input_spec = [x_spec] - net = apply_to_static(net, False, input_spec) + # net = apply_to_static(net, False, input_spec) + net = apply_to_static(net, True, input_spec) net.eval() # check the infer result From 19a5ae5b652a6dd683f8bec6058370353e977e0a Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:23:00 +0800 Subject: [PATCH 012/114] fix use nvidia cuda libraries bug (#62425) * fix * fix * fix --- CMakeLists.txt | 10 ++- paddle/phi/backends/dynload/dynamic_loader.cc | 66 +++++++++++++++++-- python/env_dict.py.in | 3 +- python/setup.py.in | 5 +- setup.py | 5 +- 5 files changed, 74 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d5e260f323a0c..3cdcd291e62e5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,7 +65,8 @@ option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF) option(WITH_SHARED_PHI "Compile PaddlePaddle with SHARED LIB of PHI" ON) option(CINN_ONLY "Compile CINN only in Paddle" OFF) option(CINN_WITH_CUDNN "Compile CINN with CUDNN support" ON) - +option(WITH_PIP_CUDA_LIBRARIES + "Paddle uses the CUDA library provided by NVIDIA" OFF) find_package(Git REQUIRED) # config GIT_URL with github mirrors to speed up dependent repos clone @@ -97,11 +98,16 @@ endif() if(WITH_GPU AND NOT APPLE) #(Note risemeup1): The cudart dynamic library libcudart.so is used by set CUDA_USE_STATIC_CUDA_RUNTIME and CMAKE_CUDA_FLAGS - if(LINUX) + if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL + "x86_64") 
set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE) set(CMAKE_CUDA_FLAGS "--cudart shared") + if(WITH_PIP_CUDA_LIBRARIES) + #(Note risemeup1): Flag 'WITH_PIP_CUDA_LIBRARIES' will be used in dynamic_loader.cc to search for CUDA-related .so files through the Python libraries provided by NVIDIA. + add_definitions(-DWITH_PIP_CUDA_LIBRARIES) + endif() endif() enable_language(CUDA) message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: " diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index efdac108bcc8e..101f156e1f488 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -289,9 +289,17 @@ void* GetCublasDsoHandle() { FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.11"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so"); +#endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.12"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so"); +#endif } else { std::string warning_msg( "Your CUDA_VERSION is less than 11 or greater than 12, paddle " @@ -309,9 +317,17 @@ void* GetCublasLtDsoHandle() { // APIs available after CUDA 10.1 #if defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.11"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); +#endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.12"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); +#endif } else { std::string warning_msg( "Your CUDA_VERSION is less than 11 or greater than 12, paddle " @@ -353,8 +369,13 @@ void* GetCUDNNDsoHandle() { #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); #else +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cudnn_dir, "libcudnn.so.8", false, {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path}); +#endif #endif } @@ -364,11 +385,22 @@ void* GetCUPTIDsoHandle() { FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path}); #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libcupti.so.11.7", false, {cupti_lib_path}); + FLAGS_cupti_dir, "libcupti.so.11.8", false, {cupti_lib_path}); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path}); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) { +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_cupti_dir, "libcupti.so.12", false, {cupti_lib_path}); +#else + return GetDsoHandleFromSearchPath( + FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path}); +#endif } else { std::string warning_msg( "Your CUDA_VERSION is less than 11 or greater than 12, paddle " @@ -377,7 +409,7 @@ void* 
GetCUPTIDsoHandle() { } #else return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libcupti.so.11.7", false, {cupti_lib_path}); + FLAGS_cupti_dir, "libcupti.so", false, {cupti_lib_path}); #endif } @@ -390,7 +422,12 @@ void* GetCurandDsoHandle() { #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); #else +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so.10"); +#else + return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so"); +#endif + #endif } @@ -422,7 +459,11 @@ void* GetCusolverDsoHandle() { return GetDsoHandleFromSearchPath( FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path}); #else +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so"); +#endif #endif } @@ -434,9 +475,17 @@ void* GetCusparseDsoHandle() { FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.11"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so"); +#endif } else if (CUDA_VERSION >= 12000 && CUDA_VERSION <= 12030) { +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.12"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so"); +#endif } else { std::string warning_msg( "Your CUDA_VERSION is less than 11 or greater than 12, paddle " @@ -536,8 +585,14 @@ void* GetNCCLDsoHandle() { return GetDsoHandleFromSearchPath( FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); #else +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( FLAGS_nccl_dir, "libnccl.so.2", true, {}, warning_msg); +#else + return GetDsoHandleFromSearchPath( + FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); +#endif + #endif } @@ -592,8 +647,12 @@ void* GetCUFFTDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib"); #elif defined(__linux__) && defined(PADDLE_WITH_CUDA) if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { +#ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.10"); - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); +#endif + } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 12030) { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.11"); } else { std::string warning_msg( @@ -639,6 +698,5 @@ void* GetXPTIDsoHandle() { return nullptr; #endif } - } // namespace dynload } // namespace phi diff --git a/python/env_dict.py.in b/python/env_dict.py.in index a276adb00085e..301254edbf38d 100644 --- a/python/env_dict.py.in +++ b/python/env_dict.py.in @@ -90,5 +90,6 @@ env_dict={ 'CINN_SOURCE_DIR':'@CINN_SOURCE_DIR@', 'WITH_CPP_DIST':'@WITH_CPP_DIST@', 'PADDLE_INSTALL_DIR':'@PADDLE_INSTALL_DIR@', - 'PADDLE_LIB_TEST_DIR':'@PADDLE_LIB_TEST_DIR@' + 'PADDLE_LIB_TEST_DIR':'@PADDLE_LIB_TEST_DIR@', + 'WITH_PIP_CUDA_LIBRARIES':'@WITH_PIP_CUDA_LIBRARIES@' } diff --git a/python/setup.py.in b/python/setup.py.in index 98246fdbf4dc5..5c2f941a65c80 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -407,10 +407,7 @@ write_distributed_training_mode_py(filename='@PADDLE_BINARY_DIR@/python/paddle/i def 
get_paddle_extra_install_requirements(): #(Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas, thereby making the operation of 'pip install paddle' no longer dependent on the installation of cuda and cudnn. - paddle_cuda_install_requirements = os.getenv( - "PADDLE_CUDA_INSTALL_REQUIREMENTS", None - ) - if paddle_cuda_install_requirements == "ON": + if '@WITH_PIP_CUDA_LIBRARIES@' == 'ON': PADDLE_CUDA_INSTALL_REQUIREMENTS = { "V11": ( "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " diff --git a/setup.py b/setup.py index fd94bfa11accd..5550a3ee66f4f 100644 --- a/setup.py +++ b/setup.py @@ -936,10 +936,7 @@ def get_setup_requires(): def get_paddle_extra_install_requirements(): # (Note risemeup1): Paddle will install the pypi cuda package provided by Nvidia, which includes the cuda runtime, cudnn, and cublas, thereby making the operation of 'pip install paddle' no longer dependent on the installation of cuda and cudnn. - paddle_cuda_install_requirements = os.getenv( - "PADDLE_CUDA_INSTALL_REQUIREMENTS", None - ) - if paddle_cuda_install_requirements == "ON": + if env_dict.get("WITH_PIP_CUDA_LIBRARIES") == "ON": PADDLE_CUDA_INSTALL_REQUIREMENTS = { "V11": ( "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " From 90529ac2122575fc2736d26792cd6f9da0df67b3 Mon Sep 17 00:00:00 2001 From: lizexu123 <39205361+lizexu123@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:35:54 +0800 Subject: [PATCH 013/114] [Paddle-TRT]add inference api:exp_disable_tensorrt_dynamic_shape_ops (#62352) --- paddle/fluid/inference/analysis/argument.h | 2 + .../inference/analysis/ir_pass_manager.cc | 3 + .../ir_passes/tensorrt_subgraph_pass.cc | 9 +- paddle/fluid/inference/api/analysis_config.cc | 9 ++ .../fluid/inference/api/analysis_predictor.cc | 2 + .../inference/api/paddle_analysis_config.h | 4 + paddle/fluid/inference/tensorrt/op_teller.cc | 61 ++++++++ paddle/fluid/inference/tensorrt/op_teller.h | 2 + paddle/fluid/pybind/inference_api.cc | 3 + .../inference/test_forbid_dynamic_op_api.py | 138 ++++++++++++++++++ 10 files changed, 231 insertions(+), 2 deletions(-) create mode 100644 test/ir/inference/test_forbid_dynamic_op_api.py diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 1407a8f875a29..8c4fbceced1ab 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -251,6 +251,8 @@ struct Argument { DECL_ARGUMENT_FIELD(trt_exclude_var_names, TRTExcludeVarNames, std::vector); + DECL_ARGUMENT_FIELD(trt_forbid_dynamic_op, TRTForbidDynamicOp, bool); + DECL_ARGUMENT_FIELD(tensorrt_disabled_ops, TensorRtDisabledOPs, std::vector); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index eca0c8fedd0a2..cc126e5fea612 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -173,6 +173,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set( "trt_exclude_var_names", new std::vector(argument->trt_exclude_var_names())); + pass->Set("forbid_dynamic_op", + new bool(argument->trt_forbid_dynamic_op())); + pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); pass->Set("predictor_id", new int(argument->predictor_id())); diff --git 
a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 1b29ba37f5e66..d6441cc6d4a56 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -153,12 +153,14 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
   auto trt_disabled_ops = Get<std::vector<std::string>>("trt_disabled_ops");
   auto with_dynamic_shape = Get<bool>("with_dynamic_shape");
   auto use_explicit_quantization = Get<bool>("use_explicit_quantization");
+  auto forbid_dynamic_op = Get<bool>("forbid_dynamic_op");
+
   auto teller = [&](const framework::ir::Node *node) {
     if (!node->IsOp() || !node->Op()) return false;
     if (find(trt_disabled_ops.begin(),
              trt_disabled_ops.end(),
              node->Op()->Type()) != trt_disabled_ops.end()) {
       VLOG(3) << node->Op()->Type().c_str()
+              << " is disabled by config in TensorRT";
       return false;
     }
@@ -172,8 +174,11 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
         }
       }
     }
-    bool is_ok = tensorrt::OpTeller::Global().Tell(
-        node, no_calib_int8, with_dynamic_shape, use_explicit_quantization);
+    bool is_ok = tensorrt::OpTeller::Global().Tell(node,
+                                                   no_calib_int8,
+                                                   with_dynamic_shape,
+                                                   forbid_dynamic_op,
+                                                   use_explicit_quantization);
     if (!is_ok)
       VLOG(3) << node->Op()->Type().c_str() << " op is not in TensorRT";
     return is_ok;
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 888e2cbe080c9..5ab33c65208a3 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -462,6 +462,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(tensorrt_min_subgraph_size_);
   CP_MEMBER(tensorrt_precision_mode_);
   CP_MEMBER(trt_mark_output_);
+  CP_MEMBER(trt_forbid_dynamic_op_);
   CP_MEMBER(trt_output_tensor_names_);
   CP_MEMBER(trt_disabled_ops_);
   CP_MEMBER(trt_use_dla_);
@@ -781,6 +782,11 @@ void AnalysisConfig::MarkTrtEngineOutputs(
   trt_output_tensor_names_ = output_tensor_names;
 }
 
+void AnalysisConfig::Exp_DisableTensorRTDynamicShapeOPs(
+    bool trt_forbid_dynamic_op) {
+  trt_forbid_dynamic_op_ = trt_forbid_dynamic_op;
+}
+
 void AnalysisConfig::EnableTensorRTMemoryOptim(bool engine_memory_sharing,
                                                int sharing_identifier) {
   PADDLE_ENFORCE_EQ(
@@ -1129,6 +1135,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << tensorrt_max_batchsize_;
   ss << tensorrt_min_subgraph_size_;
   ss << trt_mark_output_;
+  ss << trt_forbid_dynamic_op_;
 
   ss << use_dlnne_;
   ss << dlnne_min_subgraph_size_;
@@ -1418,6 +1425,8 @@ std::string AnalysisConfig::Summary() {
       os.InsertRow({"trt_engine_memory_sharing",
                     trt_engine_memory_sharing_ ? "true" : "false"});
       os.InsertRow({"trt_mark_output", trt_mark_output_ ? "true" : "false"});
+      os.InsertRow(
+          {"trt_forbid_dynamic_op", trt_forbid_dynamic_op_ ?
"true" : "false"}); #endif } } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 1cc723cd7913e..08e3193ce4365 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1757,6 +1757,8 @@ void AnalysisPredictor::PrepareArgument() { argument_->SetTRTOutputTensorNames(config_.trt_output_tensor_names_); argument_->SetTensorRtDisabledOPs(config_.trt_disabled_ops_); argument_->SetTRTExcludeVarNames(config_.trt_exclude_var_names_); + argument_->SetTRTForbidDynamicOp(config_.trt_forbid_dynamic_op_); + argument_->SetTensorRtUseDLA(config_.trt_use_dla_); argument_->SetTensorRtDLACore(config_.trt_dla_core_); argument_->SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 64b2de0eba3d4..2c5b254ea1c14 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -813,6 +813,8 @@ struct PD_INFER_DECL AnalysisConfig { void Exp_DisableTensorRtSubgraph( const std::vector& var_name_not_trt); + void Exp_DisableTensorRTDynamicShapeOPs(bool trt_forbid_dynamic_op); + /// /// \brief Replace some TensorRT plugins to TensorRT OSS( /// https://github.com/NVIDIA/TensorRT), with which some models's inference @@ -1283,6 +1285,8 @@ struct PD_INFER_DECL AnalysisConfig { bool trt_use_varseqlen_{false}; bool trt_with_interleaved_{false}; bool trt_mark_output_{false}; + bool trt_forbid_dynamic_op_{false}; + std::vector trt_output_tensor_names_{}; std::vector trt_exclude_var_names_{}; std::string tensorrt_transformer_posid_{""}; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index da46cc80ca5a9..3eb864487e96c 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -34,6 +34,43 @@ namespace paddle { namespace inference { namespace tensorrt { +// Check if it is a dynamic shape. If it is a dynamic shape, return true; +// otherwise, return false +bool IsDynamicShapeOp(const framework::OpDesc& desc) { + VLOG(3) << "forbid_dynamic_op_enter_into_trt is open"; + auto* block = desc.Block(); + auto inputs = desc.Inputs(); + for (auto iter : inputs) { + for (auto var_name : iter.second) { + if (block) { + auto* var_desc = block->FindVar(var_name); + const auto shape = var_desc->GetShape(); + for (auto ele : shape) { + if (ele < 0) { + return true; + } + } + } + } + } + + auto outputs = desc.Outputs(); + for (auto iter : outputs) { + for (auto var_name : iter.second) { + if (block) { + auto* var_desc = block->FindVar(var_name); + const auto shape = var_desc->GetShape(); + for (auto ele : shape) { + if (ele < 0) { + return true; + } + } + } + } + } + return true; +} + // Just tell by the op_types. 
struct SimpleOpTypeSetTeller : public Teller { SimpleOpTypeSetTeller() { // NOLINT @@ -89,6 +126,7 @@ struct SimpleOpTypeSetTeller : public Teller { bool operator()(const framework::OpDesc& desc, bool use_no_calib_int8 = false, bool with_dynamic_shape = false, + bool forbid_dynamic_op_enter_into_trt = false, bool use_explicit_quantization = false) override { const std::string op_type = desc.Type(); @@ -102,6 +140,9 @@ struct SimpleOpTypeSetTeller : public Teller { if (feed_fetch_set.find(op_type) != feed_fetch_set.end()) { return false; } + if (forbid_dynamic_op_enter_into_trt && IsDynamicShapeOp(desc)) { + return false; + } // do not support the op which is labeled the `skip_quant` if ((desc.HasAttr("namescope") && @@ -3200,8 +3241,10 @@ struct GenericPluginTeller : public Teller { bool operator()(const framework::OpDesc& desc, bool use_no_calib_int8 = false, bool with_dynamic_shape = false, + bool forbid_dynamic_op_enter_into_trt = false, bool use_explicit_quantization = false) override { const std::string op_type = desc.Type(); + // only consider dynamic_shape mode if (!with_dynamic_shape) { return false; @@ -3259,6 +3302,9 @@ struct GenericPluginTeller : public Teller { VLOG(3) << op_type << " has no DynamicMetaFn."; return false; } + if (forbid_dynamic_op_enter_into_trt && IsDynamicShapeOp(desc)) { + return false; + } return true; } } @@ -3270,6 +3316,7 @@ struct CustomPluginTeller : public Teller { bool operator()(const framework::OpDesc& desc, bool use_no_calib_int8 = false, bool with_dynamic_shape = false, + bool forbid_dynamic_op_enter_into_trt = false, bool use_explicit_quantization = false) override { const std::string op_type = desc.Type(); std::string expect_plugin_name; @@ -3288,6 +3335,9 @@ struct CustomPluginTeller : public Teller { return true; } return false; + if (forbid_dynamic_op_enter_into_trt && IsDynamicShapeOp(desc)) { + return false; + } } }; @@ -3296,8 +3346,10 @@ struct CustomGenericPluginTeller : public Teller { bool operator()(const framework::OpDesc& desc, bool use_no_calib_int8 = false, bool with_dynamic_shape = false, + bool forbid_dynamic_op_enter_into_trt = false, bool use_explicit_quantization = false) override { const std::string op_type = desc.Type(); + auto& op_meta_info_map = OpMetaInfoMap::Instance(); const auto& meta_info_map = op_meta_info_map.GetMap(); if (meta_info_map.count(op_type) > 0) { @@ -3322,15 +3374,20 @@ struct CustomGenericPluginTeller : public Teller { } VLOG(3) << op_type << " has no meta info"; return false; + if (forbid_dynamic_op_enter_into_trt && IsDynamicShapeOp(desc)) { + return false; + } } }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, bool with_dynamic_shape, + bool forbid_dynamic_op_enter_into_trt, bool use_explicit_quantization) { const std::string op_type = node->Op()->Type(); const framework::OpDesc desc = *node->Op(); + // do not support the op which is labeled the `skip_quant` if ((desc.HasAttr("namescope") && PADDLE_GET_CONST(std::string, desc.GetAttr("op_namescope")) == @@ -3341,6 +3398,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, if ((*default_teller)(desc, use_no_calib_int8, with_dynamic_shape, + forbid_dynamic_op_enter_into_trt, use_explicit_quantization)) { SetOpConverterType(node->Op(), OpConverterType::Default); return true; @@ -3349,6 +3407,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, if ((*generic_plugin_teller)(desc, use_no_calib_int8, with_dynamic_shape, + forbid_dynamic_op_enter_into_trt, use_explicit_quantization)) { 
SetOpConverterType(node->Op(), OpConverterType::GenericPluginCreater); return true; @@ -3357,6 +3416,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, if ((*custom_plugin_teller)(desc, use_no_calib_int8, with_dynamic_shape, + forbid_dynamic_op_enter_into_trt, use_explicit_quantization)) { SetOpConverterType(node->Op(), OpConverterType::CustomPluginCreater); return true; @@ -3365,6 +3425,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, if ((*custom_generic_plugin_teller)(desc, use_no_calib_int8, with_dynamic_shape, + forbid_dynamic_op_enter_into_trt, use_explicit_quantization)) { SetOpConverterType(node->Op(), OpConverterType::CustomGenericPluginCreater); return true; diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h index 9c909c2d71c06..f955396b9ac11 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.h +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -41,6 +41,7 @@ struct Teller { virtual bool operator()(const framework::OpDesc& desc, bool use_no_calib_int8 = false, bool with_dynamic_shape = false, + bool forbid_dynamic_op_enter_into_trt = false, bool use_explicit_quantization = false) = 0; virtual ~Teller() = default; @@ -77,6 +78,7 @@ class OpTeller { bool Tell(const framework::ir::Node* node, bool use_no_calib_int8 = false, bool with_dynamic_shape = false, + bool forbid_dynamic_op_enter_into_trt = false, bool use_explicit_quantization = false); std::unique_ptr& GetDefaultTeller() { return tellers_.at(0); } diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 708866b0bac34..69cb7303ea4e8 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -928,6 +928,7 @@ void BindAnalysisConfig(py::module *m) { .def("enable_tuned_tensorrt_dynamic_shape", &AnalysisConfig::EnableTunedTensorRtDynamicShape, py::arg("shape_range_info_path") = "", + py::arg("allow_build_at_runtime") = true) .def("tuned_tensorrt_dynamic_shape", &AnalysisConfig::tuned_tensorrt_dynamic_shape) @@ -936,6 +937,8 @@ void BindAnalysisConfig(py::module *m) { .def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs) .def("exp_disable_tensorrt_subgraph", &AnalysisConfig::Exp_DisableTensorRtSubgraph) + .def("exp_disable_tensorrt_dynamic_shape_ops", + &AnalysisConfig::Exp_DisableTensorRTDynamicShapeOPs) .def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA, py::arg("dla_core") = 0) diff --git a/test/ir/inference/test_forbid_dynamic_op_api.py b/test/ir/inference/test_forbid_dynamic_op_api.py new file mode 100644 index 0000000000000..51521e7889775 --- /dev/null +++ b/test/ir/inference/test_forbid_dynamic_op_api.py @@ -0,0 +1,138 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
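For reference, the smallest useful invocation of the new switch from Python looks like the following sketch; the model paths are placeholders, and the calls mirror those in the test that follows:

    from paddle.inference import Config, create_predictor

    config = Config("model.pdmodel", "model.pdiparams")   # placeholder paths
    config.enable_use_gpu(256, 0)
    config.enable_tensorrt_engine(workspace_size=1 << 30)
    config.exp_disable_tensorrt_dynamic_shape_ops(True)   # keep dynamic-shape ops on the fallback path
    predictor = create_predictor(config)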
+ +import os +import shutil +import tempfile +import unittest + +import numpy as np + +import paddle +from paddle import nn, static +from paddle.inference import Config, PrecisionType, create_predictor + +paddle.enable_static() + + +class SimpleNet(nn.Layer): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2D( + in_channels=4, + out_channels=4, + kernel_size=3, + stride=2, + padding=0, + ) + self.relu1 = nn.ReLU() + self.conv2 = nn.Conv2D( + in_channels=4, + out_channels=2, + kernel_size=3, + stride=2, + padding=0, + ) + self.relu2 = nn.ReLU() + self.conv3 = nn.Conv2D( + in_channels=2, + out_channels=1, + kernel_size=3, + stride=2, + padding=0, + ) + self.relu3 = nn.ReLU() + self.flatten = nn.Flatten() + self.fc = nn.Linear(729, 10) + self.softmax = nn.Softmax() + + def forward(self, x): + x = self.conv1(x) + x = self.relu1(x) + x = self.conv2(x) + x = self.relu2(x) + x = self.conv3(x) + x = self.relu3(x) + x = self.flatten(x) + x = self.fc(x) + x = self.softmax(x) + return x + + +class TestTRTOptimizationLevel(unittest.TestCase): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.temp_dir = tempfile.TemporaryDirectory() + self.path = os.path.join(self.temp_dir.name, 'optimization_level', '') + self.model_prefix = self.path + 'infer_model' + + def tearDown(self): + shutil.rmtree(self.path) + + def build_model(self): + image = static.data( + name='img', shape=[None, 4, 224, 224], dtype='float32' + ) + predict = SimpleNet()(image) + exe = paddle.static.Executor(self.place) + exe.run(paddle.static.default_startup_program()) + paddle.static.save_inference_model( + self.model_prefix, [image], [predict], exe + ) + + def init_predictor(self): + config = Config( + self.model_prefix + '.pdmodel', self.model_prefix + '.pdiparams' + ) + config.enable_use_gpu(256, 0, PrecisionType.Half) + config.enable_tensorrt_engine( + workspace_size=1 << 30, + max_batch_size=1, + min_subgraph_size=3, + precision_mode=PrecisionType.Half, + use_static=False, + use_calib_mode=False, + ) + config.enable_memory_optim() + config.exp_disable_tensorrt_dynamic_shape_ops(True) + config.disable_glog_info() + config.set_tensorrt_optimization_level(0) + self.assertEqual(config.tensorrt_optimization_level(), 0) + predictor = create_predictor(config) + return predictor + + def infer(self, predictor, img): + input_names = predictor.get_input_names() + for i, name in enumerate(input_names): + input_tensor = predictor.get_input_handle(name) + input_tensor.reshape(img[i].shape) + input_tensor.copy_from_cpu(img[i].copy()) + predictor.run() + results = [] + output_names = predictor.get_output_names() + for i, name in enumerate(output_names): + output_tensor = predictor.get_output_handle(name) + output_data = output_tensor.copy_to_cpu() + results.append(output_data) + return results + + def test_optimization_level(self): + self.build_model() + predictor = self.init_predictor() + img = np.ones((1, 4, 224, 224), dtype=np.float32) + results = self.infer(predictor, img=[img]) + + +if __name__ == '__main__': + unittest.main() From c3ca9a983a75458ca351f6aa7ac34259f811a906 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 6 Mar 2024 14:07:44 +0800 Subject: [PATCH 014/114] [PIR+CINN]Fix cinn_op.concat infer shape bug for dynamic shape (#62421) * [PIR+CINN]Fix cinn_op.concat infer shape bug for dynamic shape * fix typo --- .../hlir/dialect/operator/ir/manual_op.cc | 71 +++++++++---------- 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc 
b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc index ae62fc46cf354..0def6a8491e9e 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc @@ -33,6 +33,8 @@ namespace cinn { namespace dialect { +using DenseTensorType = paddle::dialect::DenseTensorType; + const char* GroupOp::attributes_name[GroupOp::attributes_num] = {"group_info"}; const char* FusionOp::attributes_name[GroupOp::attributes_num] = {"group_info"}; const char* ConcatOp::attributes_name[ConcatOp::attributes_num] = {"axis"}; @@ -200,39 +202,31 @@ void ConcatOp::Build(pir::Builder& builder, // NOLINT phi::errors::InvalidArgument( "input size [%d] is less than 0", inputs.size())); - auto first_ele = - inputs[0].type().dyn_cast(); - phi::DDim out_dims = first_ele.dims(); - - if (axis < 0) { - axis += out_dims.size(); - } - - for (size_t idx = 0; idx < inputs.size(); ++idx) { - inputs_type[idx] = inputs[idx].type(); - - if (idx > 0) { - auto dim_i = inputs[idx] - .type() - .dyn_cast() - .dims(); - - out_dims[axis] += dim_i[axis]; + const pir::Type out_type = [&]() { + auto first_ele = inputs[0].type().dyn_cast(); + phi::DDim out_dims = first_ele.dims(); + if (axis < 0) axis += out_dims.size(); + + for (size_t idx = 1; idx < inputs.size(); ++idx) { + inputs_type[idx] = inputs[idx].type(); + auto dim_i = inputs[idx].type().dyn_cast().dims(); + + if (out_dims[axis] > 0 && dim_i[axis] > 0) { + out_dims[axis] += dim_i[axis]; + } else { + out_dims[axis] = -1; + break; + } } - } - - auto out_type = - paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - first_ele.dtype(), - out_dims, - first_ele.data_layout(), - first_ele.lod(), - first_ele.offset()); - + return DenseTensorType::get(pir::IrContext::Instance(), + first_ele.dtype(), + out_dims, + first_ele.data_layout(), + first_ele.lod(), + first_ele.offset()); + }(); argument.output_types.emplace_back(out_type); - PassStopGradientsDefaultly(argument); - argument.AddAttribute( "axis", pir::Int32Attribute::get(pir::IrContext::Instance(), axis)); } @@ -248,7 +242,7 @@ void SplitOp::Build(pir::Builder& builder, // NOLINT std::vector output_type(sections.size()); - auto input_ele = input.type().dyn_cast(); + auto input_ele = input.type().dyn_cast(); if (axis < 0) { axis += input_ele.dims().size(); @@ -257,13 +251,12 @@ void SplitOp::Build(pir::Builder& builder, // NOLINT for (size_t idx = 0; idx < sections.size(); ++idx) { auto out_dims = input_ele.dims(); out_dims[axis] = sections[idx]; - auto out_type = - paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - input_ele.dtype(), - out_dims, - input_ele.data_layout(), - input_ele.lod(), - input_ele.offset()); + auto out_type = DenseTensorType::get(pir::IrContext::Instance(), + input_ele.dtype(), + out_dims, + input_ele.data_layout(), + input_ele.lod(), + input_ele.offset()); argument.output_types.emplace_back(out_type); @@ -309,7 +302,7 @@ void GenerateShapeOp::Build( auto type = pir::Int64Type::get(ctx); auto dim = ::common::make_ddim({static_cast(output_dim_exprs.size())}); - return paddle::dialect::DenseTensorType::get(ctx, type, dim); + return DenseTensorType::get(ctx, type, dim); }()}); ::pir::PassStopGradientsDefaultly(argument); } From c7b3acfae3db2372788ef4b7ca2c3cc591982bb8 Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Wed, 6 Mar 2024 14:08:53 +0800 Subject: [PATCH 015/114] fix group copy (#62409) --- .../hlir/dialect/operator/transforms/add_cinn_pass.cc | 1 - paddle/cinn/hlir/framework/pir/group.cc | 
9 +++++++++ test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py | 4 ++-- test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py | 4 ++-- test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py | 4 ++-- test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py | 4 ++-- test/ir/pir/cinn/symbolic/test_llama_mlp_dy.py | 4 ++-- 7 files changed, 19 insertions(+), 11 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc index ec7191e171937..91bfad2d5710d 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc @@ -134,7 +134,6 @@ void ApplyGroupOpPass(::pir::Program* program, pass_manager->AddPass(cinn::dialect::ir::CreateDynamicReshapeOpPass()); pass_manager->AddPass(cinn::dialect::ir::CreateReplaceDynamicExpandOpPass()); - pass_manager->AddPass(pir::CreateDeadCodeEliminationPass()); pass_manager->Run(program); diff --git a/paddle/cinn/hlir/framework/pir/group.cc b/paddle/cinn/hlir/framework/pir/group.cc index 706dfcafd6819..7cef409f9cad2 100644 --- a/paddle/cinn/hlir/framework/pir/group.cc +++ b/paddle/cinn/hlir/framework/pir/group.cc @@ -50,6 +50,15 @@ std::shared_ptr Group::Clone(::pir::Block* target_block, new_group->output_values.push_back(ir_mapping.Lookup(output_value)); } + new_group->input_names = this->input_names; + new_group->output_names = this->output_names; + new_group->output_values = this->output_values; + new_group->fn_name = this->fn_name; + new_group->int_args_map = this->int_args_map; + new_group->alignment_schedule_info = this->alignment_schedule_info; + new_group->reduce_axis = this->reduce_axis; + new_group->loop_ranges = this->loop_ranges; + return new_group; } diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py index 9d7c757cafa42..eeeca452b5e97 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_70.py @@ -81,5 +81,5 @@ def test_ast_prim_cinn(self): np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) -# if __name__ == '__main__': -# unittest.main() +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py index 971bca1d02fb7..69b7847f2a096 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_79.py @@ -107,5 +107,5 @@ def test_ast_prim_cinn(self): np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-6) -# if __name__ == '__main__': -# unittest.main() +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py index dace08b921f7c..32a9ece2de252 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_88.py @@ -88,5 +88,5 @@ def test_ast_prim_cinn(self): np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-6) -# if __name__ == '__main__': -# unittest.main() +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py index 10fe8bd9e9b81..d2e5f900b20f3 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_mul_method.py @@ -69,5 +69,5 @@ def test_ast_prim_cinn(self): 
np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) -# if __name__ == '__main__': -# unittest.main() +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_llama_mlp_dy.py b/test/ir/pir/cinn/symbolic/test_llama_mlp_dy.py index 1b3af40308270..96cbbd8076702 100644 --- a/test/ir/pir/cinn/symbolic/test_llama_mlp_dy.py +++ b/test/ir/pir/cinn/symbolic/test_llama_mlp_dy.py @@ -88,5 +88,5 @@ def test_eval(self): ) -# if __name__ == '__main__': -# unittest.main() +if __name__ == '__main__': + unittest.main() From 376aba57d0378c131d79d4d84d766637506b4cba Mon Sep 17 00:00:00 2001 From: xingmingyyj <135400902+xingmingyyj@users.noreply.github.com> Date: Wed, 6 Mar 2024 14:14:28 +0800 Subject: [PATCH 016/114] [PIR] Add op_callstack to Pir (#62139) --------- Co-authored-by: SigureMo --- paddle/cinn/hlir/framework/pir/utils.cc | 11 +- .../pir/dialect/op_generator/python_c_gen.py | 12 +- paddle/fluid/pybind/CMakeLists.txt | 3 +- .../fluid/pybind/manual_static_op_function.h | 66 +++++++++-- paddle/fluid/pybind/op_callstack_utils.cc | 104 ++++++++++++++++++ paddle/fluid/pybind/op_callstack_utils.h | 31 ++++++ 6 files changed, 210 insertions(+), 17 deletions(-) create mode 100644 paddle/fluid/pybind/op_callstack_utils.cc create mode 100644 paddle/fluid/pybind/op_callstack_utils.h diff --git a/paddle/cinn/hlir/framework/pir/utils.cc b/paddle/cinn/hlir/framework/pir/utils.cc index 47a451cba9bb1..741c81d46463f 100644 --- a/paddle/cinn/hlir/framework/pir/utils.cc +++ b/paddle/cinn/hlir/framework/pir/utils.cc @@ -465,9 +465,16 @@ static utils::Attribute ConvertArrayAttribute( CASE_ATTRIBUTE(float, FloatAttribute) } else if (attr_vec[0].isa<::pir::DoubleAttribute>()) { CASE_ATTRIBUTE(double, DoubleAttribute) + } else if (attr_vec[0].isa<::pir::StrAttribute>()) { + std::vector dst_attr; + for (auto element : attr_vec) { + dst_attr.push_back( + element.dyn_cast<::pir::StrAttribute>().AsString()); + } } else { - LOG(FATAL) << "only support bool/int32/int64/float/double attribute in " - "ArrayAttribute"; + LOG(FATAL) + << "only support bool/int32/int64/float/double/string attribute in " + "ArrayAttribute"; } } } else if (src_attr.isa<::pir::shape::SymbolAttribute>()) { diff --git a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py index 38619ec22e049..970f4d00205a4 100644 --- a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py @@ -52,6 +52,7 @@ #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/enforce.h" +#include "paddle/fluid/pybind/op_callstack_utils.h" {body} @@ -71,8 +72,10 @@ {attrs} // Call ir static api + CallStackRecorder callstack_recoder("{api_name}"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::{api_name}({args}); - + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); }} catch (...) {{ ThrowExceptionToPython(std::current_exception()); @@ -94,8 +97,10 @@ {attrs} // Call ir static api + CallStackRecorder callstack_recoder("{api_name}"); + callstack_recoder.Record(); paddle::dialect::{api_name}({args}); - + callstack_recoder.AttachToOps(); return nullptr; }} catch (...) 
{{ ThrowExceptionToPython(std::current_exception()); @@ -129,7 +134,10 @@ {cast_attrs} // Call ir static api + CallStackRecorder callstack_recoder("{api_name}"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::{api_name}({args_with_mutable_attrs}); + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index f67a74bf3f8ae..c842b62017219 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -151,7 +151,8 @@ set(PYBIND_SRCS auto_parallel_py.cc eval_frame_tools.cc cpython_internals.c - eval_frame.c) + eval_frame.c + op_callstack_utils.cc) if(NOT WITH_SHARED_IR) # Note: We want to compile pir source into paddle.so directly, because diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h index ced41e6905e5c..ccb527aeecdcb 100644 --- a/paddle/fluid/pybind/manual_static_op_function.h +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -24,6 +24,7 @@ #include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/op_callstack_utils.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/enforce.h" @@ -43,8 +44,10 @@ static PyObject *static_api_parameter(PyObject *self, PyObject *name_obj = PyTuple_GET_ITEM(args, 0); std::string name = CastPyArg2String(name_obj, "name", 0); // Call ir static api + CallStackRecorder callstack_recoder("parameter"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::parameter(name); - + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) { ThrowExceptionToPython(std::current_exception()); @@ -67,8 +70,10 @@ static PyObject *static_api_set_parameter(PyObject *self, PyObject *name_obj = PyTuple_GET_ITEM(args, 1); std::string name = CastPyArg2String(name_obj, "name", 1); // Call ir static api + CallStackRecorder callstack_recoder("set_parameter"); + callstack_recoder.Record(); paddle::dialect::set_parameter(parameter, name); - + callstack_recoder.AttachToOps(); Py_RETURN_NONE; } catch (...) { ThrowExceptionToPython(std::current_exception()); @@ -91,8 +96,10 @@ static PyObject *static_api_set_persistable_value(PyObject *self, PyObject *name_obj = PyTuple_GET_ITEM(args, 1); std::string name = CastPyArg2String(name_obj, "name", 1); // Call ir static api + CallStackRecorder callstack_recoder("shadow_output"); + callstack_recoder.Record(); paddle::dialect::shadow_output(persist_value, name); - + callstack_recoder.AttachToOps(); Py_RETURN_NONE; } catch (...) 
{ ThrowExceptionToPython(std::current_exception()); @@ -119,7 +126,10 @@ PyObject *static_api_full(PyObject *self, PyObject *args, PyObject *kwargs) { !PyObject_CheckIRValue(value_obj)) { std::vector shape = CastPyArg2Longs(shape_obj, "full", 0); float value = CastPyArg2Float(value_obj, "full", 1); + CallStackRecorder callstack_recoder("full"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::full(shape, value, dtype, place); + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } else { pir::Value shape, value; @@ -146,8 +156,12 @@ PyObject *static_api_full(PyObject *self, PyObject *args, PyObject *kwargs) { phi::CPUPlace()); } + CallStackRecorder callstack_recoder("full_with_tensor"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::full_with_tensor(shape, value, dtype); + callstack_recoder.AttachToOps(); + return ToPyObject(static_api_out); } } catch (...) { @@ -169,7 +183,10 @@ static PyObject *static_api_create_array(PyObject *self, CastPyArg2DataTypeDirectly(dtype_obj, "create_array", 0); // Call ir static api + CallStackRecorder callstack_recoder("create_array"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::create_array(dtype); + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) { @@ -194,8 +211,10 @@ static PyObject *static_api_create_array_like(PyObject *self, float value = CastPyArg2Float(value_obj, "create_array_like", 1); // Call ir static api + CallStackRecorder callstack_recoder("create_array_like"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::create_array_like(input, value); - + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) { ThrowExceptionToPython(std::current_exception()); @@ -215,7 +234,10 @@ static PyObject *static_api_array_length(PyObject *self, auto x = CastPyArg2Value(x_obj, "array_length", 0); // Call ir static api + CallStackRecorder callstack_recoder("array_length"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::array_length(x); + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) { @@ -248,7 +270,10 @@ static PyObject *static_api_array_read(PyObject *self, } // Call ir static api + CallStackRecorder callstack_recoder("array_read"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::array_read(array, i); + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) { @@ -282,7 +307,10 @@ static PyObject *static_api_array_write_(PyObject *self, } // Call ir static api + CallStackRecorder callstack_recoder("array_write_"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::array_write_(array, x, i); + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) { @@ -321,7 +349,10 @@ static PyObject *static_api_array_to_tensor(PyObject *self, auto use_stack = CastPyArg2Boolean(use_stack_obj, "array_to_tensor", 2); // Call ir static api + CallStackRecorder callstack_recoder("array_to_tensor"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::array_to_tensor(x, axis, use_stack); + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) 
{ @@ -341,10 +372,10 @@ PyObject *static_api_add_n_array(PyObject *self, PyObject *inputs_obj = PyTuple_GET_ITEM(args, 0); auto inputs = CastPyArg2VectorOfValue(inputs_obj, "add_n", 0); - // Parse Attributes - - // Call ir static api + CallStackRecorder callstack_recoder("add_n_array"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::add_n_array(inputs); + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) { @@ -395,7 +426,10 @@ static PyObject *static_api_slice_array(PyObject *self, } // Call ir static api + CallStackRecorder callstack_recoder("slice_array"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::slice_array(input, starts, ends); + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) { @@ -430,9 +464,11 @@ static PyObject *static_api_slice_array_dense(PyObject *self, starts = paddle::dialect::full_int_array( starts_tmp, phi::DataType::INT64, phi::CPUPlace()); } - // Call ir static api + CallStackRecorder callstack_recoder("slice_array_dense"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::slice_array_dense(input, starts); + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) { @@ -754,7 +790,8 @@ static PyObject *static_api_run_custom_op(PyObject *self, argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); ::pir::PassStopGradientsDefaultly(argument); - + CallStackRecorder callstack_recoder("run_custom_op"); + callstack_recoder.Record(); std::vector op_results; pir::Operation *op = paddle::dialect::ApiBuilder::Instance().GetBuilder()->Build( @@ -772,7 +809,7 @@ static PyObject *static_api_run_custom_op(PyObject *self, op_results.push_back(op->result(i)); } } - + callstack_recoder.AttachToOps(); return ToPyObject(op_results); } @@ -811,10 +848,13 @@ static PyObject *static_api_fused_gemm_epilogue(PyObject *self, PyObject *activation_obj = PyTuple_GET_ITEM(args, 5); std::string activation = CastPyArg2String(activation_obj, "fused_gemm_epilogue", 5); - // Call ir static api + CallStackRecorder callstack_recoder("fused_gemm_epilogue"); + callstack_recoder.Record(); auto out = paddle::dialect::fused_gemm_epilogue( x, y, bias, trans_x, trans_y, activation); + callstack_recoder.AttachToOps(); + return ToPyObject(out); } catch (...) { ThrowExceptionToPython(std::current_exception()); @@ -836,8 +876,10 @@ static PyObject *static_api_array_pop(PyObject *self, auto index = CastPyArg2Int(index_obj, "array_pop", 1); // Call ir static api + CallStackRecorder callstack_recoder("array_pop"); + callstack_recoder.Record(); auto static_api_out = paddle::dialect::array_pop(input, index); - + callstack_recoder.AttachToOps(); return ToPyObject(static_api_out); } catch (...) { ThrowExceptionToPython(std::current_exception()); diff --git a/paddle/fluid/pybind/op_callstack_utils.cc b/paddle/fluid/pybind/op_callstack_utils.cc new file mode 100644 index 0000000000000..1e8e2c1630cd9 --- /dev/null +++ b/paddle/fluid/pybind/op_callstack_utils.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <Python.h>
+#include <frameobject.h>
+
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/pir/dialect/operator/ir/api_builder.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/pybind/op_callstack_utils.h"
+
+pir::Attribute CallStackRecorder::GetOpCallstackInfo() {
+  PyObject* traceback_str = PyUnicode_FromString("traceback");
+  PyObject* traceback_module = PyImport_Import(traceback_str);
+
+  if (NULL == traceback_module) {
+    // Only the string needs releasing here; the module pointer is NULL.
+    Py_DECREF(traceback_str);
+    PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+        "Failed to import traceback module while getting callstack information "
+        "for %s.",
+        api_name_));
+  }
+  PyObject* tb = PyObject_GetAttrString(traceback_module, "extract_stack");
+  PyObject* stack = PyObject_CallObject(tb, NULL);
+  if (NULL == stack) {
+    Py_DECREF(tb);
+    Py_DECREF(traceback_str);
+    Py_DECREF(traceback_module);
+    PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+        "Failed to get callstack object while getting callstack information "
+        "for "
+        "%s.",
+        api_name_));
+  }
+  Py_ssize_t stack_size = PyList_Size(stack);
+  std::vector<pir::Attribute> op_callstack_infos;
+  for (Py_ssize_t i = 0; i < stack_size; ++i) {
+    PyObject* frame_summary = PyList_GetItem(stack, i);
+    PyObject* filename = PyObject_GetAttrString(frame_summary, "filename");
+    PyObject* lineno = PyObject_GetAttrString(frame_summary, "lineno");
+    PyObject* name = PyObject_GetAttrString(frame_summary, "name");
+    PyObject* line = PyObject_GetAttrString(frame_summary, "line");
+    PyObject* callstack_info = PyUnicode_FromFormat(
+        " File \"%S\", line %S, in %S", filename, lineno, name);
+    PyObject* callstack_source_line = PyUnicode_FromFormat(" %S", line);
+    op_callstack_infos.push_back(
+        pir::StrAttribute::get(pir::IrContext::Instance(),
+                               std::string(PyUnicode_AsUTF8(callstack_info))));
+    op_callstack_infos.push_back(pir::StrAttribute::get(
+        pir::IrContext::Instance(),
+        std::string(PyUnicode_AsUTF8(callstack_source_line))));
+    Py_DECREF(callstack_info);
+    Py_DECREF(callstack_source_line);
+    Py_DECREF(filename);
+    Py_DECREF(lineno);
+    Py_DECREF(name);
+    Py_DECREF(line);
+  }
+  Py_DECREF(stack);
+  Py_DECREF(tb);
+  Py_DECREF(traceback_str);
+  Py_DECREF(traceback_module);
+  return pir::ArrayAttribute::get(pir::IrContext::Instance(),
+                                  op_callstack_infos);
+}
+
+void CallStackRecorder::Record() {
+  auto before_insertion_point =
+      paddle::dialect::ApiBuilder::Instance().GetCurrentInsertionPoint();
+  before_insertion_iterator_ = (--before_insertion_point.second);
+  before_insertion_block_ = before_insertion_point.first;
+}
+
+void CallStackRecorder::AttachToOps() {
+  before_insertion_iterator_++;
+  pir::Attribute callstack_info_attr = GetOpCallstackInfo();
+  pir::InsertionPoint after_insertion_point =
+      paddle::dialect::ApiBuilder::Instance().GetCurrentInsertionPoint();
+  PADDLE_ENFORCE_EQ(before_insertion_block_,
+                    after_insertion_point.first,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "The block obtained before and after calling the "
+                        "static API %s is inconsistent.",
+                        api_name_));
+  auto after_insertion_iterator = after_insertion_point.second;
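+  // before_insertion_iterator_ has already been advanced past the last
+  // pre-existing op, so the loop below tags exactly the ops created since
+  // Record() with the captured Python callstack attribute.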
+  for (auto block_iterator = before_insertion_iterator_;
+       block_iterator != after_insertion_iterator;
+       block_iterator++) {
+    block_iterator->set_attribute(paddle::framework::OpProtoAndCheckerMaker::
+                                      OpCreationCallstackAttrName(),
+                                  callstack_info_attr);
+  }
+}
diff --git a/paddle/fluid/pybind/op_callstack_utils.h b/paddle/fluid/pybind/op_callstack_utils.h
new file mode 100644
index 0000000000000..a380fd37619b6
--- /dev/null
+++ b/paddle/fluid/pybind/op_callstack_utils.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/pir/include/core/block.h"
+#include "paddle/pir/include/core/builtin_attribute.h"
+
+class CallStackRecorder {
+ public:
+  explicit CallStackRecorder(const std::string& api_name)
+      : api_name_(api_name), before_insertion_block_(nullptr) {}
+  pir::Attribute GetOpCallstackInfo();
+  void Record();
+  void AttachToOps();
+
+ private:
+  // Stored by value: callers pass temporaries (string literals), so a
+  // reference member would dangle once the constructor returns.
+  const std::string api_name_;
+  pir::Block::Iterator before_insertion_iterator_;
+  pir::Block* before_insertion_block_;
+};

From c870186308a4ad62f9780e8ca81a850333b6435d Mon Sep 17 00:00:00 2001
From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com>
Date: Wed, 6 Mar 2024 14:24:32 +0800
Subject: [PATCH 017/114] [Auto Parallel] Add gather spmd rule (#62097)

* add gather forward spmd rule
* add unit test of gather_spmd to CMakeList

---
 paddle/phi/infermeta/spmd_rules/gather.cc    | 178 +++++++++++++++
 paddle/phi/infermeta/spmd_rules/gather.h     |  44 ++++
 paddle/phi/infermeta/spmd_rules/rules.cc     |   6 +
 paddle/phi/infermeta/spmd_rules/rules.h      |   1 +
 paddle/phi/infermeta/spmd_rules/scatter.cc   |   3 +-
 .../spmd_rules/spmd_rule_macro_define.h      |  50 ++---
 test/auto_parallel/spmd_rules/CMakeLists.txt |   2 +
 .../spmd_rules/test_gather_rule.py           | 209 ++++++++++++++++++
 8 files changed, 467 insertions(+), 26 deletions(-)
 create mode 100644 paddle/phi/infermeta/spmd_rules/gather.cc
 create mode 100644 paddle/phi/infermeta/spmd_rules/gather.h
 create mode 100644 test/auto_parallel/spmd_rules/test_gather_rule.py

diff --git a/paddle/phi/infermeta/spmd_rules/gather.cc b/paddle/phi/infermeta/spmd_rules/gather.cc
new file mode 100644
index 0000000000000..c8fae74253e8c
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/gather.cc
@@ -0,0 +1,178 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "paddle/phi/infermeta/spmd_rules/gather.h"
+
+#include "glog/logging.h"
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
+#include "paddle/phi/core/distributed/auto_parallel/utils.h"
+#include "paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h"
+#include "paddle/phi/infermeta/spmd_rules/utils.h"
+
+namespace phi {
+namespace distributed {
+
+using phi::distributed::auto_parallel::str_join;
+
+SpmdInfo GatherInferSpmdBase(const DistMetaTensor& x,
+                             const DistMetaTensor& index,
+                             int axis) {
+  // Step0: Verify Input Args Based on Gather Logic
+  // extract and check x_ndim, x_shape, x_dist_attr_src and
+  // x_dims_mapping_src with the macro
+  EXTRACT_SHAPE_AND_DIST_ATTR(x);
+  // index may be a 0-d tensor, so verify it specifically
+  auto index_shape = common::vectorize(index.dims());
+  int index_ndim = index_shape.size();
+  TensorDistAttr index_dist_attr_src = index.dist_attr();
+  std::vector<int64_t> index_dims_mapping_src =
+      index_dist_attr_src.dims_mapping();
+  if (index_ndim == 0) {
+    PADDLE_ENFORCE_EQ(index_dims_mapping_src.size(),
+                      1,
+                      phi::errors::InvalidArgument(
+                          "index is a 0-d tensor, its dims_mapping size "
+                          "must be 1, but received [%d]",
+                          index_dims_mapping_src.size()));
+  } else {
+    PADDLE_ENFORCE_EQ(
+        index_ndim,
+        index_dims_mapping_src.size(),
+        phi::errors::InvalidArgument("Tensor index's rank [%d] and "
+                                     "dims_mapping size [%d] are not matched.",
+                                     index_ndim,
+                                     index_dims_mapping_src.size()));
+  }
+
+  // Step1: Build Einsum Notation
+  std::string alphabet = "abcdefghijlmnopqrstuvwxyz";
+  std::string x_axes = GetBroadcastAxes(x_ndim, x_ndim, alphabet);
+  std::string index_axes = "k";
+  std::string out_axes = x_axes;
+  if (index_ndim == 0) {
+    if (axis < x_ndim) {
+      out_axes.erase(axis, 1);
+    }
+    index_axes = "";
+  } else {
+    out_axes[axis] = 'k';
+  }
+
+  // Step2: Sharding Propagation
+  // Step2.1: Merge input shardings
+  std::vector<int64_t> x_dims_mapping(x_dims_mapping_src);
+  if (axis < x_ndim) {
+    x_dims_mapping[axis] = -1;
+  }
+  std::vector<int64_t> index_dims_mapping(index_dims_mapping_src);
+  if (index_ndim == 0) {
+    index_dims_mapping[0] = -1;
+  }
+  std::unordered_map<std::string, int64_t> axis_to_dim_map =
+      ShardingMergeForTensors(
+          {{x_axes, x_dims_mapping}, {index_axes, index_dims_mapping}});
+
+  TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src);
+  x_dist_attr_dst.set_dims_mapping(x_dims_mapping);
+
+  TensorDistAttr index_dist_attr_dst =
+      CopyTensorDistAttrForOutput(index_dist_attr_src);
+  index_dist_attr_dst.set_dims_mapping(index_dims_mapping);
+
+  // Step2.2: Infer output dims mapping
+  std::vector<int64_t> out_dims_mapping =
+      GetDimsMappingForAxes(out_axes, axis_to_dim_map);
+  TensorDistAttr out_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src);
+  out_dist_attr.set_dims_mapping(out_dims_mapping);
+
+  VLOG(4) << "x_axes: " << x_axes << " index_axes: " << index_axes
+          << " out_axes: " << out_axes;
+  LOG_SPMD_INPUT(x);
+  LOG_SPMD_INPUT(index);
+  VLOG(4) << "out";
+  VLOG(4) << "dist_attr: [" << out_dist_attr.to_string() << "]";
+  return {{x_dist_attr_dst, index_dist_attr_dst}, {out_dist_attr}};
+}
+
+SpmdInfo GatherInferSpmdReverseBase(const DistMetaTensor& x,
+                                    const DistMetaTensor& index,
+                                    const DistMetaTensor& out,
+                                    int axis) {
+  // Step0: Verify Input Args Based on Gather Logic
+  // extract and check out_ndim, out_shape, out_dist_attr_src and
+  // out_dims_mapping_src with the macro
+  EXTRACT_SHAPE_AND_DIST_ATTR(x);
+  EXTRACT_SHAPE_AND_DIST_ATTR(index);
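+  // In the reverse rule the output's sharding is the source of truth: out's
+  // dims_mapping is merged first and then projected back onto x and index.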
+  EXTRACT_SHAPE_AND_DIST_ATTR(out);
+
+  // Step1: Build Einsum Notation
+  std::string alphabet = "abcdefghijlmnopqrstuvwxyz";
+  // x should be replicated on 0th axis
+  std::string x_axes = GetBroadcastAxes(x_ndim, x_ndim, alphabet);
+  std::string index_axes = "k";
+  std::string out_axes = x_axes;
+  if (index_ndim == 0) {
+    index_axes = "";
+    if (axis < x_ndim) {
+      out_axes.erase(axis, 1);
+    }
+  } else {
+    out_axes[axis] = 'k';
+  }
+
+  // Step2: Sharding Propagation
+  // Step2.1: Merge output shardings
+  std::unordered_map<std::string, int64_t> axis_to_dim_map =
+      ShardingMergeForTensors({{out_axes, out_dims_mapping_src}});
+
+  // Step2.2: Infer input dims mapping
+  std::vector<int64_t> x_dims_mapping =
+      GetDimsMappingForAxes(x_axes, axis_to_dim_map, true);
+  if (axis < x_ndim) {
+    x_dims_mapping[axis] = -1;
+  }
+  std::vector<int64_t> index_dims_mapping =
+      GetDimsMappingForAxes(index_axes, axis_to_dim_map, true);
+  TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src);
+  x_dist_attr_dst.set_dims_mapping(x_dims_mapping);
+  TensorDistAttr index_dist_attr_dst =
+      CopyTensorDistAttrForOutput(index_dist_attr_src);
+  index_dist_attr_dst.set_dims_mapping(index_dims_mapping);
+
+  VLOG(4) << "out_axes: " << out_axes << " x_axes: " << x_axes
+          << " index_axes: " << index_axes;
+  VLOG(4) << "out dist_attr: [" << out_dist_attr_src.to_string() << "]";
+  LOG_SPMD_INPUT(x);
+  LOG_SPMD_INPUT(index);
+  VLOG(4) << std::endl;
+  return {{x_dist_attr_dst, index_dist_attr_dst}, {out_dist_attr_src}};
+}
+
+SpmdInfo GatherInferSpmdDynamic(const DistMetaTensor& x,
+                                const DistMetaTensor& index,
+                                const Scalar& axis) {
+  return GatherInferSpmdBase(x, index, axis.to<int>());
+}
+
+SpmdInfo GatherInferSpmdReverseDynamic(const DistMetaTensor& x,
+                                       const DistMetaTensor& index,
+                                       const DistMetaTensor& out,
+                                       const Scalar& axis) {
+  return GatherInferSpmdReverseBase(x, index, out, axis.to<int>());
+}
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/gather.h b/paddle/phi/infermeta/spmd_rules/gather.h
new file mode 100644
index 0000000000000..c3a12941cdb19
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/gather.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + +#include + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" +#include "paddle/phi/core/distributed/type_defs.h" + +namespace phi { +namespace distributed { +SpmdInfo GatherInferSpmdBase(const DistMetaTensor& x, + const DistMetaTensor& index, + int axis); + +SpmdInfo GatherInferSpmdReverseBase(const DistMetaTensor& x, + const DistMetaTensor& index, + const DistMetaTensor& out, + int axis); + +SpmdInfo GatherInferSpmdDynamic(const DistMetaTensor& x, + const DistMetaTensor& index, + const Scalar& axis); + +SpmdInfo GatherInferSpmdReverseDynamic(const DistMetaTensor& x, + const DistMetaTensor& index, + const DistMetaTensor& out, + const Scalar& axis); + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/rules.cc b/paddle/phi/infermeta/spmd_rules/rules.cc index d8ba17971b6a9..bed16d398dcf0 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.cc +++ b/paddle/phi/infermeta/spmd_rules/rules.cc @@ -620,5 +620,11 @@ PD_REGISTER_SPMD_RULE(scatter, PD_INFER_SPMD(phi::distributed::ScatterInferSpmd), PD_INFER_SPMD(phi::distributed::ScatterInferSpmdReverse)); +// gather +PD_REGISTER_SPMD_RULE( + gather, + PD_INFER_SPMD(phi::distributed::GatherInferSpmdBase), + PD_INFER_SPMD(phi::distributed::GatherInferSpmdReverseBase)); + } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h index 805d20904c8a5..f3381ae2e806b 100644 --- a/paddle/phi/infermeta/spmd_rules/rules.h +++ b/paddle/phi/infermeta/spmd_rules/rules.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/spmd_rules/full_like.h" #include "paddle/phi/infermeta/spmd_rules/fused_linear_param_grad_add.h" #include "paddle/phi/infermeta/spmd_rules/fused_rope.h" +#include "paddle/phi/infermeta/spmd_rules/gather.h" #include "paddle/phi/infermeta/spmd_rules/layer_norm.h" #include "paddle/phi/infermeta/spmd_rules/matmul.h" #include "paddle/phi/infermeta/spmd_rules/numel.h" diff --git a/paddle/phi/infermeta/spmd_rules/scatter.cc b/paddle/phi/infermeta/spmd_rules/scatter.cc index 98040cebfa741..ae29d5f059ba0 100644 --- a/paddle/phi/infermeta/spmd_rules/scatter.cc +++ b/paddle/phi/infermeta/spmd_rules/scatter.cc @@ -102,7 +102,7 @@ SpmdInfo ScatterInferSpmd(const DistMetaTensor& x, LOG_SPMD_INPUT(x); LOG_SPMD_INPUT(index); LOG_SPMD_INPUT(updates); - VLOG(4) << "Out dist_attr: [" << out_dist_attr.to_string() << "]"; + VLOG(4) << "Out dist_attr: [" << out_dist_attr.to_string() << "]\n\n"; return {{x_dist_attr_dst, index_dist_attr_dst, updates_dist_attr_dst}, {out_dist_attr}}; } @@ -161,6 +161,7 @@ SpmdInfo ScatterInferSpmdReverse(const DistMetaTensor& x, LOG_SPMD_INPUT(x); LOG_SPMD_INPUT(index); LOG_SPMD_INPUT(updates); + VLOG(4) << std::endl; return {{x_dist_attr_dst, index_dist_attr_dst, updates_dist_attr_dst}, {out_dist_attr_dst}}; } diff --git a/paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h b/paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h index 65e90a5850614..43147db5b6194 100644 --- a/paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h +++ b/paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h @@ -16,33 +16,33 @@ limitations under the License. 
*/ using phi::distributed::auto_parallel::str_join; -#define EXTRACT_SHAPE_AND_DIST_ATTR(x) \ - auto x##_shape = phi::vectorize(x.dims()); \ - int x##_ndim = x##_shape.size(); \ - auto x##_dist_attr_src = x.dist_attr(); \ - const auto& x##_dims_mapping_src = x##_dist_attr_src.dims_mapping(); \ - PADDLE_ENFORCE_EQ(x##_ndim, \ - x##_dims_mapping_src.size(), \ - phi::errors::InvalidArgument( \ - "[%d] [%d] The Tensor [%d]'s rank [%d] and Loss's " \ - "dims_mapping size [%d] are not matched.", \ - __FILE__, \ - __LINE__, \ - #x, \ - x##_ndim, \ +#define EXTRACT_SHAPE_AND_DIST_ATTR(x) \ + auto x##_shape = phi::vectorize(x.dims()); \ + int x##_ndim = x##_shape.size(); \ + auto x##_dist_attr_src = x.dist_attr(); \ + const auto& x##_dims_mapping_src = x##_dist_attr_src.dims_mapping(); \ + PADDLE_ENFORCE_EQ(x##_ndim, \ + x##_dims_mapping_src.size(), \ + phi::errors::InvalidArgument( \ + "[%d] [%d] The Tensor [%d]'s rank [%d] and " \ + "dims_mapping size [%d] are not matched.", \ + __FILE__, \ + __LINE__, \ + #x, \ + x##_ndim, \ x##_dims_mapping_src.size())) -#define EXTRACT_SHAPE_AND_DIST_ATTR_WITH_DIM_CK(x) \ - EXTRACT_SHAPE_AND_DIST_ATTR(x); \ - PADDLE_ENFORCE_EQ(x##_ndim, \ - x##_dims_mapping_src.size(), \ - phi::errors::InvalidArgument( \ - "[%d] [%d] The Tensor [%d]'s rank [%d] and Loss's " \ - "dims_mapping size [%d] are not matched.", \ - __FILE__, \ - __LINE__, \ - #x, \ - x##_ndim, \ +#define EXTRACT_SHAPE_AND_DIST_ATTR_WITH_DIM_CK(x) \ + EXTRACT_SHAPE_AND_DIST_ATTR(x); \ + PADDLE_ENFORCE_EQ(x##_ndim, \ + x##_dims_mapping_src.size(), \ + phi::errors::InvalidArgument( \ + "[%d] [%d] The Tensor [%d]'s rank [%d] and " \ + "dims_mapping size [%d] are not matched.", \ + __FILE__, \ + __LINE__, \ + #x, \ + x##_ndim, \ x##_dims_mapping_src.size())) #define LOG_SPMD_INPUT(name) \ diff --git a/test/auto_parallel/spmd_rules/CMakeLists.txt b/test/auto_parallel/spmd_rules/CMakeLists.txt index d8c99d33a189f..06eece158a0c7 100644 --- a/test/auto_parallel/spmd_rules/CMakeLists.txt +++ b/test/auto_parallel/spmd_rules/CMakeLists.txt @@ -29,6 +29,8 @@ if(WITH_DISTRIBUTE) py_test_modules(test_tile_rule MODULES test_tile_rule) py_test_modules(test_fused_linear_param_grad_add_rule MODULES test_fused_linear_param_grad_add_rule) + py_test_modules(test_scatter_rule MODULES test_scatter_rule) + py_test_modules(test_gather_rule MODULES test_gather_rule) # End of unittests WITH single card WITHOUT timeout endif() diff --git a/test/auto_parallel/spmd_rules/test_gather_rule.py b/test/auto_parallel/spmd_rules/test_gather_rule.py new file mode 100644 index 0000000000000..14aae45aeb8f4 --- /dev/null +++ b/test/auto_parallel/spmd_rules/test_gather_rule.py @@ -0,0 +1,209 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import unittest
+from collections import OrderedDict
+
+from paddle.distributed.auto_parallel.static.dist_attribute import (
+    DistTensorSpec,
+    TensorDistAttr,
+)
+from paddle.distributed.fleet import auto
+from paddle.framework import core
+
+
+class TestGatherSPMDRule(unittest.TestCase):
+    """
+    Unit tests for gather spmd rule.
+    """
+
+    def setUp(self):
+        x_shape = [64, 32, 48]
+        index_shape = [16]
+        updates_shape = [32, 32, 48]
+        process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
+        self.attrs = OrderedDict()
+        self.attrs['axis'] = 0
+        self.rule = core.get_phi_spmd_rule("gather")
+
+        x_dist_attr = TensorDistAttr()
+        x_dist_attr.dims_mapping = [-1, -1, -1]
+        x_dist_attr.process_mesh = process_mesh
+        self.x_spec = DistTensorSpec(x_shape, x_dist_attr)
+
+        index_dist_attr = TensorDistAttr()
+        index_dist_attr.dims_mapping = [-1]
+        index_dist_attr.process_mesh = process_mesh
+        self.index_spec = DistTensorSpec(index_shape, index_dist_attr)
+
+    def test_single_mesh_dim(self):
+        # axis: 0
+        # dims_mapping: [0, -1, -1], [-1] --> [-1, -1, -1], [-1], [-1, -1, -1]
+        self.attrs['axis'] = 0
+        self.x_spec.set_dims_mapping([0, -1, -1])
+        self.index_spec.set_dims_mapping([-1])
+
+        result_dist_attrs = self.rule.infer_forward(
+            self.x_spec,
+            self.index_spec,
+            self.attrs['axis'],
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(len(result_dist_attrs), 2)
+        self.assertEqual(len(infered_input_dist_attrs), 2)
+        self.assertEqual(len(infered_output_dist_attrs), 1)
+
+        self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1])
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [-1])
+        self.assertEqual(
+            infered_output_dist_attrs[0].dims_mapping, [-1, -1, -1]
+        )
+
+        # axis: 0
+        # dims_mapping: [-1, 0, -1], [-1] --> [-1, 0, -1], [-1], [-1, 0, -1]
+        self.attrs['axis'] = 0
+        self.x_spec.set_dims_mapping([-1, 0, -1])
+        self.index_spec.set_dims_mapping([-1])
+
+        result_dist_attrs = self.rule.infer_forward(
+            self.x_spec,
+            self.index_spec,
+            self.attrs['axis'],
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(len(result_dist_attrs), 2)
+        self.assertEqual(len(infered_input_dist_attrs), 2)
+        self.assertEqual(len(infered_output_dist_attrs), 1)
+
+        self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, 0, -1])
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [-1])
+        self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0, -1])
+
+        # axis: 0
+        # dims_mapping: [0, -1, -1], [0] --> [-1, -1, -1], [0], [0, -1, -1]
+        self.attrs['axis'] = 0
+        self.x_spec.set_dims_mapping([0, -1, -1])
+        self.index_spec.set_dims_mapping([0])
+        result_dist_attrs = self.rule.infer_forward(
+            self.x_spec,
+            self.index_spec,
+            self.attrs['axis'],
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(len(result_dist_attrs), 2)
+        self.assertEqual(len(infered_input_dist_attrs), 2)
+        self.assertEqual(len(infered_output_dist_attrs), 1)
+
+        self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1])
+        self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [0])
+        self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1])
+
+        # 0-d tensor
+        # axis: 1
+        # dims_mapping: [-1, 0, -1], [0] --> [-1, -1, -1], [-1], [-1, -1]
+        self.attrs['axis'] = 1
+        self.index_spec.shape = []
+        self.x_spec.set_dims_mapping([-1, 0, -1])
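+        # index is 0-d here, so the rule should clear its single dims_mapping
+        # entry and drop the gathered axis from the output's dims_mapping.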
self.index_spec.set_dims_mapping([0]) + + result_dist_attrs = self.rule.infer_forward( + self.x_spec, + self.index_spec, + self.attrs['axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 2) + self.assertEqual(len(infered_output_dist_attrs), 1) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1]) + self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [-1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, -1]) + self.index_spec.shape = [16] + + def test_multi_mesh_dim(self): + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]]) + self.x_spec.set_process_mesh(process_mesh) + self.index_spec.set_process_mesh(process_mesh) + + # axis = 1 + # [0, 1, -1], [1] --> [0, -1, -1], [1], [0, 1, -1] + self.attrs['axis'] = 1 + self.x_spec.set_dims_mapping([0, 1, -1]) + self.index_spec.set_dims_mapping([1]) + result_dist_attrs = self.rule.infer_forward( + self.x_spec, + self.index_spec, + self.attrs['axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 2) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, 1, -1]) + + # [0, 1, -1], [0] --> [0, -1, -1], [0], [0, -1, -1] + self.attrs['axis'] = 1 + self.x_spec.set_dims_mapping([0, 1, -1]) + self.index_spec.set_dims_mapping([0]) + result_dist_attrs = self.rule.infer_forward( + self.x_spec, + self.index_spec, + self.attrs['axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 2) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [0]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + + def test_reverse_multi_mesh_dim(self): + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]]) + self.x_spec.set_process_mesh(process_mesh) + self.index_spec.set_process_mesh(process_mesh) + self.out_spec = DistTensorSpec(self.x_spec) + + # axis = 1 + # [1, 0, -1] --> [1, -1, -1], [0], [1, 0, -1] + self.attrs['axis'] = 1 + self.out_spec.set_dims_mapping([1, 0, -1]) + result_dist_attrs = self.rule.infer_backward( + self.x_spec, + self.index_spec, + self.out_spec, + self.attrs['axis'], + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 2) + self.assertEqual(len(infered_output_dist_attrs), 1) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [1, -1, -1]) + self.assertEqual(infered_input_dist_attrs[1].dims_mapping, [0]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [1, 0, -1]) + + +if __name__ == "__main__": + unittest.main() From 2a05a3832e0c71876366342846d3ab95d2e296d9 Mon Sep 17 00:00:00 2001 From: Jia Wenxuan <64853160+JiaWenxuan@users.noreply.github.com> Date: Wed, 6 Mar 2024 14:58:37 +0800 Subject: [PATCH 018/114] 
fix ShapeOrData == error (#62437)

---
 paddle/pir/include/dialect/shape/utils/shape_or_data_expr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/pir/include/dialect/shape/utils/shape_or_data_expr.h b/paddle/pir/include/dialect/shape/utils/shape_or_data_expr.h
index b4a537a9a0d6b..b57fed0dab66c 100644
--- a/paddle/pir/include/dialect/shape/utils/shape_or_data_expr.h
+++ b/paddle/pir/include/dialect/shape/utils/shape_or_data_expr.h
@@ -60,7 +60,7 @@ class ShapeOrData {
   bool operator==(const ShapeOrData& other) const {
     if (data_.has_value() && !other.data_.has_value()) return false;
     if (!data_.has_value() && other.data_.has_value()) return false;
-    if (shape_.size() != shape_.size()) return false;
+    if (shape_.size() != other.shape_.size()) return false;
 
     if (data_.has_value() && other.data_.has_value()) {
       if (data_.value().size() != other.data_.value().size()) return false;

From 316fdfb23a9409bb739f6c62c79dd025920c037b Mon Sep 17 00:00:00 2001
From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com>
Date: Wed, 6 Mar 2024 15:01:23 +0800
Subject: [PATCH 019/114] [PIR] [DyShape] Add fix increment infer manual op
 (#62438)

* fix increment
* add increment_

---
 paddle/fluid/pir/dialect/operator/ir/manual_op.cc | 16 ++++++++++++++++
 paddle/fluid/pir/dialect/operator/ir/manual_op.h  |  4 ++++
 2 files changed, 20 insertions(+)

diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
index 5a930b04fdf64..f8e02c5b52d6d 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
@@ -3696,6 +3696,14 @@ phi::DataType IncrementOp::GetKernelTypeForVar(
   return expected_kernel_dtype;
 }
 
+bool IncrementOp::InferSymbolicShape(
+    pir::ShapeConstraintIRAnalysis *shape_analysis) {
+  const symbol::ShapeOrDataDimExprs &operand_shape_or_data =
+      shape_analysis->GetShapeOrDataForValue(x());
+  shape_analysis->SetShapeOrDataForValue(out(), operand_shape_or_data);
+  return true;
+}
+
 const char *Increment_Op::attributes_name[1] = {"value"};
 
 OpInfoTuple Increment_Op::GetOpInfo() {
@@ -3878,6 +3886,14 @@ phi::DataType Increment_Op::GetKernelTypeForVar(
   return expected_kernel_dtype;
 }
 
+bool Increment_Op::InferSymbolicShape(
+    pir::ShapeConstraintIRAnalysis *shape_analysis) {
+  const symbol::ShapeOrDataDimExprs &operand_shape_or_data =
+      shape_analysis->GetShapeOrDataForValue(x());
+  shape_analysis->SetShapeOrDataForValue(out(), operand_shape_or_data);
+  return true;
+}
+
 OpInfoTuple AssignOut_Op::GetOpInfo() {
   std::vector<paddle::dialect::OpInputInfo> inputs = {
       paddle::dialect::OpInputInfo(
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.h b/paddle/fluid/pir/dialect/operator/ir/manual_op.h
index 1f8be853ddcf5..36feddf569dad 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_op.h
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.h
@@ -565,6 +565,7 @@ class IncrementOp
     : public pir::Op {
  public:
@@ -603,12 +604,14 @@ class IncrementOp
       const std::vector<std::vector<pir::Value>> &outputs,
       const std::vector<std::vector<pir::Value>> &out_grads,
       const std::vector<std::vector<bool>> &stop_gradients);
+  bool InferSymbolicShape(pir::ShapeConstraintIRAnalysis *shape_analysis);
 };
 
 class Increment_Op
     : public pir::Op {
@@ -648,6 +651,7 @@ class Increment_Op
       const std::vector<std::vector<pir::Value>> &outputs,
       const std::vector<std::vector<pir::Value>> &out_grads,
       const std::vector<std::vector<bool>> &stop_gradients);
+  bool InferSymbolicShape(pir::ShapeConstraintIRAnalysis *shape_analysis);
 };
 
 class AssignOut_Op

From ce649b1d58ba86493d9cd1f3ae11764e95806498 Mon Sep 17 00:00:00 2001
From: zhengzhonghui
Date: Wed, 6 Mar 2024 15:07:34
+0800 Subject: [PATCH 020/114] [AutoParallel] unify llama model && fix vpp unittest hang problem (#62294) * [AutoParallel] unify llama model * fix comment * fix hang bug && enable vpp unittest * polish * keep concrete_program.parameters in order --- .../jit/dy2static/program_translator.py | 4 +- .../jit/pir_dy2static/parameter_recorder.py | 8 +- .../hybrid_strategy/CMakeLists.txt | 8 + .../semi_auto_parallel_llama_model.py | 180 ++++++++---------- .../hybrid_strategy/testslist.csv | 1 + 5 files changed, 92 insertions(+), 109 deletions(-) diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 330ce0c146fac..bf82d0337f510 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -1395,7 +1395,9 @@ def pop(self, program): if params is None: return [] del self.params_dict[_program_hash(program)] - return list(params) + params = list(params) + params.sort(key=lambda x: x.name) + return params class InplaceMap: diff --git a/python/paddle/jit/pir_dy2static/parameter_recorder.py b/python/paddle/jit/pir_dy2static/parameter_recorder.py index 1c5aa2fd6981f..ef0440eaa981b 100644 --- a/python/paddle/jit/pir_dy2static/parameter_recorder.py +++ b/python/paddle/jit/pir_dy2static/parameter_recorder.py @@ -53,12 +53,12 @@ def pop(self, program): params = self.params_dict.get(hash_id) if params is None: return [], [] - params_values = [ - self.tensor2value[hash_id][id(x)] for x in list(params) - ] + params = list(params) + params.sort(key=lambda x: x.name) + params_values = [self.tensor2value[hash_id][id(x)] for x in params] del self.params_dict[hash_id] del self.tensor2value[hash_id] - return list(params), list(params_values) + return params, params_values class InplaceMap: diff --git a/test/auto_parallel/hybrid_strategy/CMakeLists.txt b/test/auto_parallel/hybrid_strategy/CMakeLists.txt index 063b1b5873e74..f6e31047c7b4e 100644 --- a/test/auto_parallel/hybrid_strategy/CMakeLists.txt +++ b/test/auto_parallel/hybrid_strategy/CMakeLists.txt @@ -81,3 +81,11 @@ if((WITH_GPU) AND (LINUX)) set_tests_properties(test_semi_auto_parallel_multi_inputs PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=HYBRID") endif() +if((WITH_GPU) AND (LINUX)) + py_test_modules( + test_semi_auto_parallel_llama_model_vpp MODULES + test_semi_auto_parallel_llama_model_vpp ENVS + "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_semi_auto_parallel_llama_model_vpp + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=HYBRID") +endif() diff --git a/test/auto_parallel/hybrid_strategy/semi_auto_parallel_llama_model.py b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_llama_model.py index 95a7d9670f663..6112db6aa9839 100644 --- a/test/auto_parallel/hybrid_strategy/semi_auto_parallel_llama_model.py +++ b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_llama_model.py @@ -35,17 +35,30 @@ def set_global_mesh(mesh): _global_mesh = mesh +def is_pp_enable(mesh): + return "pp" in mesh.dim_names + + def get_mesh(pp_idx=None): global _global_mesh mesh = _global_mesh assert _global_mesh is not None, "_global_mesh is not initialized!" 
if pp_idx is None: return mesh - if "pp" in _global_mesh.dim_names: + if is_pp_enable(mesh): mesh = _global_mesh.get_mesh_with_dim("pp")[pp_idx] return mesh +def global_mesh_starts_with_pp(): + global _global_mesh + mesh = _global_mesh + if is_pp_enable(mesh): + return _global_mesh.get_mesh_with_dim("pp") + else: + return mesh + + class LlamaRotaryEmbedding(nn.Layer): def __init__(self, dim, max_position_embeddings=2048, base=10000): super().__init__() @@ -348,20 +361,10 @@ def __init__(self, config): self.config = config def forward(self, hidden_states): - if paddle.in_dynamic_mode(): - variance = ( - hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) - ) - hidden_states = ( - paddle.rsqrt(variance + self.variance_epsilon) * hidden_states - ) - else: - variance = ( - hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) - ) - hidden_states = ( - paddle.rsqrt(variance + self.variance_epsilon) * hidden_states - ) + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = ( + paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + ) if self.weight.dtype in [paddle.float16, paddle.bfloat16]: hidden_states = paddle.cast(hidden_states, self.weight.dtype) @@ -489,24 +492,31 @@ def __init__(self, config): [dist.Replicate(), dist.Shard(1)], ) - def get_layer_ipp(layer_index): + def get_layer_pp_info(layer_index): global _global_mesh mesh = _global_mesh - if "pp" not in mesh.dim_names: - return None + if is_pp_enable(mesh) is False: + return None, False else: pp_degree = mesh.get_dim_size("pp") layer_per_stage = math.ceil( config.num_hidden_layers / pp_degree ) - return layer_index // layer_per_stage + input_need_reshard = layer_index % layer_per_stage == 0 + return layer_index // layer_per_stage, input_need_reshard + + decoder_layers = [] + self.next_pp_stage_indexes = [] + for i in range(config.num_hidden_layers): + pp_stage_id, input_need_reshard = get_layer_pp_info(i) + decoder_layers.append( + LlamaDecoderLayerAuto(config, False, pp_stage_id) + ) + if input_need_reshard: + self.next_pp_stage_indexes.append(i) + + self.layers = nn.LayerList(decoder_layers) - self.layers = nn.LayerList( - [ - LlamaDecoderLayerAuto(config, False, get_layer_ipp(i)) - for i in range(config.num_hidden_layers) - ] - ) self.norm = LlamaRMSNormAuto(config) self.gradient_checkpointing = False @@ -533,11 +543,6 @@ def _prepare_decoder_attention_mask( input_shape, past_key_values_length=past_key_values_length, ) - combined_attention_mask = dist.shard_tensor( - combined_attention_mask, - mesh, - [dist.Replicate() for _ in range(len(mesh._shape))], - ) expanded_attn_mask = ( expanded_attn_mask & combined_attention_mask ) @@ -579,14 +584,6 @@ def forward( use_cache if use_cache is not None else self.config.use_cache ) - if ( - not paddle.in_dynamic_mode() - and getattr(self.config, "virtual_pp_degree", 1) > 1 - ): - # NOTE: temprorary method to guarantee the later ops are placed on all ranks until meeting new annotaion. - full = dist.shard_op(paddle.full, get_mesh()) - full(shape=[1], fill_value=0) - # retrieve input_ids and inputs_embeds if input_ids is not None and inputs_embeds is not None: raise ValueError( @@ -610,14 +607,6 @@ def forward( cache_length = paddle.shape(past_key_values[0][0])[1] seq_length_with_past += cache_length - if ( - not paddle.in_dynamic_mode() - and getattr(self.config, "virtual_pp_degree", 1) > 1 - ): - # NOTE: temprorary method to guarantee the later ops are placed on pp stage 0 until meeting new annotaion. 
- full = dist.shard_op(paddle.full, get_mesh(0)) - full(shape=[1], fill_value=0) - if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -625,34 +614,13 @@ def forward( # [B, S, H] -> [S, B, H] inputs_embeds = paddle.transpose(inputs_embeds, [1, 0, 2]) - if ( - not paddle.in_dynamic_mode() - and getattr(self.config, "virtual_pp_degree", 1) > 1 - ): - # NOTE: temprorary method to guarantee the later ops are placed on all ranks until meeting new annotaion. - full = dist.shard_op(paddle.full, get_mesh()) - full(shape=[1], fill_value=0) - mesh = get_mesh() - else: - mesh = get_mesh(0) - + mesh = global_mesh_starts_with_pp() # embed positions if attention_mask is None: # [bs, seq_len] attention_mask = paddle.ones( (batch_size, seq_length_with_past), dtype=paddle.bool ) - - if position_ids is None: - position_ids = paddle.arange(seq_length, dtype="int64").expand( - (batch_size, seq_length) - ) - position_ids = dist.shard_tensor( - position_ids, - mesh, - [dist.Replicate() for _ in range(len(mesh._shape))], - ) - attention_mask = self._prepare_decoder_attention_mask( attention_mask, (batch_size, seq_length), @@ -660,6 +628,22 @@ def forward( inputs_embeds.dtype, mesh, ) # [bs, 1, seq_len, seq_len] + attention_mask = dist.shard_tensor( + attention_mask, + mesh, + [dist.Replicate() for _ in range(len(mesh._shape))], + ) + + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64").expand( + (batch_size, seq_length) + ) + position_ids = dist.shard_tensor( + position_ids, + mesh, + [dist.Replicate() for _ in range(len(mesh._shape))], + ) + if self.config.use_flash_attention: is_casual = is_casual_mask(attention_mask) if is_casual: @@ -674,7 +658,6 @@ def forward( all_self_attns = () if output_attentions else None next_decoder_cache = () if use_cache else None - pre_ipp = None for idx, (decoder_layer) in enumerate(self.layers): if output_hidden_states: all_hidden_states += (hidden_states,) @@ -682,36 +665,26 @@ def forward( past_key_values[idx] if past_key_values is not None else None ) - has_gradient = not hidden_states.stop_gradient - ipp = decoder_layer.ipp - - if ipp is not None and pre_ipp != ipp: - if ( - not paddle.in_dynamic_mode() - and getattr(self.config, "virtual_pp_degree", 1) > 1 - ): - hidden_states = dist.reshard( - hidden_states, - get_mesh(ipp), - self.placements, - ) - decoder_layer = dist.shard_op(decoder_layer, get_mesh(ipp)) - else: - hidden_states = dist.reshard( - hidden_states, - get_mesh(ipp), - self.placements, - ) - position_ids = dist.reshard( - position_ids, - get_mesh(ipp), - [dist.Shard(0), dist.Replicate()], - ) - attention_mask = dist.reshard( - attention_mask, - get_mesh(ipp), - [dist.Shard(0), dist.Replicate()], - ) + if not is_pp_enable(get_mesh()): + position_ids_input = position_ids + attention_mask_input = attention_mask + elif idx in self.next_pp_stage_indexes: + ipp = decoder_layer.ipp + position_ids_input = dist.reshard( + position_ids, + get_mesh(ipp), + [dist.Replicate(), dist.Replicate()], + ) + attention_mask_input = dist.reshard( + attention_mask, + get_mesh(ipp), + [dist.Replicate(), dist.Replicate()], + ) + hidden_states = dist.reshard( + hidden_states, + get_mesh(ipp), + self.placements, + ) if ( self.config.recompute @@ -720,8 +693,8 @@ def forward( layer_outputs = recompute( decoder_layer, hidden_states, - position_ids, - attention_mask, + position_ids_input, + attention_mask_input, output_attentions, past_key_value, use_cache, @@ -730,13 +703,12 @@ def forward( else: layer_outputs = decoder_layer( 
hidden_states, - position_ids, - attention_mask, + position_ids_input, + attention_mask_input, output_attentions, past_key_value, use_cache, ) - pre_ipp = ipp if type(layer_outputs) is tuple: hidden_states = layer_outputs[0] diff --git a/test/auto_parallel/hybrid_strategy/testslist.csv b/test/auto_parallel/hybrid_strategy/testslist.csv index 2fac60515b51a..65fc44806c055 100644 --- a/test/auto_parallel/hybrid_strategy/testslist.csv +++ b/test/auto_parallel/hybrid_strategy/testslist.csv @@ -9,3 +9,4 @@ test_semi_auto_parallel_hybrid_sharding_strategy,LINUX,GPU,120,HYBRID,test_runne test_global_mesh_reshard,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_semi_auto_parallel_global_input,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_semi_auto_parallel_multi_inputs,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_semi_auto_parallel_llama_model_vpp,LINUX,GPU,180,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., From af00becf582ebcd7685fa8e6b87ffb47c798c83f Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Wed, 6 Mar 2024 15:35:36 +0800 Subject: [PATCH 021/114] [Prim] Optimize composite OP silu_double_grad (#62112) * optimize composite OP silu_double_grad * correct computation equation * use grad_x_grad_mul_sigmoid to reduce duplicated computation --- .../api/composite_backward/composite_double_backward_api.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h index 02bd7e29443c0..9a1c3ec4d2112 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_double_backward_api.h @@ -443,12 +443,13 @@ void silu_double_grad(const Tensor& x, auto sigmoid = 1 / (1 + exp(-x)); auto tmp1 = 1 - sigmoid; auto tmp2 = 1 + tmp1 * x; + auto grad_x_grad_mul_sigmoid = grad_x_grad * sigmoid; if (grad_out_grad) { - auto ddout = grad_x_grad * sigmoid * tmp2; + auto ddout = grad_x_grad_mul_sigmoid * tmp2; set_output(ddout, grad_out_grad); } if (grad_x) { - auto dx = sigmoid * grad_x_grad * out_grad * (1 + (tmp2 - out)) * tmp1; + auto dx = grad_x_grad_mul_sigmoid * out_grad * (1 + (tmp2 - out)) * tmp1; set_output(dx, grad_x); } } From dcf2de5efc264b108fd730a89a942701c5816a65 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 6 Mar 2024 16:17:16 +0800 Subject: [PATCH 022/114] [CINN]support spatial dynamic (#62444) * support spatial dynamic * fix bug --- .../hlir/framework/pir/op_lowering_impl.cc | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index dbecb0f72ad52..466733491cea7 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -97,18 +97,27 @@ std::shared_ptr OpLowererImpl::GetGroupTileInfo( int64_t spatial_numel = 1; int64_t reduce_numel = 1; + bool spatial_is_dynamic = false; + bool reduce_is_dynamic = false; for (int64_t i = 0; i < group_tile_info->data_rank; ++i) { if (reduce_set.count(i)) { reduce_numel *= data_dim[i]; + if (data_dim[i] < 0) { + reduce_is_dynamic = true; + } } else { spatial_numel *= data_dim[i]; + + if (data_dim[i] < 0) { + spatial_is_dynamic = 
true; + } } } - PADDLE_ENFORCE_GT( - reduce_numel, - 0, - phi::errors::Unimplemented("negative reduce numel or flaten numel")); + PADDLE_ENFORCE_EQ( + reduce_is_dynamic, + false, + phi::errors::Unimplemented("not support dynamic reduce yet")); int64_t reduce_block = 1; int64_t spatial_block = 1; @@ -119,16 +128,13 @@ std::shared_ptr OpLowererImpl::GetGroupTileInfo( if (reduce_numel == 1) { reduce_block = 1; - if (spatial_numel < 0) { + if (spatial_is_dynamic) { spatial_block = 1024; reduce_inner_num = 1; - warp_num = spatial_block / 128; + warp_num = 8; - spatial_inner_num = spatial_block / (warp_num * 32); - if (spatial_inner_num == 0) { - spatial_inner_num = 1; - } + spatial_inner_num = 4; group_tile_info->block_num = -1; } else { From de777d856f2f81d700082ab300a94582625ff2b0 Mon Sep 17 00:00:00 2001 From: Shaopeng Ling Date: Wed, 6 Mar 2024 16:43:25 +0800 Subject: [PATCH 023/114] [HACKATHON 6th][CMake Optimization] use new cmake policy CMP0135 for third party dependences (#62454) --- cmake/third_party.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 2d8020adcf7d0..4723110a7b57a 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -15,6 +15,11 @@ include(ExternalProject) # Create a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac) +# Avoid warning about DOWNLOAD_EXTRACT_TIMESTAMP in CMake 3.24 +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") + cmake_policy(SET CMP0135 NEW) +endif() + set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING From 3de4a22a1de7086885f7c7d6ee426ad5e6853d10 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Wed, 6 Mar 2024 17:02:17 +0800 Subject: [PATCH 024/114] support dist tensor in reshape api (#62420) --- paddle/fluid/pybind/eager_method.cc | 31 ++++++++++++ test/auto_parallel/CMakeLists.txt | 2 +- .../semi_auto_parallel_for_item.py | 47 +++++++++++++++++++ .../semi_auto_parallel_for_reshape.py | 11 +++++ .../test_semi_auto_parallel_basic.py | 10 ++++ 5 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 test/auto_parallel/semi_auto_parallel_for_item.py diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 16d5fea43fe76..a1520075e03ee 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1449,10 +1449,41 @@ static PyObject* tensor__getitem_from_offset(TensorObject* self, PyObject* kwargs) { EAGER_TRY phi::DenseTensor* ptr = nullptr; + phi::DenseTensor tensor_after_reshard; if (self->tensor.is_selected_rows()) { auto* selected_rows = static_cast(self->tensor.impl().get()); ptr = static_cast(selected_rows->mutable_value()); + } else if (self->tensor.is_dist_tensor()) { +#ifdef PADDLE_WITH_DISTRIBUTE + auto* dist_tensor = + static_cast(self->tensor.impl().get()); + PADDLE_ENFORCE( + dist_tensor->initialized(), + paddle::platform::errors::Fatal( + "The input dist tensor can't be uninitialized for we don't " + "know the correct mesh to be reshard.")); + const auto& placements = dist_tensor->placements(); + bool need_reshard = false; + for (const auto& placement : placements) { + if (!placement->is_replicated()) { + need_reshard = true; + break; + } + } + if (need_reshard) { + tensor_after_reshard = ReshardXToReplicated(dist_tensor); + ptr = &tensor_after_reshard; + } else { + ptr = dist_tensor->unsafe_mutable_value(); + } +#else + PADDLE_THROW(platform::errors::Unavailable( + "The 
`_getitem_from_offset` method of (Dist)Tensor is not supported " + "in the current PaddlePaddle, please recompile and install " + "PaddlePaddle " + "with the option of `WITH_DISTRIBUTE=ON`.")); +#endif } else { ptr = static_cast(self->tensor.impl().get()); } diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt index a72e7831e1a13..1d448cb5f6ecb 100644 --- a/test/auto_parallel/CMakeLists.txt +++ b/test/auto_parallel/CMakeLists.txt @@ -184,7 +184,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100) py_test_modules(test_dist_tensor_api MODULES test_dist_tensor_api) set_tests_properties(test_dist_tensor_api - PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100) + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 200) py_test_modules(test_gpt_with_pir MODULES test_gpt_with_pir) set_tests_properties(test_gpt_with_pir PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100) diff --git a/test/auto_parallel/semi_auto_parallel_for_item.py b/test/auto_parallel/semi_auto_parallel_for_item.py new file mode 100644 index 0000000000000..245da5f6646cd --- /dev/null +++ b/test/auto_parallel/semi_auto_parallel_for_item.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
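For context on the new `semi_auto_parallel_for_item.py` case whose file header appears above: `Tensor.item()` lowers to `_getitem_from_offset`, and the eager_method.cc change earlier in this patch gathers a dist tensor back to replicated whenever any placement is not replicated, so an element read works even when the requested row lives on another rank. A condensed sketch of that behavior (mesh and shapes assumed; meant to run under a two-rank launch):

```python
import paddle
import paddle.distributed as dist

mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
x = paddle.arange(48, dtype="float32").reshape([6, 8])
d = dist.shard_tensor(x, mesh, [dist.Shard(0)])  # each rank holds 3 rows

# Row 3 is stored on only one rank; the reshard-to-replicated fallback
# gathers the shards first, so both ranks read the same scalar.
assert d.item(3, 5) == x[3][5].item()
```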
+ +import numpy as np +from semi_auto_parallel_util import SemiAutoParallelTestBase + +import paddle +import paddle.distributed as dist + + +class TestItemApiForSemiAutoParallel(SemiAutoParallelTestBase): + def __init__(self): + super().__init__() + paddle.seed(self._seed) + np.random.seed(self._seed) + + def test_item_api(self): + mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + a = paddle.rand(shape=[6, 8]) + b = dist.shard_tensor(a, mesh, [dist.Shard(0)]) + np.testing.assert_equal(b.item(0, 0), a[0][0].item()) + np.testing.assert_equal(b.item(3, 5), a[3][5].item()) + + def run_test_case(self): + if self._backend == "cpu": + paddle.set_device("cpu") + elif self._backend == "gpu": + paddle.set_device("gpu:" + str(dist.get_rank())) + else: + raise ValueError("Only support cpu or gpu backend.") + + self.test_item_api() + + +if __name__ == '__main__': + TestItemApiForSemiAutoParallel().run_test_case() diff --git a/test/auto_parallel/semi_auto_parallel_for_reshape.py b/test/auto_parallel/semi_auto_parallel_for_reshape.py index ac194353655b7..44ca5a0c226b5 100644 --- a/test/auto_parallel/semi_auto_parallel_for_reshape.py +++ b/test/auto_parallel/semi_auto_parallel_for_reshape.py @@ -55,6 +55,16 @@ def test_reshape_infer_shape(self): assert y.shape == [30, 20, 10] assert y._local_shape == [15, 20, 10] + def test_shape_api_with_reshape(self): + mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + a = paddle.rand(shape=[4, 6, 8]) + b = dist.shard_tensor(a, mesh, [dist.Shard(0)]) + + dist_shape = paddle.shape(b) + b = b.reshape((-1, dist_shape[-1])) + assert b.shape == [24, 8] + assert b._local_shape == [12, 8] + def run_test_case(self): if self._backend == "cpu": paddle.set_device("cpu") @@ -64,6 +74,7 @@ def run_test_case(self): raise ValueError("Only support cpu or gpu backend.") self.test_reshape_forward() self.test_reshape_infer_shape() + self.test_shape_api_with_reshape() if __name__ == '__main__': diff --git a/test/auto_parallel/test_semi_auto_parallel_basic.py b/test/auto_parallel/test_semi_auto_parallel_basic.py index 91b826e8142a8..6b0204fc0fe8c 100644 --- a/test/auto_parallel/test_semi_auto_parallel_basic.py +++ b/test/auto_parallel/test_semi_auto_parallel_basic.py @@ -200,6 +200,16 @@ def test_reshape_api(self): user_defined_envs=envs, ) + def test_item_api(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_for_item.py", + user_defined_envs=envs, + ) + def test_squeeze_api(self): envs_list = test_base.gen_product_envs_list( self._default_envs, self._changeable_envs From 948a1b0be1d581bea83f3f59c7422f35965215ab Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Wed, 6 Mar 2024 17:04:28 +0800 Subject: [PATCH 025/114] fix bugs (#62428) --- tools/auto_parallel/ci_case_unit.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/auto_parallel/ci_case_unit.sh b/tools/auto_parallel/ci_case_unit.sh index 0747cb4bb0c4d..b3c250858ee2f 100644 --- a/tools/auto_parallel/ci_case_unit.sh +++ b/tools/auto_parallel/ci_case_unit.sh @@ -31,6 +31,7 @@ function case_list_unit() { case_name=`awk -F, 'NR=='$i' {print $1}' testslist.csv` if [[ ${target_key} != "all" ]] && [[ ! 
${case_name} =~ ${target_key} ]]; then echo "=========== skip $case_name run ===========" + continue else echo "=========== $case_name run begin ===========" fi @@ -51,13 +52,13 @@ main() { export exec_case=$1 echo -e "\033[31m ---- Start executing $exec_case case \033[0m" - if [[ $exec_case =~ "auto_unit_test" ]];then + if [[ $exec_case == "auto_unit_test" ]];then cd ${auto_case_path} case_list_unit - elif [[ $exec_case =~ "dygraph_unit_test" ]];then + elif [[ $exec_case == "dygraph_unit_test" ]];then cd ${dygraph_case_path} case_list_unit - elif [[ $exec_case =~ "llama_auto_unit_test" ]];then + elif [[ $exec_case == "llama_auto_unit_test" ]];then cd ${auto_case_path} case_list_unit llama else From eb639c6017156f8150c91cce4cf0109a2924f4da Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 6 Mar 2024 18:04:20 +0800 Subject: [PATCH 026/114] Fix check_depency check_dependency, etc (#62458) --- .../group_merge/group_with_group_merge_pass.cc | 2 +- .../group_merge/group_with_group_merge_util.h | 6 +++--- .../group_merge/op_with_group_merge_util.h | 14 +++++++------- paddle/cinn/hlir/pass/fusion_merge_pass.cc | 6 +++--- paddle/cinn/hlir/pass/fusion_merge_pass_util.h | 4 ++-- paddle/cinn/hlir/pass/general_fusion_merge_pass.cc | 2 +- paddle/cinn/hlir/pass/op_fusion_pass_util.h | 10 +++++----- .../paddle2cinn/cinn_subgraph_detector.cc | 8 ++++---- .../framework/paddle2cinn/cinn_subgraph_detector.h | 2 +- 9 files changed, 27 insertions(+), 27 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc index 7ee55cc7c9396..4b5f65747e929 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc @@ -2220,7 +2220,7 @@ class GeneralFusionMergePassHelper { GroupList GeneralFusionMergePassInternal(const GroupList& group_list) { if (group_list.size() <= 1) { - VLOG(3) << "Don't do Fusoin Merge Pass...!"; + VLOG(3) << "Don't do Fusion Merge Pass...!"; return group_list; } diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h index f6c17ae28ebfb..f04ee9212f9f3 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h @@ -146,7 +146,7 @@ inline bool horizontal_elementwise_fuse_reduce( auto ele_node_shape = GetValueShape((*ele_group->master_ops.begin())->result(0)); int32_t size_ele = ::common::product(ele_node_shape); - // TODO(phlrain): seems extrame danger herem, why compare multi Master Node? + // TODO(phlrain): seems extreme danger here, why compare multi Master Node? 
for (auto* master : reduce_group->master_ops) { auto master_node_shape = GetValueShape(master->result(0)); int32_t size_master = ::common::product(master_node_shape); @@ -349,7 +349,7 @@ inline bool horizontal_relation(const std::shared_ptr& first, }; auto selected_nodes = select_node_set(second_set, op_pattern_kind); - auto check_depency = [&](::pir::Operation* node) { + auto check_dependency = [&](::pir::Operation* node) { std::queue<::pir::Operation*> candidates; std::unordered_set<::pir::Operation*> visited_set; candidates.push(node); @@ -381,7 +381,7 @@ inline bool horizontal_relation(const std::shared_ptr& first, }; for (auto node : selected_nodes) { - if (check_depency(node)) { + if (check_dependency(node)) { return false; } } diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h index 038e49b8b553a..4fbe41385ec62 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_util.h @@ -181,7 +181,7 @@ inline bool reduce_fuse_reduce( inline bool is_horizontal_relation(::pir::Operation* producer, const std::shared_ptr& consumer) { - auto check_depency = [&](::pir::Operation* op) { + auto check_dependency = [&](::pir::Operation* op) { std::queue<::pir::Operation*> candidates; std::unordered_set<::pir::Operation*> visited_set; candidates.push(op); @@ -192,7 +192,7 @@ inline bool is_horizontal_relation(::pir::Operation* producer, // visit all producer op for (size_t i = 0; i < candidate->num_operands(); ++i) { auto tmp_op = candidate->operand_source(i).defining_op(); - // check depency. + // check dependency. if (producer == tmp_op) { return true; } @@ -216,7 +216,7 @@ inline bool is_horizontal_relation(::pir::Operation* producer, consumer->op_pattern_kind) { continue; } - if (check_depency(op)) { + if (check_dependency(op)) { return false; } } @@ -276,22 +276,22 @@ inline bool horizontal_or_vertical_reduce_relation( return false; } - int succesive_reduce_dimension = reduce_shape.at(reduce_axes.back()); + int successive_reduce_dimension = reduce_shape.at(reduce_axes.back()); for (int idx = reduce_axes.size() - 2; idx >= 0; --idx) { if (reduce_axes[idx] == reduce_axes[idx + 1] - 1) { - succesive_reduce_dimension *= reduce_shape[reduce_axes[idx]]; + successive_reduce_dimension *= reduce_shape[reduce_axes[idx]]; continue; } break; } // helper->target_ == cinn::common::DefaultNVGPUTarget() - // succesive_reduce_dimension <= helper->target_.max_num_threads() + // successive_reduce_dimension <= helper->target_.max_num_threads() // TODO(phlrain): support is_gpu_target and max_thread bool is_gpu_target = true; int max_thread = 32 * 1024; return is_gpu_target - ? (succesive_reduce_dimension <= max_thread ? true : false) + ? (successive_reduce_dimension <= max_thread ? 
true : false) : true; } diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass.cc b/paddle/cinn/hlir/pass/fusion_merge_pass.cc index eb251fca8608e..9381ba0f5b2f3 100644 --- a/paddle/cinn/hlir/pass/fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/fusion_merge_pass.cc @@ -199,13 +199,13 @@ class FusionMergePassHelper : public FusionHelperBase { // check dependency if (IsDependencySimplify(producer, candidate, candidates)) { VLOG(4) << "IsDependencySimplify, Can't fuse " << candidate->group_id - << ", As it depency others!"; + << ", As it dependency others!"; continue; } if (IsDependency(producer, candidate, candidates)) { VLOG(4) << "IsDependency, Can't fuse " << candidate->group_id - << ", As it depency others!"; + << ", As it dependency others!"; continue; } @@ -698,7 +698,7 @@ class FusionMergePassHelper : public FusionHelperBase { sub_group->nodes.insert(sub_group->nodes.begin(), producer->CollectNodes()[0]); sub_group->nodes_set.insert(producer->CollectNodes()[0]); - // remove depency. + // remove dependency. consumer->input_nodes.erase(producer->CollectNodes()[0]); consumer->mut_producer_groups()->erase(producer); producer->mut_consumer_groups()->erase(consumer); diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass_util.h b/paddle/cinn/hlir/pass/fusion_merge_pass_util.h index 219d08d7d08e6..5541ec09bc178 100644 --- a/paddle/cinn/hlir/pass/fusion_merge_pass_util.h +++ b/paddle/cinn/hlir/pass/fusion_merge_pass_util.h @@ -330,7 +330,7 @@ inline bool horizontal_relation( }; auto selected_nodes = select_node_set(second_set, op_pattern_kind); - auto check_depency = [&](const Node* node) { + auto check_dependency = [&](const Node* node) { std::queue candidates; std::unordered_set visited_set; candidates.push(node); @@ -360,7 +360,7 @@ inline bool horizontal_relation( }; for (auto node : selected_nodes) { - if (check_depency(node)) { + if (check_dependency(node)) { return false; } } diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc index 65d0d9eb7c243..d527223cff158 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc @@ -833,7 +833,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { sub_group->nodes.insert(sub_group->nodes.begin(), producer->CollectNodes()[0]); sub_group->nodes_set.insert(producer->CollectNodes()[0]); - // remove depency. + // remove dependency. 
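// (Gloss on the block above: the producer's node was just copied into the
// consumer's first sub-group, so the producer-consumer edge is now internal
// to the fused group, and the explicit links are erased below to keep the
// group graph consistent.)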
consumer->input_nodes.erase(producer->CollectNodes()[0]); consumer->mut_producer_groups()->erase(producer); producer->mut_consumer_groups()->erase(consumer); diff --git a/paddle/cinn/hlir/pass/op_fusion_pass_util.h b/paddle/cinn/hlir/pass/op_fusion_pass_util.h index c8af3db911689..12eece98e1327 100644 --- a/paddle/cinn/hlir/pass/op_fusion_pass_util.h +++ b/paddle/cinn/hlir/pass/op_fusion_pass_util.h @@ -124,7 +124,7 @@ CONDITION_FUNC(reduce_fuse_reduce) { } CONDITION_FUNC(is_horizontal_relation) { - auto check_depency = [&](const Node* node) { + auto check_dependency = [&](const Node* node) { std::queue candidates; std::unordered_set visited_set; candidates.push(node); @@ -157,7 +157,7 @@ CONDITION_FUNC(is_horizontal_relation) { if (helper->GetOpKind(node) != consumer->op_pattern_kind) { continue; } - if (check_depency(node)) { + if (check_dependency(node)) { return false; } } @@ -207,17 +207,17 @@ CONDITION_FUNC(horizontal_or_vertical_reduce_relation) { return false; } - int succesive_reduce_dimension = reduce_shape.at(reduce_axes.back()); + int successive_reduce_dimension = reduce_shape.at(reduce_axes.back()); for (int idx = reduce_axes.size() - 2; idx >= 0; --idx) { if (reduce_axes[idx] == reduce_axes[idx + 1] - 1) { - succesive_reduce_dimension *= reduce_shape[reduce_axes[idx]]; + successive_reduce_dimension *= reduce_shape[reduce_axes[idx]]; continue; } break; } return helper->target_ == cinn::common::DefaultNVGPUTarget() - ? (succesive_reduce_dimension <= helper->target_.max_num_threads() + ? (successive_reduce_dimension <= helper->target_.max_num_threads() ? true : false) : true; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc index dc36f40d9c6a3..c5a838bc66f8f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.cc @@ -169,11 +169,11 @@ bool CinnSubgraphDetector::FuseSubGraph(CinnSubGraphPtr subgraph_ptr) { if (!consumer->substitute) { continue; } - // fast depency check. + // fast dependency check. if (IsDependencySimplify(producer, consumer, consumers)) { continue; } - // global depency check. + // global dependency check. if (IsDependency(producer, consumer, consumers)) { continue; } @@ -196,7 +196,7 @@ bool CinnSubgraphDetector::FuseSubGraph(CinnSubGraphPtr subgraph_ptr) { producer->node_set.insert(candidate->node_set.begin(), candidate->node_set.end()); - // update bound for check depency + // update bound for check dependency producer->max_depth = std::max(producer->max_depth, candidate->max_depth); producer->min_depth = std::min(producer->min_depth, candidate->min_depth); @@ -219,7 +219,7 @@ bool CinnSubgraphDetector::FuseSubGraph(CinnSubGraphPtr subgraph_ptr) { tmp->producers.erase(candidate); } - // remove candicate in producer/consumer + // remove candidate in producer/consumer producer->producers.erase(candidate); producer->consumers.erase(candidate); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h index e8ff3915c8511..7b02761b9e855 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_subgraph_detector.h @@ -78,7 +78,7 @@ class CinnSubgraphDetector { // SubGraph Fusion void DoSubGraphFusion(); bool FuseSubGraph(CinnSubGraphPtr); - // check exist depency. + // check exist dependency. 
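// (Why the check exists: fusing producer P into consumer C is only safe if
// no third subgraph X sits on a path P -> X -> C; merging across such an X
// would leave X both consuming and feeding the fused group, i.e. a cycle.
// IsDependencySimplify is the fast reject built on the min/max depth bounds
// FuseSubGraph maintains; IsDependency is the full reachability walk --
// a gloss inferred from the call sites above, not part of the patch.)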
bool IsDependency(const CinnSubGraphPtr &, const CinnSubGraphPtr &, const std::unordered_set &); From 7bfde2483b18998d2fb89a5fff8ff6b10f8d1669 Mon Sep 17 00:00:00 2001 From: co63oc Date: Wed, 6 Mar 2024 18:26:20 +0800 Subject: [PATCH 027/114] Fix GetFusableConsumerGroupLists GetFusibleConsumerGroupLists, etc (#62459) --- .../group_with_group_merge_pass.cc | 32 +++++++++---------- paddle/cinn/hlir/framework/op_lowering_impl.h | 4 +-- .../hlir/framework/op_lowering_impl_base.h | 4 +-- .../cinn/hlir/framework/op_lowering_util.cc | 2 +- .../hlir/framework/pir/op_lowering_impl.h | 4 +-- paddle/cinn/hlir/pass/fusion_merge_pass.cc | 2 +- .../hlir/pass/general_fusion_merge_pass.cc | 32 +++++++++---------- paddle/cinn/hlir/pass/opfusion.cc | 10 +++--- paddle/cinn/hlir/pass/reduce_split_pass.cc | 2 +- .../hlir/pass/single_group_optimize_pass.cc | 2 +- 10 files changed, 47 insertions(+), 47 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc index 4b5f65747e929..81606a320cdcc 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc @@ -1328,7 +1328,7 @@ class GeneralFusionMergePassHelper { bool GeneralHorizontalFuse(const GroupPtr& producer) { VLOG(3) << "GeneralHorizontalFuse handling producer : " << producer->group_id; - const auto& GetFusableConsumerGroupLists = + const auto& GetFusibleConsumerGroupLists = [&]() -> std::vector { std::vector tagged_lists; const auto& MarkFusible = [&](const OpGroupList& candidates) { @@ -1339,8 +1339,8 @@ class GeneralFusionMergePassHelper { EnableFusedHorizontalGroups(&fuse_ctx); return tagged_lists; }; - const auto& GetFusableConsumerGroupList = [&]() -> std::vector { - const auto& group_lists = GetFusableConsumerGroupLists(); + const auto& GetFusibleConsumerGroupList = [&]() -> std::vector { + const auto& group_lists = GetFusibleConsumerGroupLists(); if (group_lists.empty()) { return std::vector{}; } @@ -1355,7 +1355,7 @@ class GeneralFusionMergePassHelper { return ret; }; - const auto& group_lists = GetFusableConsumerGroupList(); + const auto& group_lists = GetFusibleConsumerGroupList(); if (group_lists.empty()) { return false; } @@ -1387,7 +1387,7 @@ class GeneralFusionMergePassHelper { bool CallGeneralInputFusePass( const std::unordered_set& consumers) { VLOG(3) << "CallGeneralInputFusePass...!"; - const auto& GetFusableConsumerGroupLists = + const auto& GetFusibleConsumerGroupLists = [&]() -> std::vector { std::vector tagged_lists; const auto& MarkFusible = [&](const OpGroupList& candidates) { @@ -1402,8 +1402,8 @@ class GeneralFusionMergePassHelper { EnableFusedInputGroups(&fuse_ctx); return tagged_lists; }; - const auto& GetFusableConsumerGroupList = [&]() -> std::vector { - const auto& group_lists = GetFusableConsumerGroupLists(); + const auto& GetFusibleConsumerGroupList = [&]() -> std::vector { + const auto& group_lists = GetFusibleConsumerGroupLists(); if (group_lists.empty()) { return std::vector{}; } @@ -1418,7 +1418,7 @@ class GeneralFusionMergePassHelper { return ret; }; - const auto& group_lists = GetFusableConsumerGroupList(); + const auto& group_lists = GetFusibleConsumerGroupList(); if (group_lists.empty()) { return false; } @@ -1613,7 +1613,7 @@ class GeneralFusionMergePassHelper { bool GeneralVerticalFuse(const GroupPtr& producer) { VLOG(3) << 
"GeneralVerticalFuse...!"; using GroupSets = std::vector>; - const auto& GetFusableConsumerOpGroupSets = [&]() -> GroupSets { + const auto& GetFusibleConsumerOpGroupSets = [&]() -> GroupSets { GroupSets tagged_sets; const auto& MarkFusible = [&](const OpGroupPtr& first, const OpGroupPtr& second) { @@ -1625,9 +1625,9 @@ class GeneralFusionMergePassHelper { return tagged_sets; }; - auto GetFusableConsumerGroupSet = + auto GetFusibleConsumerGroupSet = [&]() -> std::unordered_set { - const auto& group_sets = GetFusableConsumerOpGroupSets(); + const auto& group_sets = GetFusibleConsumerOpGroupSets(); if (group_sets.empty()) { return {}; } @@ -1639,7 +1639,7 @@ class GeneralFusionMergePassHelper { }; bool update = false; - auto consumer_groups = GetFusableConsumerGroupSet(); + auto consumer_groups = GetFusibleConsumerGroupSet(); if (consumer_groups.size()) { SelectConsumerToFuse(producer, &consumer_groups); } @@ -1868,7 +1868,7 @@ class GeneralFusionMergePassHelper { VLOG(3) << "GeneralRecomputeFuse handling producer : " << producer->group_id; using GroupSets = std::set>; - const auto& GetFusableConsumerOpGroupSets = [&]() -> GroupSets { + const auto& GetFusibleConsumerOpGroupSets = [&]() -> GroupSets { GroupSets tagged_sets; const auto& MarkFusible = [&](const OpGroupPtr& first, const OpGroupPtr& second) { @@ -1880,9 +1880,9 @@ class GeneralFusionMergePassHelper { return tagged_sets; }; - auto GetFusableConsumerGroupSet = + auto GetFusibleConsumerGroupSet = [&]() -> std::unordered_set { - const auto& group_sets = GetFusableConsumerOpGroupSets(); + const auto& group_sets = GetFusibleConsumerOpGroupSets(); if (group_sets.empty()) { return {}; } @@ -1894,7 +1894,7 @@ class GeneralFusionMergePassHelper { }; bool update = false; - auto consumer_groups = GetFusableConsumerGroupSet(); + auto consumer_groups = GetFusibleConsumerGroupSet(); if (consumer_groups.size() > 0) { CHECK(consumer_groups.size() == producer->mut_consumer_groups()->size()) << "Recompute requires fuse all consumers!"; diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.h b/paddle/cinn/hlir/framework/op_lowering_impl.h index 80c79b3c64b8d..ef18def90affc 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl.h @@ -28,9 +28,9 @@ #include "paddle/cinn/lang/packed_func.h" // Fusion Op lowering, there are four kinds of lowering function: -// Elementwise/Broadcast/Injective,Reduce,OutEWiseFusable,NonFusible. +// Elementwise/Broadcast/Injective,Reduce,OutEWiseFusible,NonFusible. // Elementwise/Broadcast/Injective Ops is with same schedule. -// Reduce,OutEWiseFusable,NonFusible are using different schedule. +// Reduce,OutEWiseFusible,NonFusible are using different schedule. namespace cinn { namespace hlir { diff --git a/paddle/cinn/hlir/framework/op_lowering_impl_base.h b/paddle/cinn/hlir/framework/op_lowering_impl_base.h index edd5c6e8e627e..4d5284f22f6ed 100644 --- a/paddle/cinn/hlir/framework/op_lowering_impl_base.h +++ b/paddle/cinn/hlir/framework/op_lowering_impl_base.h @@ -19,9 +19,9 @@ #include "paddle/cinn/ir/lowered_func.h" // Fusion Op lowering, there are four kinds of lowering function: -// Elementwise/Broadcast/Injective,Reduce,OutEWiseFusable,NonFusible. +// Elementwise/Broadcast/Injective,Reduce,OutEWiseFusible,NonFusible. // Elementwise/Broadcast/Injective Ops is with same schedule. -// Reduce,OutEWiseFusable,NonFusible are using different schedule. +// Reduce,OutEWiseFusible,NonFusible are using different schedule. 
namespace cinn { namespace hlir { diff --git a/paddle/cinn/hlir/framework/op_lowering_util.cc b/paddle/cinn/hlir/framework/op_lowering_util.cc index 2366fd584aa0b..ed9e29d7ac8d6 100644 --- a/paddle/cinn/hlir/framework/op_lowering_util.cc +++ b/paddle/cinn/hlir/framework/op_lowering_util.cc @@ -805,7 +805,7 @@ void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch, // NOLINT ir_sch.Fuse(block_name, {axes[index + 1], axes[index + 1] + 1}); } LoopOrderAssignReduce(ir_sch, block_name, first_axes, target, true); - // fuse axis before reduce to bind blockidx. + // fuse axis before reduce to bind block idx. for (int idx = 0; idx < static_cast(inshape.size() - axes.size()) - 1; ++idx) { ir_sch.Fuse(block_name, {0, 1}); diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h index c449e7dcc2efa..ad61d045d3ea0 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.h +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.h @@ -30,9 +30,9 @@ #include "paddle/pir/include/core/operation.h" // Fusion Op lowering, there are four kinds of lowering function: -// Elementwise/Broadcast/Injective,Reduce,OutEWiseFusable,NonFusible. +// Elementwise/Broadcast/Injective,Reduce,OutEWiseFusible,NonFusible. // Elementwise/Broadcast/Injective Ops is with same schedule. -// Reduce,OutEWiseFusable,NonFusible are using different schedule. +// Reduce,OutEWiseFusible,NonFusible are using different schedule. namespace cinn { namespace hlir { diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass.cc b/paddle/cinn/hlir/pass/fusion_merge_pass.cc index 9381ba0f5b2f3..472cbd9a07e07 100644 --- a/paddle/cinn/hlir/pass/fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/fusion_merge_pass.cc @@ -55,7 +55,7 @@ class FusionMergePassHelper : public FusionHelperBase { } GroupList operator()() { - // run fusion merge untill no update. + // run fusion merge until no update. 
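// (Gloss: DoFusionMerge repeats its fusion sweeps while any sweep reports an
// update -- the "until no update" noted above -- and only then is
// fusion_groups_ read back as the pass result.)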
DoFusionMerge(); for (auto& group : fusion_groups_) { VLOG(3) << "Fusion Group -> " << group->group_id; diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc index d527223cff158..bf0ffd2265362 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc @@ -244,7 +244,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { bool GeneralHorizontalFuse(const GroupPtr& producer) { VLOG(3) << "GeneralHorizontalFuse handling producer : " << producer->group_id; - const auto& GetFusableConsumerGroupLists = + const auto& GetFusibleConsumerGroupLists = [&]() -> std::vector { std::vector tagged_lists; const auto& MarkFusible = [&](const OpGroupList& candidates) { @@ -255,8 +255,8 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { EnableFusedHorizontalGroups(&fuse_ctx); return tagged_lists; }; - const auto& GetFusableConsumerGroupList = [&]() -> std::vector { - const auto& group_lists = GetFusableConsumerGroupLists(); + const auto& GetFusibleConsumerGroupList = [&]() -> std::vector { + const auto& group_lists = GetFusibleConsumerGroupLists(); if (group_lists.empty()) { return std::vector{}; } @@ -271,7 +271,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { return ret; }; - const auto& group_lists = GetFusableConsumerGroupList(); + const auto& group_lists = GetFusibleConsumerGroupList(); if (group_lists.empty()) { return false; } @@ -303,7 +303,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { bool CallGeneralInputFusePass( const std::unordered_set& consumers) { VLOG(3) << "CallGeneralInputFusePass...!"; - const auto& GetFusableConsumerGroupLists = + const auto& GetFusibleConsumerGroupLists = [&]() -> std::vector { std::vector tagged_lists; const auto& MarkFusible = [&](const OpGroupList& candidates) { @@ -318,8 +318,8 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { EnableFusedInputGroups(&fuse_ctx); return tagged_lists; }; - const auto& GetFusableConsumerGroupList = [&]() -> std::vector { - const auto& group_lists = GetFusableConsumerGroupLists(); + const auto& GetFusibleConsumerGroupList = [&]() -> std::vector { + const auto& group_lists = GetFusibleConsumerGroupLists(); if (group_lists.empty()) { return std::vector{}; } @@ -334,7 +334,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { return ret; }; - const auto& group_lists = GetFusableConsumerGroupList(); + const auto& group_lists = GetFusibleConsumerGroupList(); if (group_lists.empty()) { return false; } @@ -522,7 +522,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { bool GeneralVerticalFuse(const GroupPtr& producer) { VLOG(3) << "GeneralVerticalFuse...!"; using GroupSets = std::vector>; - const auto& GetFusableConsumerOpGroupSets = [&]() -> GroupSets { + const auto& GetFusibleConsumerOpGroupSets = [&]() -> GroupSets { GroupSets tagged_sets; const auto& MarkFusible = [&](const OpGroupPtr& first, const OpGroupPtr& second) { @@ -534,9 +534,9 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { return tagged_sets; }; - auto GetFusableConsumerGroupSet = + auto GetFusibleConsumerGroupSet = [&]() -> std::unordered_set { - const auto& group_sets = GetFusableConsumerOpGroupSets(); + const auto& group_sets = GetFusibleConsumerOpGroupSets(); if (group_sets.empty()) { return {}; } @@ -548,7 +548,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { }; bool update = false; - auto consumer_groups 
= GetFusableConsumerGroupSet(); + auto consumer_groups = GetFusibleConsumerGroupSet(); if (consumer_groups.size()) { SelectConsumerToFuse(producer, &consumer_groups); } @@ -771,7 +771,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { VLOG(3) << "GeneralRecomputeFuse handling producer : " << producer->group_id; using GroupSets = std::set>; - const auto& GetFusableConsumerOpGroupSets = [&]() -> GroupSets { + const auto& GetFusibleConsumerOpGroupSets = [&]() -> GroupSets { GroupSets tagged_sets; const auto& MarkFusible = [&](const OpGroupPtr& first, const OpGroupPtr& second) { @@ -783,9 +783,9 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { return tagged_sets; }; - auto GetFusableConsumerGroupSet = + auto GetFusibleConsumerGroupSet = [&]() -> std::unordered_set { - const auto& group_sets = GetFusableConsumerOpGroupSets(); + const auto& group_sets = GetFusibleConsumerOpGroupSets(); if (group_sets.empty()) { return {}; } @@ -797,7 +797,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { }; bool update = false; - auto consumer_groups = GetFusableConsumerGroupSet(); + auto consumer_groups = GetFusibleConsumerGroupSet(); if (consumer_groups.size() > 0) { CHECK(consumer_groups.size() == producer->mut_consumer_groups()->size()) << "Recompute requires fuse all consumers!"; diff --git a/paddle/cinn/hlir/pass/opfusion.cc b/paddle/cinn/hlir/pass/opfusion.cc index 537b9abb45881..b4e2eec247f21 100644 --- a/paddle/cinn/hlir/pass/opfusion.cc +++ b/paddle/cinn/hlir/pass/opfusion.cc @@ -83,7 +83,7 @@ class DomTree { const std::vector& nodes) { int size = nodes.size(); dom_nodes_.resize(nodes.size()); - // construct postdom tree, reverse topological_order + // construct post dom tree, reverse topological_order for (int i = size - 1; i >= 0; i--) { auto* dom_node = CreateDomNode(nodes[i]); CHECK(dom_node); @@ -160,7 +160,7 @@ class DomTree { parent = dom_node; CHECK(parent); } else { - // if the out_var links to more than one opnode, then we need to find + // if the out_var links to more than one op_node, then we need to find // the LCA parent = LCA(parent, dom_node, pattern); } @@ -170,7 +170,7 @@ class DomTree { VLOG(2) << sink->id() << "'s op pattern is " << op_pattern; if (op_node->attrs.attr_store.count("pre_run") && absl::get(op_node->attrs.attr_store["pre_run"])) { - // not fuse pre_run opnode + // not fuse pre_run op_node op_pattern = framework::kNonFusible; VLOG(3) << op_node->op()->name << " do pre_run and not fuse"; } @@ -264,7 +264,7 @@ class GraphPartition { auto pattern = op_pattern_dict[op_node->op()]; if (op_node->attrs.attr_store.count("pre_run") && absl::get(op_node->attrs.attr_store["pre_run"])) { - // not fuse pre_run opnode + // not fuse pre_run op_node pattern = framework::kNonFusible; VLOG(3) << op_node->op()->name << " do pre_run and not fuse"; } @@ -549,7 +549,7 @@ class GraphPartition { void OpFusionPass(Graph* graph) { auto store_nodes = std::get<0>(graph->topological_order()); int node_size = store_nodes.size(); - // construct postdom tree, reverse topological_order + // construct post dom tree, reverse topological_order DomTree tree; auto& dom_nodes = tree.CreatePostDomTree(store_nodes); // graph partition diff --git a/paddle/cinn/hlir/pass/reduce_split_pass.cc b/paddle/cinn/hlir/pass/reduce_split_pass.cc index 1f8c500cc9be0..899c233866ca5 100644 --- a/paddle/cinn/hlir/pass/reduce_split_pass.cc +++ b/paddle/cinn/hlir/pass/reduce_split_pass.cc @@ -71,7 +71,7 @@ uint32_t NextPowerOf2(uint32_t n) { class ReduceSplitPass { public: - // 
Find the reduce op with nwhc format and large shape, split it into two ops + // Find the reduce op with NWHC format and large shape, split it into two ops static int Apply(framework::Graph* graph) { int MAX_NUM_THREADS = cinn::common::DefaultNVGPUTarget().max_num_threads(); constexpr int MAX_ITER_PER_THREAD = 32; // empirical value diff --git a/paddle/cinn/hlir/pass/single_group_optimize_pass.cc b/paddle/cinn/hlir/pass/single_group_optimize_pass.cc index 816943b38cee0..db67b990cd76e 100644 --- a/paddle/cinn/hlir/pass/single_group_optimize_pass.cc +++ b/paddle/cinn/hlir/pass/single_group_optimize_pass.cc @@ -201,7 +201,7 @@ void SingleGroupOptimizePass::InitNodeToGroups() { CINN_REGISTER_HELPER(SingleGroupOptimizePass) { CINN_REGISTER_PASS(SingleGroupOptimizePass) - .describe("Optimize singel group to improve performance.") + .describe("Optimize single group to improve performance.") .set_change_structure(true) .set_body(cinn::hlir::pass::SingleGroupOptimizePassImpl); From 2ca34a759a255660844914004f2b8b59057ce0fe Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 6 Mar 2024 20:28:45 +0800 Subject: [PATCH 028/114] [PIR] Support wrap_type_interface for AlloctedDenseTensorType AllocatedSelectedRowsType and AllocatedDenseTensorArrayType (#62451) * refine code * fix --- .../pir/dialect/kernel/ir/kernel_type.cc | 12 + .../fluid/pir/dialect/kernel/ir/kernel_type.h | 15 +- .../dialect/op_generator/op_infermeta_gen.py | 39 --- .../dialect/operator/ir/control_flow_op.cc | 15 +- .../dialect/operator/ir/manual_onednn_op.cc | 9 - .../pir/dialect/operator/ir/manual_op.cc | 326 +----------------- .../fluid/pir/dialect/operator/ir/op_type.cc | 41 +++ .../fluid/pir/dialect/operator/ir/op_type.h | 16 + .../fluid/pir/dialect/operator/utils/utils.cc | 59 +--- paddle/pir/src/core/builtin_type.cc | 2 + 10 files changed, 93 insertions(+), 441 deletions(-) diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_type.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_type.cc index f293bd5cf9baa..ef3a9a7c0b307 100644 --- a/paddle/fluid/pir/dialect/kernel/ir/kernel_type.cc +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_type.cc @@ -17,6 +17,10 @@ namespace paddle { namespace dialect { +pir::Type AllocatedDenseTensorType::prim_type() { + return storage()->dense_tensor_type_; +} + const phi::Place& AllocatedDenseTensorType::place() const { return storage()->place_; } @@ -41,6 +45,10 @@ size_t AllocatedDenseTensorType::offset() const { return storage()->dense_tensor_type_.offset(); } +pir::Type AllocatedSelectedRowsType::prim_type() { + return storage()->selected_rows_type_; +} + const phi::Place& AllocatedSelectedRowsType::place() const { return storage()->place_; } @@ -65,6 +73,10 @@ size_t AllocatedSelectedRowsType::offset() const { return storage()->selected_rows_type_.offset(); } +pir::Type AllocatedDenseTensorArrayType::prim_type() { + return storage()->dense_tensor_array_type_; +} + const phi::Place& AllocatedDenseTensorArrayType::place() const { return storage()->place_; } diff --git a/paddle/fluid/pir/dialect/kernel/ir/kernel_type.h b/paddle/fluid/pir/dialect/kernel/ir/kernel_type.h index f8595c6ec68df..8bfdf0bae7906 100644 --- a/paddle/fluid/pir/dialect/kernel/ir/kernel_type.h +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_type.h @@ -24,7 +24,8 @@ namespace dialect { class AllocatedDenseTensorType : public pir::Type::TypeBase { + AllocatedDenseTensorTypeStorage, + pir::WrapTypeInterface> { public: using Base::Base; @@ -49,6 +50,8 @@ class AllocatedDenseTensorType 
ctx, place, dense_tensor_type); } + pir::Type prim_type(); + const phi::Place &place() const; pir::Type dtype() const; @@ -65,7 +68,8 @@ class AllocatedDenseTensorType class AllocatedSelectedRowsType : public pir::Type::TypeBase { + AllocatedSelectedRowsTypeStorage, + pir::WrapTypeInterface> { public: using Base::Base; @@ -90,6 +94,8 @@ class AllocatedSelectedRowsType ctx, place, type); } + pir::Type prim_type(); + const phi::Place &place() const; pir::Type dtype() const; @@ -106,7 +112,8 @@ class AllocatedSelectedRowsType class AllocatedDenseTensorArrayType : public pir::Type::TypeBase { + AllocatedDenseTensorArrayTypeStorage, + pir::WrapTypeInterface> { public: using Base::Base; @@ -129,6 +136,8 @@ class AllocatedDenseTensorArrayType ctx, place, type); } + pir::Type prim_type(); + const phi::Place &place() const; const pir::Type &dtype() const; diff --git a/paddle/fluid/pir/dialect/op_generator/op_infermeta_gen.py b/paddle/fluid/pir/dialect/op_generator/op_infermeta_gen.py index 500e36881b3f1..50648daeeec30 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_infermeta_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_infermeta_gen.py @@ -44,15 +44,6 @@ {type} {name}; if ({name}_.type().isa<{type}>()) {{ {name} = {name}_.type().dyn_cast<{type}>(); (void){name}; - }} else if ({name}_.type().isa<{allocated_type}>()) {{ - {allocated_type} allocated_{name} = {name}_.type().dyn_cast<{allocated_type}>(); - {name} = {type}::get(pir::IrContext::Instance(), - allocated_{name}.dtype(), - allocated_{name}.dims(), - allocated_{name}.data_layout(), - allocated_{name}.lod(), - allocated_{name}.offset()); - (void){name}; }} else {{ PADDLE_THROW(phi::errors::Unimplemented("Only support {type} or {allocated_type}")); }} @@ -158,20 +149,11 @@ def GenBuildOutputsPart2( paddle::dialect::IrMetaTensor meta_{name}; paddle::dialect::IrTensor ir_tensor_{name}; - if ({name}_.impl() != nullptr) {{ VLOG(4) << "Builder construction dense_{name}"; {type} {name}; if ({name}_.type().isa<{type}>()) {{ {name} = {name}_.type().dyn_cast<{type}>(); - }} else if ({name}_.type().isa<{allocated_type}>()) {{ - {allocated_type} allocated_{name} = {name}_.type().dyn_cast<{allocated_type}>(); - {name} = {type}::get(pir::IrContext::Instance(), - allocated_{name}.dtype(), - allocated_{name}.dims(), - allocated_{name}.data_layout(), - allocated_{name}.lod(), - allocated_{name}.offset()); }} else {{ PADDLE_THROW(phi::errors::Unimplemented("Only support {type} or {allocated_type}")); }} @@ -195,13 +177,6 @@ def GenBuildOutputsPart2( {name}_type.data_layout(), {name}_type.lod(), {name}_type.offset())); - }} else if({name}[i].isa()){{ - auto {name}_type = {name}[i].dyn_cast(); - vec_ir_tensor_{name}.push_back(paddle::dialect::IrTensor(paddle::dialect::TransToPhiDataType({name}_type.dtype()), - {name}_type.dims(), - {name}_type.data_layout(), - {name}_type.lod(), - {name}_type.offset())); }} else {{ PADDLE_THROW(phi::errors::Unimplemented("Only support DenseTensorType or AllocatedDenseTensorType")); }} @@ -228,13 +203,6 @@ def GenBuildOutputsPart2( {name}_type.data_layout(), {name}_type.lod(), {name}_type.offset())); - }} else if({name}[i].isa()){{ - auto {name}_type = {name}[i].dyn_cast(); - vec_ir_tensor_{name}.push_back(paddle::dialect::IrTensor(paddle::dialect::TransToPhiDataType({name}_type.dtype()), - {name}_type.dims(), - {name}_type.data_layout(), - {name}_type.lod(), - {name}_type.offset())); }} else {{ PADDLE_THROW(phi::errors::Unimplemented("Only support DenseTensorType or AllocatedDenseTensorType")); }} @@ -273,13 +241,6 
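The branches removed above (and the many similar deletions in control_flow_op.cc and the manual-op files later in this patch) all hand-unwrapped `Allocated*` kernel types back into builtin dialect types. With `AllocatedDenseTensorType`, `AllocatedSelectedRowsType` and `AllocatedDenseTensorArrayType` now exposing `prim_type()` through `pir::WrapTypeInterface` (the kernel_type.h/.cc hunks above), an `isa<>`/`dyn_cast<>` against the wrapped builtin type can succeed directly, assuming the `op_type.cc`/`op_type.h` changes listed in this patch add the matching unwrapping `classof`/`dyn_cast_impl`. A hand-written sketch of the effect (not generator output):

```cpp
// Sketch under the assumption above: given some pir::Value `value` whose
// type is an AllocatedDenseTensorType, the plain DenseTensorType branch
// still matches, because isa<>/dyn_cast<> follow prim_type() to the
// wrapped type.
pir::Type t = value.type();
if (t.isa<paddle::dialect::DenseTensorType>()) {
  auto dt = t.dyn_cast<paddle::dialect::DenseTensorType>();
  // dt.dims(), dt.dtype(), ... come straight from the wrapped type, so
  // the generated template no longer needs an Allocated* else-if branch.
  (void)dt;
}
```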
@@ def GenBuildOutputsPart2( {name}_size = 1; }} {name} = std::vector({name}_size, -1); - }} else if ({name}_.type().isa()) {{ - common::DDim {name}_dim = {name}_.type().dyn_cast().dims(); - size_t {name}_size = common::product({name}_dim); - if (common::contain_unknown_dim({name}_dim)) {{ - {name}_size = 1; - }} - {name} = std::vector({name}_size, -1); }} else {{ PADDLE_THROW(phi::errors::Unimplemented("Only support VectorType or DenseTensorType or AllocatedDenseTensorType")); }}\n""" diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc index 60d589773d5bb..e1dc458cb652f 100644 --- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc @@ -575,14 +575,6 @@ void WhileOp::VerifySig() { phi::errors::PreconditionNotMet( "Type validation failed for the 0th input, it should be a " "bool DenseTensorType.")); - } else if (auto cond_type = - operand_type(0).dyn_cast()) { - PADDLE_ENFORCE_EQ( - cond_type.dtype().isa(), - true, - phi::errors::PreconditionNotMet( - "Type validation failed for the 0th input, it should be a " - "bool DenseTensorType.")); } else { PADDLE_THROW(phi::errors::PreconditionNotMet( "Currently, the while op cond input only support bool dense_tensor " @@ -803,8 +795,7 @@ void HasElementsOp::VerifySig() { // Verify outputs: IR_ENFORCE(num_results() == 1u, "The size of outputs must be equal to 1."); - IR_ENFORCE((*this)->result_type(0).isa() || - (*this)->result_type(0).isa(), + IR_ENFORCE((*this)->result_type(0).isa(), "The type of cf.has_elements' output is not correct."); } @@ -874,8 +865,7 @@ void AssertOp::VerifySig() { (*this)->operand(1).type().dyn_cast()) { for (size_t i = 0; i < vec_type.size(); ++i) { IR_ENFORCE(vec_type[i].isa() || - vec_type[i].isa() || - vec_type[i].isa(), + vec_type[i].isa(), "Type validation failed for the 1th input."); } } else { @@ -885,7 +875,6 @@ void AssertOp::VerifySig() { ->operand(1) .type() .isa(), - (*this)->operand(1).type().isa(), "Type validation failed for the 1th input."); } } diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_onednn_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_onednn_op.cc index a66d4d8eb8b51..6ee537d1ee1a7 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_onednn_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_onednn_op.cc @@ -255,15 +255,6 @@ std::vector ExpandOp::InferMeta( paddle::dialect::DenseTensorType x; if (x_.type().isa()) { x = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_x = - x_.type().dyn_cast(); - x = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_x.dtype(), - allocated_x.dims(), - allocated_x.data_layout(), - allocated_x.lod(), - allocated_x.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc index f8e02c5b52d6d..c673ece8fdf46 100644 --- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -166,16 +166,6 @@ std::vector AddNOp::InferMeta( x[i].dyn_cast().data_layout(), x[i].dyn_cast().lod(), x[i].dyn_cast().offset())); - } else if (x[i].isa()) { - vec_dense_x.push_back(paddle::dialect::IrTensor( - TransToPhiDataType( - x[i].dyn_cast() - .dtype()), - x[i].dyn_cast().dims(), - x[i].dyn_cast() - .data_layout(), - 
x[i].dyn_cast().lod(), - x[i].dyn_cast().offset())); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -321,22 +311,6 @@ std::vector AddN_Op::InferMeta( inputs[i].dyn_cast().data_layout(), inputs[i].dyn_cast().lod(), inputs[i].dyn_cast().offset())); - } else if (inputs[i].isa()) { - vec_dense_inputs.push_back(paddle::dialect::IrTensor( - TransToPhiDataType( - inputs[i] - .dyn_cast() - .dtype()), - inputs[i] - .dyn_cast() - .dims(), - inputs[i] - .dyn_cast() - .data_layout(), - inputs[i].dyn_cast().lod(), - inputs[i] - .dyn_cast() - .offset())); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -489,18 +463,6 @@ std::vector AddNArrayOp::InferMeta( .dyn_cast() .data_layout(), {})); - } else if (inputs[i] - .isa()) { - vec_dense_inputs.push_back(paddle::dialect::IrTensor( - TransToPhiDataType( - inputs[i] - .dyn_cast() - .dtype()), - inputs[i].dyn_cast().dims(), - inputs[i] - .dyn_cast() - .data_layout(), - {})); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -732,15 +694,6 @@ std::vector FusedGemmEpilogueOp::InferMeta( paddle::dialect::DenseTensorType x; if (x_.type().isa()) { x = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_x = - x_.type().dyn_cast(); - x = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_x.dtype(), - allocated_x.dims(), - allocated_x.data_layout(), - allocated_x.lod(), - allocated_x.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -750,15 +703,6 @@ std::vector FusedGemmEpilogueOp::InferMeta( paddle::dialect::DenseTensorType y; if (y_.type().isa()) { y = y_.type().dyn_cast(); - } else if (y_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_y = - y_.type().dyn_cast(); - y = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_y.dtype(), - allocated_y.dims(), - allocated_y.data_layout(), - allocated_y.lod(), - allocated_y.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -768,15 +712,6 @@ std::vector FusedGemmEpilogueOp::InferMeta( paddle::dialect::DenseTensorType bias; if (bias_.type().isa()) { bias = bias_.type().dyn_cast(); - } else if (bias_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_bias = - bias_.type().dyn_cast(); - bias = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_bias.dtype(), - allocated_bias.dims(), - allocated_bias.data_layout(), - allocated_bias.lod(), - allocated_bias.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -1006,15 +941,6 @@ std::vector FusedGemmEpilogueGradOp::InferMeta( paddle::dialect::DenseTensorType x; if (x_.type().isa()) { x = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_x = - x_.type().dyn_cast(); - x = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_x.dtype(), - allocated_x.dims(), - allocated_x.data_layout(), - allocated_x.lod(), - allocated_x.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -1024,15 +950,6 @@ std::vector FusedGemmEpilogueGradOp::InferMeta( paddle::dialect::DenseTensorType y; if (y_.type().isa()) { y = 
y_.type().dyn_cast(); - } else if (y_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_y = - y_.type().dyn_cast(); - y = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_y.dtype(), - allocated_y.dims(), - allocated_y.data_layout(), - allocated_y.lod(), - allocated_y.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -1044,18 +961,6 @@ std::vector FusedGemmEpilogueGradOp::InferMeta( if (reserve_space_.type().isa()) { reserve_space = reserve_space_.type().dyn_cast(); - } else if (reserve_space_.type() - .isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_reserve_space = - reserve_space_.type() - .dyn_cast(); - reserve_space = paddle::dialect::DenseTensorType::get( - pir::IrContext::Instance(), - allocated_reserve_space.dtype(), - allocated_reserve_space.dims(), - allocated_reserve_space.data_layout(), - allocated_reserve_space.lod(), - allocated_reserve_space.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -1068,17 +973,6 @@ std::vector FusedGemmEpilogueGradOp::InferMeta( paddle::dialect::DenseTensorType out_grad; if (out_grad_.type().isa()) { out_grad = out_grad_.type().dyn_cast(); - } else if (out_grad_.type() - .isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_out_grad = - out_grad_.type().dyn_cast(); - out_grad = - paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_out_grad.dtype(), - allocated_out_grad.dims(), - allocated_out_grad.data_layout(), - allocated_out_grad.lod(), - allocated_out_grad.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -1579,16 +1473,6 @@ std::vector CreateArrayLikeOp::InferMeta( if (input_.type().isa()) { input_type = input_.type().dyn_cast(); - } else if (input_.type() - .isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - input_.type() - .dyn_cast(); - input_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -1708,14 +1592,6 @@ std::vector ArrayLengthOp::InferMeta( paddle::dialect::DenseTensorArrayType x_type; if (x_.type().isa()) { x_type = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - x_.type().dyn_cast(); - x_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -1875,16 +1751,6 @@ std::vector ArrayReadOp::InferMeta( if (array_.type().isa()) { array_type = array_.type().dyn_cast(); - } else if (array_.type() - .isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - array_.type() - .dyn_cast(); - array_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -2054,16 +1920,6 @@ std::vector ArrayWrite_Op::InferMeta( if (array_.type().isa()) { array_type = array_.type().dyn_cast(); 
- } else if (array_.type() - .isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - array_.type() - .dyn_cast(); - array_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -2081,17 +1937,6 @@ std::vector ArrayWrite_Op::InferMeta( phi::Place place = phi::CPUPlace(); if (x_.type().isa()) { x_type = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_input = - x_.type().dyn_cast(); - place = allocated_input.place(), - x_type = - paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout(), - allocated_input.lod(), - allocated_input.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -2119,20 +1964,19 @@ std::vector ArrayWrite_Op::InferMeta( dense_array.layout()); // update array's dims as x's dims. // TOOD(chenxi67) Do not change if dim is set by custom - if (array_.type().isa()) { - array_.set_type( - paddle::dialect::DenseTensorArrayType::get(pir::IrContext::Instance(), - array_type.dtype(), - x_type.dims(), - array_type.data_layout())); - } else if (array_.type() - .isa()) { + if (array_.type().isa()) { array_.set_type(paddle::dialect::AllocatedDenseTensorArrayType::get( pir::IrContext::Instance(), place, array_type.dtype(), x_type.dims(), array_type.data_layout())); + } else if (array_.type().isa()) { + array_.set_type( + paddle::dialect::DenseTensorArrayType::get(pir::IrContext::Instance(), + array_type.dtype(), + x_type.dims(), + array_type.data_layout())); } argument_outputs.push_back(out_type); @@ -2275,14 +2119,6 @@ std::vector ArrayToTensorOp::InferMeta( paddle::dialect::DenseTensorArrayType x_type; if (x_.type().isa()) { x_type = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - x_.type().dyn_cast(); - x_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -2477,14 +2313,6 @@ std::vector TensorToArrayOp::InferMeta( if (x_.type().isa()) { x = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - x_.type().dyn_cast(); - x = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -2500,17 +2328,6 @@ std::vector TensorToArrayOp::InferMeta( paddle::dialect::DenseTensorType out_grad; if (out_grad_.type().isa()) { out_grad = out_grad_.type().dyn_cast(); - } else if (out_grad_.type() - .isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_input = - out_grad_.type().dyn_cast(); - out_grad = - paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout(), - allocated_input.lod(), - allocated_input.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support 
paddle::dialect::DenseTensorType or " @@ -2651,19 +2468,6 @@ phi::IntArray CalcSliceBoundsFromValue(pir::Value starts_or_ends) { starts_or_ends_list = std::move(phi::IntArray(std::vector(starts_or_ends_size, -1))); starts_or_ends_list.SetFromTensor(true); - } else if (starts_or_ends.type() - .isa()) { - common::DDim starts_or_ends_dim = - starts_or_ends.type() - .dyn_cast() - .dims(); - size_t starts_or_ends_size = common::product(starts_or_ends_dim); - if (common::contain_unknown_dim(starts_or_ends_dim)) { - starts_or_ends_size = 1; - } - starts_or_ends_list = - std::move(phi::IntArray(std::vector(starts_or_ends_size, -1))); - starts_or_ends_list.SetFromTensor(true); } else { PADDLE_THROW( phi::errors::Unimplemented("Only support VectorType or DenseTensorType " @@ -2710,15 +2514,6 @@ std::vector SliceArrayOp::InferMeta( paddle::dialect::DenseTensorArrayType input_type; if (input.type().isa()) { input_type = input.type().dyn_cast(); - } else if (input.type() - .isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - input.type().dyn_cast(); - input_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::AllocatedDenseTensorArrayType or " @@ -2869,15 +2664,6 @@ std::vector SliceArrayDenseOp::InferMeta( paddle::dialect::DenseTensorArrayType input_type; if (input.type().isa()) { input_type = input.type().dyn_cast(); - } else if (input.type() - .isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - input.type().dyn_cast(); - input_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -3016,14 +2802,6 @@ std::vector AssignArrayOp::InferMeta( paddle::dialect::DenseTensorArrayType x_type; if (x_.type().isa()) { x_type = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - x_.type().dyn_cast(); - x_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -3125,14 +2903,6 @@ std::vector AssignArray_Op::InferMeta( paddle::dialect::DenseTensorArrayType x_type; if (x_.type().isa()) { x_type = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - x_.type().dyn_cast(); - x_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -3401,15 +3171,6 @@ std::vector ExpandOp::InferMeta( paddle::dialect::DenseTensorType x; if (x_.type().isa()) { x = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_input = - x_.type().dyn_cast(); - x = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout(), - allocated_input.lod(), - 
allocated_input.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -3457,17 +3218,6 @@ std::vector ExpandOp::InferMeta( } vec_shape = std::vector(shape_size, -2); *is_from_tensor = true; - } else if (shape.type().isa()) { - common::DDim shape_dim = - shape.type() - .dyn_cast() - .dims(); - size_t shape_size = common::product(shape_dim); - if (common::contain_unknown_dim(shape_dim)) { - shape_size = 1; - } - vec_shape = std::vector(shape_size, -2); - *is_from_tensor = true; } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support VectorType or DenseTensorType " @@ -3646,15 +3396,6 @@ std::vector IncrementOp::InferMeta( paddle::dialect::DenseTensorType x; if (x_.type().isa()) { x = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_input = - x_.type().dyn_cast(); - x = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout(), - allocated_input.lod(), - allocated_input.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -3836,15 +3577,6 @@ std::vector Increment_Op::InferMeta( paddle::dialect::DenseTensorType x; if (x_.type().isa()) { x = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_input = - x_.type().dyn_cast(); - x = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout(), - allocated_input.lod(), - allocated_input.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -4001,15 +3733,6 @@ std::vector AssignOut_Op::InferMeta( paddle::dialect::DenseTensorType x; if (x_.type().isa()) { x = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_x = - x_.type().dyn_cast(); - x = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_x.dtype(), - allocated_x.dims(), - allocated_x.data_layout(), - allocated_x.lod(), - allocated_x.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -4090,15 +3813,6 @@ std::vector ShapeBroadcastOp::InferMeta( paddle::dialect::DenseTensorType x; if (x_.type().isa()) { x = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_x = - x_.type().dyn_cast(); - x = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_x.dtype(), - allocated_x.dims(), - allocated_x.data_layout(), - allocated_x.lod(), - allocated_x.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -4108,15 +3822,6 @@ std::vector ShapeBroadcastOp::InferMeta( paddle::dialect::DenseTensorType y; if (y_.type().isa()) { y = y_.type().dyn_cast(); - } else if (y_.type().isa()) { - paddle::dialect::AllocatedDenseTensorType allocated_x = - y_.type().dyn_cast(); - y = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), - allocated_x.dtype(), - allocated_x.dims(), - allocated_x.data_layout(), - allocated_x.lod(), - allocated_x.offset()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorType or " @@ -4316,14 +4021,6 @@ std::vector MemcpyD2hMultiIoOp::InferMeta( 
paddle::dialect::DenseTensorArrayType x_type; if (x_.type().isa()) { x_type = x_.type().dyn_cast(); - } else if (x_.type().isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - x_.type().dyn_cast(); - x_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " @@ -4472,15 +4169,6 @@ std::vector ArrayPopOp::InferMeta( paddle::dialect::DenseTensorArrayType input_type; if (input.type().isa()) { input_type = input.type().dyn_cast(); - } else if (input.type() - .isa()) { - paddle::dialect::AllocatedDenseTensorArrayType allocated_input = - input.type().dyn_cast(); - input_type = paddle::dialect::DenseTensorArrayType::get( - pir::IrContext::Instance(), - allocated_input.dtype(), - allocated_input.dims(), - allocated_input.data_layout()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support paddle::dialect::DenseTensorArrayType or " diff --git a/paddle/fluid/pir/dialect/operator/ir/op_type.cc b/paddle/fluid/pir/dialect/operator/ir/op_type.cc index 2765352759969..3e3902a86376e 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_type.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_type.cc @@ -28,6 +28,26 @@ const phi::LoD& SelectedRowsType::lod() const { return storage()->lod_; } const size_t& SelectedRowsType::offset() const { return storage()->offset_; } +bool SelectedRowsType::classof(Type type) { + if (type) { + if (type.type_id() == type_id()) return true; + if (auto wrap_type = type.dyn_cast()) { + return classof(wrap_type.prim_type()); + } + } + return false; +} + +SelectedRowsType SelectedRowsType::dyn_cast_impl(Type type) { + if (type) { + if (type.type_id() == type_id()) return SelectedRowsType(type.storage()); + if (auto wrap_type = type.dyn_cast()) { + return dyn_cast_impl(wrap_type.prim_type()); + } + } + return nullptr; +} + const pir::Type& DenseTensorArrayType::dtype() const { return storage()->dtype_; } @@ -37,6 +57,27 @@ const phi::DataLayout& DenseTensorArrayType::data_layout() const { return storage()->layout_; } +bool DenseTensorArrayType::classof(Type type) { + if (type) { + if (type.type_id() == type_id()) return true; + if (auto wrap_type = type.dyn_cast()) { + return classof(wrap_type.prim_type()); + } + } + return false; +} + +DenseTensorArrayType DenseTensorArrayType::dyn_cast_impl(Type type) { + if (type) { + if (type.type_id() == type_id()) + return DenseTensorArrayType(type.storage()); + if (auto wrap_type = type.dyn_cast()) { + return dyn_cast_impl(wrap_type.prim_type()); + } + } + return nullptr; +} + } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/ir/op_type.h b/paddle/fluid/pir/dialect/operator/ir/op_type.h index b06940d5b34d7..4cc68b6d9fd7a 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_type.h +++ b/paddle/fluid/pir/dialect/operator/ir/op_type.h @@ -42,6 +42,14 @@ class TEST_API SelectedRowsType const phi::LoD &lod() const; const size_t &offset() const; + + /// + /// \brief Implementation of 'classof' that compares the type id of + /// the provided value with the concrete type id. 
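+ /// A type that wraps another type (WrapTypeInterface) is unwrapped to + /// its prim type first, so the check sees through wrapper types.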
+ /// + static bool classof(Type type); + + static SelectedRowsType dyn_cast_impl(Type type); }; class DenseTensorArrayType @@ -56,6 +64,14 @@ class DenseTensorArrayType const phi::DDim &dims() const; const phi::DataLayout &data_layout() const; + + /// + /// \brief Implementation of 'classof' that compares the type id of + /// the provided value with the concrete type id. + /// + static bool classof(Type type); + + static DenseTensorArrayType dyn_cast_impl(Type type); }; } // namespace dialect diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc index cca683ed0bbef..9a9df1fed3cdd 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.cc +++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc @@ -330,16 +330,6 @@ phi::DataType GetValueDataType(const pir::Type& type) { } else { return phi::DataType::UNDEFINED; } - } else if (type.isa()) { - return dialect::TransToPhiDataType( - type.dyn_cast().dtype()); - } else if (type.isa()) { - return dialect::TransToPhiDataType( - type.dyn_cast().dtype()); - } else if (type.isa()) { - return dialect::TransToPhiDataType( - type.dyn_cast() - .dtype()); } else { PADDLE_THROW( phi::errors::InvalidType("Currently, we can only get dtype for " @@ -351,43 +341,7 @@ phi::DataType GetValueDataType(const pir::Value& value) { if (value.impl() == nullptr) { return phi::DataType::UNDEFINED; } - if (value.type().isa()) { - return dialect::TransToPhiDataType( - value.type().dyn_cast().dtype()); - } else if (value.type().isa()) { - return dialect::TransToPhiDataType( - value.type().dyn_cast().dtype()); - } else if (value.type().isa()) { - return dialect::TransToPhiDataType( - value.type().dyn_cast().dtype()); - } else if (value.type().isa()) { - auto vec_value = value.type().dyn_cast(); - if (vec_value.size() > 0) { - return GetValueDataType(vec_value[0]); - } else { - return phi::DataType::UNDEFINED; - } - } else if (value.type().isa()) { - return dialect::TransToPhiDataType( - value.type() - .dyn_cast() - .dtype()); - } else if (value.type().isa()) { - return dialect::TransToPhiDataType( - value.type() - .dyn_cast() - .dtype()); - } else if (value.type() - .isa()) { - return dialect::TransToPhiDataType( - value.type() - .dyn_cast() - .dtype()); - } else { - PADDLE_THROW( - phi::errors::InvalidType("Currently, we can only get dtype for " - "DenseTensorType and SelectedRowsType.")); - } + return GetValueDataType(value.type()); } void DoValueCheck(const pir::Value& value, @@ -519,17 +473,6 @@ std::vector ParseValueShape(const pir::Value& shape, } vec_shape = std::vector(shape_size, -1); *is_from_tensor = true; - } else if (shape.type().isa()) { - common::DDim shape_dim = - shape.type() - .dyn_cast() - .dims(); - size_t shape_size = common::product(shape_dim); - if (common::contain_unknown_dim(shape_dim)) { - shape_size = 1; - } - vec_shape = std::vector(shape_size, -1); - *is_from_tensor = true; } else { PADDLE_THROW( phi::errors::Unimplemented("Only support VectorType or DenseTensorType " diff --git a/paddle/pir/src/core/builtin_type.cc b/paddle/pir/src/core/builtin_type.cc index 96b83c8f6fe58..6a1f5f9b26fd6 100644 --- a/paddle/pir/src/core/builtin_type.cc +++ b/paddle/pir/src/core/builtin_type.cc @@ -30,6 +30,7 @@ const DenseTensorType::LoD& DenseTensorType::lod() const { } size_t DenseTensorType::offset() const { return storage()->offset_; } + bool DenseTensorType::classof(Type type) { if (type) { if (type.type_id() == type_id()) return true; @@ -39,6 +40,7 @@ bool DenseTensorType::classof(Type type) { } 
return false; } + DenseTensorType DenseTensorType::dyn_cast_impl(Type type) { if (type) { if (type.type_id() == type_id()) return DenseTensorType(type.storage()); From ed3486b0b9159cf5d448af4ac6c254b1d0e905d3 Mon Sep 17 00:00:00 2001 From: GGBond8488 <33050871+GGBond8488@users.noreply.github.com> Date: Wed, 6 Mar 2024 21:05:38 +0800 Subject: [PATCH 029/114] Support n-order differential testing (#62074) * init * fix some typro * opt * add full jacbian test mode * remove dyn numerical jvp * msg fix * msg fix * fix unused * add TODO * fix * fix * rm ano --- test/legacy_test/autograd_checker_helper.py | 358 ++++++++++++++++++++ 1 file changed, 358 insertions(+) create mode 100644 test/legacy_test/autograd_checker_helper.py diff --git a/test/legacy_test/autograd_checker_helper.py b/test/legacy_test/autograd_checker_helper.py new file mode 100644 index 0000000000000..e51f40beb1976 --- /dev/null +++ b/test/legacy_test/autograd_checker_helper.py @@ -0,0 +1,358 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Sequence +from logging import warning + +import numpy as np + +import paddle +from paddle import base +from paddle.autograd.backward_utils import ValueDict +from paddle.base import core +from paddle.base.backward import _as_list + +__all__ = ['check_vjp'] + +EPS = 1e-4 + +default_gradient_tolerance = { + np.float16: 1e-2, + np.float32: 2e-3, + np.float64: 1e-5, + np.complex64: 1e-3, + np.complex128: 1e-5, +} + + +def _product(t): + return int(np.prod(t)) + + +def make_jacobian(x, y_size, np_dtype): + if isinstance(x, (base.framework.Variable, paddle.pir.Value)): + return np.zeros((_product(x.shape), y_size), dtype=np_dtype) + elif isinstance(x, Sequence): + jacobians = list( + filter( + lambda t: t is not None, + (make_jacobian(item, y_size, np_dtype) for item in x), + ) + ) + return jacobians + else: + pass + + +def compute_numerical_jacobian(program, inputs, outputs, feeds, eps): + paddle.enable_static() + numerical = [] + for input in inputs: + numerical.append( + _compute_numerical_jacobian(program, input, outputs, feeds, eps) + ) + paddle.disable_static() + return numerical + + +def _compute_numerical_jacobian(program, x, y, feeds, eps): + if not isinstance(x, paddle.pir.Value): + raise TypeError('x is not Value') + + # To compute the jacobian, treat x and y as one-dimensional vectors. 
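+ # Each jacobian[j] has shape (x_size, y_size): entry [i, k] holds an + # estimate of d y[j][k] / d x[i] from the central difference + # (y_j(x + eps * e_i) - y_j(x - eps * e_i)) / (2 * eps).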
+ y = _as_list(y) + exe = paddle.static.Executor() + + def run(): + res = exe.run(program, feeds, fetch_list=[y]) + y_res = res[: len(y)] + return [yi.flatten() for yi in y_res] + + x_name = x.get_defining_op().attrs()['name'] + x_shape = x.shape + x_size = _product(x_shape) + np_type = dtype_to_np_dtype(x.dtype) + np_t = np.array(feeds[x_name]).astype(np_type) + np_t = np_t.flatten() + jacobian = [make_jacobian(x, _product(yi.shape), np_type) for yi in y] + + for i in range(x_size): + orig = np_t[i] + x_pos = orig + eps + np_t[i] = x_pos + np_f = np_t.reshape(x_shape) + feeds[x_name] = np_f + y_pos = run() + + x_neg = orig - eps + np_t[i] = x_neg + np_f = np_t.reshape(x_shape) + feeds[x_name] = np_f + y_neg = run() + + np_t[i] = orig + for j in range(len(y)): + ret = (y_pos[j] - y_neg[j]) / eps / 2.0 + jacobian[j][i, :] = ret + + return jacobian + + +def compute_analytical_jacobian( + program, inputs, outputs, last_grads_in, feeds, fetch_list +): + paddle.enable_static() + analytical = [] + for i in range(len(outputs)): + name = last_grads_in[i].name + feeds.update( + { + name: np.zeros( + outputs[i].shape, dtype=dtype_to_np_dtype(outputs[i].dtype) + ) + } + ) + for i in range(len(outputs)): + analytical.append( + _compute_analytical_jacobian( + program, + inputs, + i, + outputs, + fetch_list, + feeds, + last_grads_in[i].name, + ) + ) + paddle.disable_static() + return analytical + + +def _compute_analytical_jacobian(program, x, i, y, grads, feeds, name): + if not isinstance(x, (list, paddle.pir.Value)): + raise TypeError('x is not Value or list of Value') + np_type = dtype_to_np_dtype(y[i].dtype) + exe = paddle.static.Executor() + y_size = _product(y[i].shape) + x = _as_list(x) + jacobian = make_jacobian(x, y_size, np_type) + + # get the name in feeds of dyi + np_t = np.array(feeds[name]).astype(np_type) + shape = np_t.shape + np_t = np_t.flatten() + for i in range(y_size): + np_t[i] = 1 + np_f = np_t.reshape(shape) + feeds[name] = np_f + res = exe.run(program, feed=feeds, fetch_list=[grads]) + dx_res = res[: len(grads)] + for j in range(len(grads)): + if dx_res[j] is not None: + jacobian[j][:, i] = dx_res[j].flatten() + else: + jacobian[j][:, i] = np.zeros( + grads[j].shape, dtype=np_type + ).flatten() + + np_t[i] = 0 + np_f = np_t.reshape(shape) + feeds[name] = np_f + + return jacobian + + +def dtype_to_np_dtype(dtype): + if dtype == core.VarDesc.VarType.FP32 or dtype == core.DataType.FLOAT32: + return np.float32 + elif dtype == core.VarDesc.VarType.FP64 or dtype == core.DataType.FLOAT64: + return np.float64 + elif dtype == core.VarDesc.VarType.FP16 or dtype == core.DataType.FLOAT16: + return np.float16 + else: + raise ValueError("Not supported data type " + str(dtype)) + + +def get_eager_vjp(func, inputs, cotangents=None, order=1): + for x in inputs: + x.stop_gradient = False + outputs = func(inputs) + return _get_eager_vjp(inputs, outputs, cotangents, order) + + +def _get_eager_vjp(inputs, outputs, tangents, order): + if order > 1: + create_graph = True + else: + create_graph = False + + d_inputs = paddle.grad( + outputs=outputs, + inputs=inputs, + grad_outputs=tangents, + create_graph=create_graph, + allow_unused=True, + ) + d_inputs = [d_input for d_input in d_inputs if d_input is not None] + if order > 1: + ddys = [] + for d_input in d_inputs: + d_input.stop_gradient = False + ddy = paddle.ones(shape=d_input.shape, dtype=d_input.dtype) + ddy.stop_gradient = False + ddys.append(ddy) + return _get_eager_vjp(inputs, d_inputs, ddys, order - 1) + + return d_inputs + + +def 
get_static_vjp(program, feeds, fetch): + paddle.enable_static() + exe = paddle.static.Executor() + res = exe.run(program, feed=feeds, fetch_list=[fetch]) + paddle.disable_static() + return res + + +def get_static_vjp_program(func, inputs, order): + cotangents = [] + paddle.enable_static() + input_vars = [] + feeds = {} + for idx, input in enumerate(inputs): + np_type = dtype_to_np_dtype(input.dtype) + input_var = paddle.static.data( + 'input_' + str(idx), input.shape, dtype=np_type + ) + input_vars.append(input_var) + feeds.update({'input_' + str(idx): input.numpy()}) + outputs = func(input_vars) + outputs = _as_list(outputs) + # TODO(GGBond8488): Need to be fixed when paddle uses pir by default. + program, (keys, values) = paddle.base.libpaddle.pir.clone_program( + paddle.static.default_main_program() + ) + op_map = ValueDict() + for key, value in zip(keys, values): + op_map[key] = value + pir_inputs = [] + for input in input_vars: + pir_inputs.append(op_map[input]) + pir_outputs = [] + grads_in_init = [] + with paddle.static.program_guard(program): + # Make sure the grad_in_var is in the program + for idx, output in enumerate(outputs): + pir_outputs.append(op_map[output]) + np_type = dtype_to_np_dtype(input.dtype) + grad_in_var = paddle.static.data( + 'grad_in_' + str(idx), output.shape, dtype=np_type + ) + grads_in_init.append(grad_in_var) + grad_in_np = np.random.random(size=output.shape).astype(np_type) + feeds.update({'grad_in_' + str(idx): grad_in_np}) + cotangents.append(grad_in_np) + feeds, pre_outputs, d_inputs, last_grads_in = _get_static_vjp_program( + pir_inputs, pir_outputs, feeds, grads_in_init, order + ) + if not d_inputs: + warning(f"{func.__name__} {order}s grad will return None") + paddle.disable_static() + return program, pir_inputs, d_inputs, pre_outputs, feeds, cotangents + + +def _get_static_vjp_program(inputs, outputs, feeds, grads_in, order): + def _require_grads(vars): + for var in vars: + var.stop_gradient = False + var.persistable = True + + inputs = _as_list(inputs) + outputs = _as_list(outputs) + _require_grads(inputs) + _require_grads(outputs) + _require_grads(grads_in) + d_inputs = paddle.base.gradients(outputs, inputs, grads_in) + d_inputs = [d_input for d_input in d_inputs if d_input is not None] + _require_grads(d_inputs) + + if order > 1: + ddys = [] + for idx, d_input in enumerate(d_inputs): + np_type = dtype_to_np_dtype(d_input.dtype) + ddy = paddle.static.data( + name=f'dy_{idx}_{order}', + shape=d_input.shape, + dtype=np_type, + ) + ones = np.ones(d_input.shape, dtype=np_type) + feeds.update({f'dy_{idx}_{order}': ones}) + ddys.append(ddy) + _require_grads(ddys) + return _get_static_vjp_program(inputs, d_inputs, feeds, ddys, order - 1) + return feeds, outputs, d_inputs, grads_in + + +def check_vjp(func, args, order=2, atol=None, rtol=None, eps=EPS): + args = _as_list(args) + np_type = dtype_to_np_dtype(args[0].dtype) + atol = atol if atol else default_gradient_tolerance[np_type] + rtol = rtol if rtol else default_gradient_tolerance[np_type] + + ( + program, + inputs, + fetch_list, + outputs, + feeds, + cotangents, + ) = get_static_vjp_program(func, args, order) + numeric_jacobian = compute_numerical_jacobian( + program, inputs, outputs, feeds, eps + ) + cotangents = list(map(paddle.to_tensor, cotangents)) + eager_vjps = get_eager_vjp(func, args, cotangents, order) + static_vjps_np = get_static_vjp(program, feeds, fetch_list) + eager_vjps_np = [] + for eager_vjp in eager_vjps: + eager_vjps_np.append(eager_vjp.numpy()) + inputs_length = 
len(numeric_jacobian) + numeric_vjps = [] + for x_idx in range(inputs_length): + jacobians = _as_list(numeric_jacobian[x_idx]) + dx_idx = None + v = np.ones(static_vjps_np[x_idx].shape).astype(np_type).flatten() + for y_idx in range(len(jacobians)): + if dx_idx is None: + dx_idx = np.dot(v, jacobians[y_idx]) + else: + dx_idx += np.dot(v, jacobians[y_idx]) + numeric_vjps.append(dx_idx) + eager_vjps_np = list(map(np.ndarray.flatten, eager_vjps_np)) + static_vjps_np = list(map(np.ndarray.flatten, static_vjps_np)) + + np.testing.assert_allclose( + numeric_vjps, + eager_vjps_np, + atol=atol, + rtol=rtol, + err_msg="eager vjps is not close to numeric vjps", + ) + np.testing.assert_allclose( + numeric_vjps, + static_vjps_np, + atol=atol, + rtol=rtol, + err_msg="static vjps is not close to numeric vjps", + ) From 0c43da7467418348e5f880a35a358dff618f1322 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Wed, 6 Mar 2024 21:14:46 +0800 Subject: [PATCH 030/114] [DistDialect] Add PIR Pybind Utils for Auto-Parallel (#62297) * [PIR] add distributed dialect. * update utils for distdensetensor * param network * update api * add unitest * bugfix * update unitest * adopt for new api name * update cmake * adapt for gshape construct * adapt for gshape construct * new func --------- Co-authored-by: winter-wang <1030748926@qq.com> --- paddle/fluid/pir/dialect/CMakeLists.txt | 6 +- paddle/fluid/pybind/pir.cc | 107 +++++++- .../paddle/distributed/auto_parallel/api.py | 45 +++- python/paddle/pir/__init__.py | 1 + python/paddle/pir_utils.py | 2 + .../test_tensor_attr_consistency.py | 4 +- test/ir/pir/test_ir_dist_attr.py | 245 ++++++++++++++++++ 7 files changed, 391 insertions(+), 19 deletions(-) create mode 100644 test/ir/pir/test_ir_dist_attr.py diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt index d5050b49ac582..b0606b59b28f8 100644 --- a/paddle/fluid/pir/dialect/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -258,9 +258,9 @@ endif() file(GLOB_RECURSE dist_dialect_srcs "${CMAKE_CURRENT_SOURCE_DIR}/distributed/ir/*.cc") -if(WITH_DISTRIBUTE) - set(op_dialect_srcs ${op_dialect_srcs} ${dist_dialect_srcs}) -endif() +# if(WITH_DISTRIBUTE) FIXME in next PR +set(op_dialect_srcs ${op_dialect_srcs} ${dist_dialect_srcs}) +# endif() set(op_dialect_deps phi common pir type_info string_helper) cc_library( diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index d28b274348201..b76e23fe53eef 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -23,11 +23,15 @@ #include #include +#include "paddle/common/flags.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/ir_adaptor/translator/program_translator.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h" #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" @@ -62,6 +66,7 @@ #include "paddle/fluid/pybind/control_flow_api.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/pybind_variant_caster.h" +#include "paddle/phi/core/distributed/auto_parallel/process_mesh.h" #include 
"paddle/phi/core/enforce.h" #include "paddle/pir/include/core/attribute.h" #include "paddle/pir/include/core/block.h" @@ -78,8 +83,6 @@ #include "paddle/pir/include/pass/pass.h" #include "paddle/pir/include/pass/pass_manager.h" #include "paddle/pir/include/pass/pass_registry.h" - -#include "paddle/common/flags.h" #include "pybind11/stl.h" #ifdef PADDLE_WITH_CINN @@ -96,6 +99,7 @@ namespace py = pybind11; using paddle::dialect::ApiBuilder; using paddle::dialect::DenseTensorArrayType; using paddle::dialect::DenseTensorType; +using paddle::dialect::DistDenseTensorType; using paddle::dialect::IfOp; using paddle::dialect::PyLayerOp; using paddle::dialect::SelectedRowsType; @@ -631,10 +635,13 @@ phi::DataType GetValueDtype(Value value) { } else if (value.type().isa()) { return paddle::dialect::TransToPhiDataType( value.type().dyn_cast().dtype()); + } else if (value.type().isa()) { + return paddle::dialect::TransToPhiDataType( + value.type().dyn_cast().dtype()); } else { PADDLE_THROW(phi::errors::InvalidArgument( "Currently, we can only get phi::DataType from DenseTensorType and " - "SelectedRowsType.")); + "SelectedRowsType, DistDenseTensorType.")); } } @@ -646,9 +653,11 @@ const phi::DDim &GetValueDims(Value value) { return value.type().dyn_cast().dims(); } else if (value.type().isa()) { return value.type().dyn_cast().dims(); + } else if (value.type().isa()) { + return value.type().dyn_cast().global_ddim(); } else { PADDLE_THROW(phi::errors::InvalidArgument( - "Currently, we can only get shape for dense " + "Currently, we can only get shape for dense and distdense" "tensor.")); } } @@ -749,6 +758,20 @@ void BindValue(py::module *m) { PADDLE_THROW(phi::errors::InvalidArgument( "can't set shape when building static graph")); }) + .def_property( + "_local_shape", + [](Value self) { + if (!self.type().isa()) { + PADDLE_THROW(phi::errors::InvalidArgument( + "_local_shape is only for distdense tensor.")); + } + return phi::vectorize( + self.type().dyn_cast().local_ddim()); + }, + [](Value self, const std::vector &shape) { + PADDLE_THROW(phi::errors::InvalidArgument( + "can't set _local_shape when building static graph")); + }) .def_property( "dtype", [](Value self) { return GetValueDtype(self); }, @@ -808,6 +831,8 @@ void BindValue(py::module *m) { [](Value self) { return self.type().isa(); }) .def("is_dense_tensor_array_type", [](Value self) { return self.type().isa(); }) + .def("is_dist_dense_tensor_type", + [](Value self) { return self.type().isa(); }) .def("replace_all_uses_with", [](Value self, Value value) { self.ReplaceAllUsesWith(value); }) .def("set_type", [](Value self, Type type) { self.set_type(type); }) @@ -829,7 +854,52 @@ void BindValue(py::module *m) { BoolAttribute::get(pir::IrContext::Instance(), true)); return out; }) - .def("__repr__", &Value2String); + .def("__repr__", &Value2String) + .def_property( + "dims_mapping", + [](Value self) { + if (!self.type().isa()) { + PADDLE_THROW(phi::errors::InvalidArgument( + "dims_mapping is only for distdense tensor.")); + } + return self.type().dyn_cast().dims_mapping(); + }, + [](Value self, const std::vector &shape) { + PADDLE_THROW(phi::errors::InvalidArgument( + "set dims_mapping when building static graph is un-supported " + "now.")); + }) + .def_property( + "partial_dims", + [](Value self) { + if (!self.type().isa()) { + PADDLE_THROW(phi::errors::InvalidArgument( + "partial_dims is only for distdense tensor.")); + } + return self.type().dyn_cast().partial_dims(); + }, + [](Value self, const std::vector &shape) { + 
PADDLE_THROW(phi::errors::InvalidArgument( + "set partial_dims when building static graph is un-supported " + "now.")); + }) + .def_property( + "process_mesh", + [](Value self) { + if (!self.type().isa()) { + PADDLE_THROW(phi::errors::InvalidArgument( + "process_mesh is only for distdense tensor.")); + } + return self.type() + .dyn_cast() + .process_mesh_attr() + .process_mesh(); + }, + [](Value self, const std::vector &shape) { + PADDLE_THROW(phi::errors::InvalidArgument( + "set process_mesh when building static graph is un-supported " + "now.")); + }); } void BindOpOperand(py::module *m) { @@ -1329,6 +1399,27 @@ pir::Type CreateSelectedRowsTypeByDenseTensor(pir::Type dense_tensor_type) { } } +pir::Type CreateDistDenseTensorTypeByDenseTensor( + const pir::Type &gdense_tensor_type, + const std::vector &lshape, + const phi::distributed::ProcessMesh &mesh, + const std::vector &dims_mapping) { + if (gdense_tensor_type.isa()) { + DenseTensorType type = gdense_tensor_type.dyn_cast(); + paddle::flat_hash_map partial_status; + paddle::dialect::TensorDistAttribute tensor_dist_attr = + paddle::dialect::TensorDistAttribute::get( + pir::IrContext::Instance(), mesh, dims_mapping, partial_status); + return DistDenseTensorType::get(pir::IrContext::Instance(), + type, + tensor_dist_attr, + phi::make_ddim(lshape)); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Currently, input is not a dense tensor type are not supported.")); + } +} + void ResetShadowOutputName(pir::Operation *op, const std::string &name) { pir::IrContext *ctx = pir::IrContext::Instance(); if (op->isa()) { @@ -1396,8 +1487,14 @@ void BindUtils(pybind11::module *m) { pir::IrContext::Instance() ->GetOrRegisterDialect(); }); + m->def("register_dist_dialect", []() { + pir::IrContext::Instance() + ->GetOrRegisterDialect(); + }); m->def("create_selected_rows_type_by_dense_tensor", CreateSelectedRowsTypeByDenseTensor); + m->def("create_dist_dense_tensor_type_by_dense_tensor", + CreateDistDenseTensorTypeByDenseTensor); m->def( "translate_to_pir", [](const ::paddle::framework::ProgramDesc &legacy_program) { diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py index 45eb7c8c2491c..ada2958cdc57c 100644 --- a/python/paddle/distributed/auto_parallel/api.py +++ b/python/paddle/distributed/auto_parallel/api.py @@ -20,7 +20,7 @@ import paddle import paddle.distributed as dist -from paddle import _C_ops, nn +from paddle import _C_ops, nn, pir from paddle.amp.grad_scaler import OptimizerState from paddle.base import unique_name from paddle.base.dygraph.base import switch_to_static_graph @@ -255,16 +255,41 @@ def dtensor_from_local(local_tensor, mesh, placements): local_dim_size = global_dims[shard_dim] global_dims[shard_dim] = local_dim_size * mesh.shape[idx] - place = paddle.framework._current_expected_place() - place = paddle.framework._get_paddle_place(place) + if paddle.in_dynamic_mode(): + place = paddle.framework._current_expected_place() + place = paddle.framework._get_paddle_place(place) + + return paddle.Tensor( + local_tensor, + dims=global_dims, + process_mesh=mesh, + placements=placements, + place=place, + ) - return paddle.Tensor( - local_tensor, - dims=global_dims, - process_mesh=mesh, - placements=placements, - place=place, - ) + # TODO Adopt Mix2Dist Pass to allow the program could be executed actually. + elif paddle.framework.in_pir_mode(): + assert isinstance( + local_tensor, (type(None), pir.Value) + ), "input tensor is not pir value." 
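+ # In pir mode the value keeps its dense storage; only its type is + # replaced in place by a DistDenseTensorType that records the mesh, + # the dims_mapping derived from the placements, and the local shape.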
+ assert ( + local_tensor.is_dense_tensor_type() + ), "dtensor_from_local() only supports dense tensor types right now." + sharding_specs = get_shard_spec(mesh, placements, local_tensor.ndim) + dims_mapping = convert_to_dims_mapping(sharding_specs, mesh) + local_shape = local_tensor.shape + global_tensor_type = paddle.pir.create_shaped_type( + local_tensor.type(), global_dims + ) + dist_dense_tensor_type = paddle.base.libpaddle.pir.create_dist_dense_tensor_type_by_dense_tensor( + global_tensor_type, local_shape, mesh, dims_mapping + ) + local_tensor.set_type(dist_dense_tensor_type) + return local_tensor + else: + raise RuntimeError( + "dtensor_from_local() is only supported in dynamic or pir mode." + ) def dtensor_from_fn(fn, mesh, placements, *args, **kwargs): diff --git a/python/paddle/pir/__init__.py b/python/paddle/pir/__init__.py index f55c5205f8c0c..7191088d80750 100644 --- a/python/paddle/pir/__init__.py +++ b/python/paddle/pir/__init__.py @@ -26,6 +26,7 @@ get_current_insertion_point, is_fake_value, parse_program, + register_dist_dialect, register_paddle_dialect, reset_insertion_point_to_end, reset_insertion_point_to_start, diff --git a/python/paddle/pir_utils.py b/python/paddle/pir_utils.py index 601b4d27688fa..e52837889d71f 100644 --- a/python/paddle/pir_utils.py +++ b/python/paddle/pir_utils.py @@ -64,6 +64,8 @@ def _switch_to_pir(self): ]: paddle.framework.set_flags({"FLAGS_enable_pir_in_executor": True}) paddle.pir.register_paddle_dialect() + # TODO find a better place to init the registration of dist dialect. + paddle.pir.register_dist_dialect() paddle.base.Program = paddle.pir.Program paddle.base.program_guard = paddle.pir.core.program_guard diff --git a/test/dygraph_to_static/test_tensor_attr_consistency.py b/test/dygraph_to_static/test_tensor_attr_consistency.py index dfb58c3f2a081..530448de75653 100644 --- a/test/dygraph_to_static/test_tensor_attr_consistency.py +++ b/test/dygraph_to_static/test_tensor_attr_consistency.py @@ -66,7 +66,6 @@ 'offset', 'pin_memory', 'placements', - 'process_mesh', 'reconstruct_from_', 'register_hook', 'retain_grads', @@ -105,6 +104,9 @@ 'set_shape', 'set_type', 'use_empty', + 'is_dist_dense_tensor_type', + 'dims_mapping', # TODO Unify as Placement + 'partial_dims', # TODO Unify as Placement ] ) diff --git a/test/ir/pir/test_ir_dist_attr.py b/test/ir/pir/test_ir_dist_attr.py new file mode 100644 index 0000000000000..a4107199308bf --- /dev/null +++ b/test/ir/pir/test_ir_dist_attr.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
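+ +# These tests build pir programs with replicated, column-parallel and +# row-parallel layouts, then check the dist attributes exposed on values: +# shape, _local_shape, dims_mapping, process_mesh and partial_dims.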
+ +import unittest + +import paddle +import paddle.distributed as dist +from paddle.distributed.auto_parallel.api import dtensor_from_local + +paddle.enable_static() + +BATCH_SIZE = 2 +SEQ_LEN = 4 +HIDDEN_SIZE = 8 +MP_SIZE = 2 + + +class TestBuildFakeProgram(unittest.TestCase): + def test_build_api(self): + with paddle.pir_utils.IrGuard(): + main_program = paddle.base.Program() + with paddle.base.program_guard(main_program): + mesh = dist.ProcessMesh([0, 1], dim_names=['mp']) + input = paddle.static.data( + name='input', shape=[BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE] + ) + w0 = paddle.pir.core.create_parameter( + dtype="float32", + shape=[HIDDEN_SIZE, HIDDEN_SIZE], + name="w0", + initializer=paddle.nn.initializer.Uniform(), + ) + + # dense tensor could not access dist tensor attribute + with self.assertRaises(ValueError): + tmp = input._local_shape + with self.assertRaises(ValueError): + tmp = input.dims_mapping + with self.assertRaises(ValueError): + tmp = w0.process_mesh + with self.assertRaises(ValueError): + tmp = w0.partial_dims + + dist_input = dtensor_from_local(input, mesh, [dist.Replicate()]) + dist_w0 = dtensor_from_local(w0, mesh, [dist.Replicate()]) + + def test_build_replicated_program(self): + with paddle.pir_utils.IrGuard(): + main_program = paddle.base.Program() + with paddle.base.program_guard(main_program): + mesh = dist.ProcessMesh([0, 1], dim_names=['mp']) + input = paddle.static.data( + name='input', shape=[BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE] + ) + w0 = paddle.pir.core.create_parameter( + dtype="float32", + shape=[HIDDEN_SIZE, HIDDEN_SIZE], + name="w0", + initializer=paddle.nn.initializer.Uniform(), + ) + self.assertTrue(input.is_dense_tensor_type()) + self.assertTrue(w0.is_dense_tensor_type()) + + dist_input = dtensor_from_local(input, mesh, [dist.Replicate()]) + dist_w0 = dtensor_from_local(w0, mesh, [dist.Replicate()]) + # dist_out = paddle.matmul(dist_input, dist_w0) + self.assertTrue(dist_input.is_dist_dense_tensor_type()) + self.assertTrue(dist_w0.is_dist_dense_tensor_type()) + + # check detail + self.assertTrue(dist_input.shape == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]) + self.assertTrue(w0.shape == [HIDDEN_SIZE, HIDDEN_SIZE]) + self.assertTrue(dist_input.shape == dist_input._local_shape) + self.assertTrue(w0.shape == w0._local_shape) + self.assertTrue(dist_input.dims_mapping == [-1, -1, -1]) + self.assertTrue( + isinstance( + dist_input.process_mesh, paddle.base.libpaddle.ProcessMesh + ) + ) + self.assertTrue(dist_input.process_mesh.shape == [2]) + self.assertTrue(dist_input.process_mesh.process_ids == [0, 1]) + self.assertTrue(len(dist_input.partial_dims) == 0) + self.assertTrue(dist_w0.dims_mapping == [-1, -1]) + self.assertTrue( + isinstance(dist_w0.process_mesh, paddle.base.libpaddle.ProcessMesh) + ) + self.assertTrue(dist_w0.process_mesh.shape == [2]) + self.assertTrue(dist_w0.process_mesh.process_ids == [0, 1]) + self.assertTrue(len(dist_w0.partial_dims) == 0) + + # matmul out + # self.assertTrue(dist_out.shape == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]) + # self.assertTrue(dist_out._local_shape == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]) + # self.assertTrue(dist_out.dims_mapping == [-1, -1]) + # self.assertTrue(isinstance(dist_out.process_mesh, paddle.base.libpaddle.ProcessMesh)) + # self.assertTrue(dist_out.process_mesh.shape == [2]) + # self.assertTrue(dist_out.process_mesh.process_ids == [0, 1]) + # self.assertTrue(len(dist_out.partial_dims) == 0) + + def test_build_col_parallel_program(self): + with paddle.pir_utils.IrGuard(): + main_program = paddle.base.Program() + with 
paddle.base.program_guard(main_program): + mesh = dist.ProcessMesh([0, 1], dim_names=['mp']) + input = paddle.static.data( + name='input', shape=[BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE] + ) + w0 = paddle.pir.core.create_parameter( + dtype="float32", + shape=[HIDDEN_SIZE, HIDDEN_SIZE // MP_SIZE], + name="w0", + initializer=paddle.nn.initializer.Uniform(), + ) + self.assertTrue(input.is_dense_tensor_type()) + self.assertTrue(w0.is_dense_tensor_type()) + + dist_input = dtensor_from_local(input, mesh, [dist.Replicate()]) + dist_w0 = dtensor_from_local(w0, mesh, [dist.Shard(1)]) + self.assertTrue(dist_input.is_dist_dense_tensor_type()) + self.assertTrue(dist_w0.is_dist_dense_tensor_type()) + + # check detail + self.assertTrue(dist_input.shape == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]) + self.assertTrue(w0.shape == [HIDDEN_SIZE, HIDDEN_SIZE]) + self.assertTrue(dist_input.shape == dist_input._local_shape) + self.assertTrue( + w0._local_shape == [HIDDEN_SIZE, HIDDEN_SIZE // MP_SIZE] + ) + self.assertTrue(dist_input.dims_mapping == [-1, -1, -1]) + self.assertTrue(dist_w0.dims_mapping == [-1, 0]) + # matmul out + # self.assertTrue(dist_out.shape == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]) + # self.assertTrue(dist_out._local_shape == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE // MP_SIZE]) + # self.assertTrue(dist_out.dims_mapping == [-1, -1, 0]) + # self.assertTrue(isinstance(dist_out.process_mesh, paddle.base.libpaddle.ProcessMesh)) + # self.assertTrue(dist_out.process_mesh.shape == [2]) + # self.assertTrue(dist_out.process_mesh.process_ids == [0, 1]) + # self.assertTrue(len(dist_out.partial_dims) == 0) + + def test_build_row_parallel_program(self): + with paddle.pir_utils.IrGuard(): + main_program = paddle.base.Program() + with paddle.base.program_guard(main_program): + mesh = dist.ProcessMesh([0, 1], dim_names=['mp']) + input = paddle.static.data( + name='input', + shape=[BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE // MP_SIZE], + ) + w0 = paddle.pir.core.create_parameter( + dtype="float32", + shape=[HIDDEN_SIZE // MP_SIZE, HIDDEN_SIZE], + name="w0", + initializer=paddle.nn.initializer.Uniform(), + ) + self.assertTrue(input.is_dense_tensor_type()) + self.assertTrue(w0.is_dense_tensor_type()) + + dist_input = dtensor_from_local(input, mesh, [dist.Shard(2)]) + dist_w0 = dtensor_from_local(w0, mesh, [dist.Shard(0)]) + self.assertTrue(dist_input.is_dist_dense_tensor_type()) + self.assertTrue(dist_w0.is_dist_dense_tensor_type()) + + # check detail + self.assertTrue(dist_input.shape == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]) + self.assertTrue(w0.shape == [HIDDEN_SIZE, HIDDEN_SIZE]) + self.assertTrue( + dist_input._local_shape + == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE // MP_SIZE] + ) + self.assertTrue( + w0._local_shape == [HIDDEN_SIZE // MP_SIZE, HIDDEN_SIZE] + ) + self.assertTrue(dist_input.dims_mapping == [-1, -1, 0]) + self.assertTrue(dist_w0.dims_mapping == [0, -1]) + # matmul out + # self.assertTrue(dist_out.shape == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]) + # self.assertTrue(dist_out._local_shape == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]) + # self.assertTrue(dist_out.dims_mapping == [-1, -1, -1]) + # self.assertTrue(isinstance(dist_out.process_mesh, paddle.base.libpaddle.ProcessMesh)) + # self.assertTrue(dist_out.process_mesh.shape == [2]) + # self.assertTrue(dist_out.process_mesh.process_ids == [0, 1]) + # self.assertTrue(len(dist_out.partial_dims) == set(0)) + + # def test_build_with_shard_tensor(self): + # with paddle.pir_utils.IrGuard(): + # main_program = paddle.base.Program() + # with paddle.base.program_guard(main_program): + # mesh = 
dist.ProcessMesh([0, 1], dim_names=['mp']) + # input = paddle.static.data( + # name='input', + # shape=[BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE], + # ) + # w0 = paddle.pir.core.create_parameter( + # dtype="float32", + # shape=[HIDDEN_SIZE, HIDDEN_SIZE], + # name="w0", + # initializer=paddle.nn.initializer.Uniform(), + # ) + # w1 = paddle.pir.core.create_parameter( + # dtype="float32", + # shape=[HIDDEN_SIZE, HIDDEN_SIZE], + # name="w0", + # initializer=paddle.nn.initializer.Uniform(), + # ) + # self.assertTrue(input.is_dense_tensor_type()) + # self.assertTrue(w0.is_dense_tensor_type()) + + # dist_input = dist.shard_tensor(input, mesh, [dist.Replicate()]) + # dist_w0 = dist.shard_tensor(w0, mesh, [dist.Shard(0)]) + # dist_w1 = dist.shard_tensor(w1, mesh, [dist.Shard(1)]) + # self.assertTrue(dist_input.is_dist_dense_tensor_type()) + # self.assertTrue(dist_w0.is_dist_dense_tensor_type()) + + # # check global shape + # self.assertTrue(dist_input.shape == [BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE]) + # self.assertTrue(dist_w0.shape == [HIDDEN_SIZE, HIDDEN_SIZE]) + # self.assertTrue(dist_w1.shape == [HIDDEN_SIZE, HIDDEN_SIZE]) + # # check local shape + # self.assertTrue( + # dist_input._local_shape == dist_input.shape + # ) # replicated, local = global + # self.assertTrue( + # dist_w0._local_shape == [HIDDEN_SIZE // MP_SIZE, HIDDEN_SIZE] + # ) # sharded, local != global, sharded by mesh size + # self.assertTrue( + # dist_w1._local_shape == [HIDDEN_SIZE, HIDDEN_SIZE // MP_SIZE] + # ) # sharded, local != global, sharded by mesh size + # TODO check Dtype, layout same as densetensor + # TODO check dims_mapping & mesh as user annotated + + +if __name__ == "__main__": + unittest.main() From 1208cd3345113b21821accef9d31acd636b0f74a Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Wed, 6 Mar 2024 21:30:25 +0800 Subject: [PATCH 031/114] [PIR] Filter out attribute `op_callstack` when print program (#62469) --- paddle/pir/src/core/ir_printer.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/pir/src/core/ir_printer.cc b/paddle/pir/src/core/ir_printer.cc index de75d6d2fc603..e2bc7757f9de4 100644 --- a/paddle/pir/src/core/ir_printer.cc +++ b/paddle/pir/src/core/ir_printer.cc @@ -279,6 +279,10 @@ void IrPrinter::PrintAttributeMap(Operation* op) { AttributeMap attributes = op->attributes(); std::map> order_attributes( attributes.begin(), attributes.end()); + + // Filter out the callstack attribute + order_attributes.erase("op_callstack"); + os << " {"; pir::detail::PrintInterleave( From b684e1ae7324cd1ac0c207ce711b690299039465 Mon Sep 17 00:00:00 2001 From: Shaopeng Ling Date: Thu, 7 Mar 2024 09:32:23 +0800 Subject: [PATCH 032/114] [HACKATHON 6th][CMake Optimization] use CMAKE_CXX_COMPILER_ID instead CMAKE_COMPILER_IS_XXX etc (#62473) --- cmake/external/eigen.cmake | 20 ++++++-------------- cmake/external/gloo.cmake | 28 ++++++++++------------------ cmake/simd.cmake | 4 +--- 3 files changed, 17 insertions(+), 35 deletions(-) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 8638d4bdc84b5..eeff1cccc570c 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -39,7 +39,7 @@ elseif(LINUX) endif() endif() -if(CMAKE_COMPILER_IS_GNUCC) +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorRandom.h.patch tensor_random_header) # See: [Why calling some `git` commands before `patch`?] @@ -47,19 +47,11 @@ if(CMAKE_COMPILER_IS_GNUCC) git checkout -- . 
&& git checkout ${EIGEN_TAG} && patch -Nd ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor < ${tensor_random_header}) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpfullversion -dumpversion - OUTPUT_VARIABLE GCC_VERSION) - string(REGEX MATCHALL "[0-9]+" GCC_VERSION_COMPONENTS ${GCC_VERSION}) - list(GET GCC_VERSION_COMPONENTS 0 GCC_MAJOR) - list(GET GCC_VERSION_COMPONENTS 1 GCC_MINOR) - set(GCC_VERSION "${GCC_MAJOR}.${GCC_MINOR}") - if(GCC_VERSION GREATER_EQUAL 12.0) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Complex.h.patch - complex_header) - set(EIGEN_PATCH_COMMAND - ${EIGEN_PATCH_COMMAND} && patch -Nd - ${SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < ${complex_header}) - endif() + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Complex.h.patch + complex_header) + set(EIGEN_PATCH_COMMAND + ${EIGEN_PATCH_COMMAND} && patch -Nd + ${SOURCE_DIR}/Eigen/src/Core/arch/SSE/ < ${complex_header}) endif() set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index 529f72b662e3e..04bc95ec41acf 100755 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -42,24 +42,16 @@ if(WITH_GPU) endif() endif() -if(CMAKE_COMPILER_IS_GNUCC) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpfullversion -dumpversion - OUTPUT_VARIABLE GCC_VERSION) - string(REGEX MATCHALL "[0-9]+" GCC_VERSION_COMPONENTS ${GCC_VERSION}) - list(GET GCC_VERSION_COMPONENTS 0 GCC_MAJOR) - list(GET GCC_VERSION_COMPONENTS 1 GCC_MINOR) - set(GCC_VERSION "${GCC_MAJOR}.${GCC_MINOR}") - if(GCC_VERSION GREATER_EQUAL "12.0") - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/device.cc.patch - native_dst) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/types.h.patch - types_header) - # See: [Why calling some `git` commands before `patch`?] - set(GLOO_PATCH_COMMAND - git checkout -- . && git checkout ${GLOO_TAG} && patch -Nd - ${SOURCE_DIR}/gloo/transport/tcp < ${native_dst} && patch -Nd - ${SOURCE_DIR}/gloo/ < ${types_header}) - endif() +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/device.cc.patch + native_dst) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/types.h.patch + types_header) + # See: [Why calling some `git` commands before `patch`?] + set(GLOO_PATCH_COMMAND + git checkout -- . 
&& git checkout ${GLOO_TAG} && patch -Nd + ${SOURCE_DIR}/gloo/transport/tcp < ${native_dst} && patch -Nd + ${SOURCE_DIR}/gloo/ < ${types_header}) endif() file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/linux.cc.patch diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 3d730657062a0..af32edafe030d 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -4,9 +4,7 @@ include(CheckCXXSourceRuns) include(CheckCXXSourceCompiles) -if(CMAKE_COMPILER_IS_GNUCC - OR CMAKE_COMPILER_IS_GNUCXX - OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") set(MMX_FLAG "-mmmx") set(SSE2_FLAG "-msse2") set(SSE3_FLAG "-msse3") From 56a024d8369ea1ef9154a2a5b0a956b2c4665695 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Thu, 7 Mar 2024 09:58:27 +0800 Subject: [PATCH 033/114] prohibit the use of IR_ENFORCE (#62445) * fix,test=document_fix * fix,test=document_fix * fix,test=document_fix * fix,test=document_fix * fix * fix,test=document_fix * fix,test=document_fix * fix,test=document_fix * fix,test=document_fix --- tools/check_file_diff_approvals.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 138492cbac579..a0a77ea2a11ce 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -343,12 +343,14 @@ if [ "${HAS_MODIFIED_FRAMEWORK_EXECUTOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; t check_approval 1 From00 zhangbo9674 fi + HAS_MODIFIED_DRR_INCLUDE_DIR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/pir/drr/include" || true` if [ "${HAS_MODIFIED_DRR_INCLUDE_DIR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must have one RD (yuanlehome, zyfncg) approval for file changes in paddle/fluid/pir/drr/include.\n" check_approval 1 yuanlehome zyfncg fi + HAS_MODIFIED_PIR_INCLUDE_DIR=`git diff --name-only upstream/$BRANCH | grep "paddle/pir/include" || true` if [ "${HAS_MODIFIED_PIR_INCLUDE_DIR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must have one RD (yuanlehome, winter-wang, zhangbo9674) approval for file changes in paddle/pir/include.\n" @@ -391,6 +393,14 @@ if [ "${HAS_MODIFIED_STATIC_BUILD}" != "" ] && [ "${GIT_PR_ID}" != ""]; then check_approval 1 From00 zhiqiu fi + +HAS_MODIFIED_ENFORCE_SYNTAX=`git diff upstream/$BRANCH | grep -E "IR_ENFORCE|CHECK_EQ|CHECK_NE|CHECK_LT|CHECK_LE|CHECK_GE|CHECK_GT|LOG\(FATAL\)" || true` +if [ "${HAS_MODIFIED_ENFORCE_SYNTAX}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + echo_line="You must have one RD (rismeup1 or winter-wang) approval for using 'IR_ENFORCE, CHECK_EQ, CHECK_NE, CHECK_LT, CHECK_LE, CHECK_GE, CHECK_GT, LOG(FATAL)', it is recommended to use PADDLE_ENFORCE as a replacement,see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\n" + check_approval 1 risemeup1 winter-wang +fi + + HAS_MODIFIED_TARGET_FOR_AUTO_PARALLEL_CI=`git diff --name-only upstream/$BRANCH | grep "tools/auto_parallel/target_path_lists.sh" || true` if [ "${HAS_MODIFIED_TARGET_FOR_AUTO_PARALLEL_CI}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must have one RD (zhiqiu(Recommend) or chenwhql) approval for file changes in tools/auto_parallel/target_path_lists.sh.\n" From 600bdd579106ab8a97d26d313c5ac2869ab62df1 Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Thu, 7 Mar 2024 10:15:20 +0800 Subject: [PATCH 034/114] [SOT][3.12] Fix that `frame` in eval custom code was not 
released in `tstate` - step 2 (#62470) --- paddle/fluid/pybind/cpython_internals.c | 8 ++++++-- paddle/fluid/pybind/cpython_internals.h | 1 + paddle/fluid/pybind/eval_frame.c | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/cpython_internals.c b/paddle/fluid/pybind/cpython_internals.c index 0e5329d6f1287..af7ede116e4b2 100644 --- a/paddle/fluid/pybind/cpython_internals.c +++ b/paddle/fluid/pybind/cpython_internals.c @@ -109,7 +109,7 @@ static void Internal_clear_thread_frame(PyThreadState *tstate, tstate->datastack_top); tstate->c_recursion_remaining--; assert(frame->frame_obj == NULL || frame->frame_obj->f_frame == frame); - Internal_PyFrame_Clear(frame); // see _PyFrame_ClearExceptCode + Internal_PyFrame_ClearExceptCode(frame); Py_DECREF(frame->f_code); tstate->c_recursion_remaining++; Internal_PyThreadState_PopFrame(tstate, frame); @@ -125,7 +125,7 @@ static void Internal_clear_gen_frame(PyThreadState *tstate, gen->gi_exc_state.previous_item = NULL; tstate->c_recursion_remaining--; assert(frame->frame_obj == NULL || frame->frame_obj->f_frame == frame); - Internal_PyFrame_Clear(frame); // see _PyFrame_ClearExceptCode + Internal_PyFrame_ClearExceptCode(frame); tstate->c_recursion_remaining++; frame->previous = NULL; } @@ -584,7 +584,11 @@ static void Internal_take_ownership(PyFrameObject *f, } // Call on 3.11 _PyFrame_Clear is called on 3.12+ _PyFrame_ClearExceptCode +#if PY_VERSION_HEX >= 0x030c0000 +void Internal_PyFrame_ClearExceptCode(_PyInterpreterFrame *frame) { +#else void Internal_PyFrame_Clear(_PyInterpreterFrame *frame) { +#endif /* It is the responsibility of the owning generator/coroutine * to have cleared the enclosing generator, if any. */ assert(frame->owner != FRAME_OWNED_BY_GENERATOR || diff --git a/paddle/fluid/pybind/cpython_internals.h b/paddle/fluid/pybind/cpython_internals.h index 941279b88f870..fe8330312dc9e 100644 --- a/paddle/fluid/pybind/cpython_internals.h +++ b/paddle/fluid/pybind/cpython_internals.h @@ -43,6 +43,7 @@ void Internal_PyEvalFrameClearAndPop(PyThreadState *tstate, _PyInterpreterFrame *frame); _PyInterpreterFrame *Internal_PyThreadState_PushFrame(PyThreadState *tstate, size_t size); +void Internal_PyFrame_ClearExceptCode(_PyInterpreterFrame *frame); #endif #endif diff --git a/paddle/fluid/pybind/eval_frame.c b/paddle/fluid/pybind/eval_frame.c index 3e5b50211cdec..aa5a4c0022fcc 100644 --- a/paddle/fluid/pybind/eval_frame.c +++ b/paddle/fluid/pybind/eval_frame.c @@ -366,6 +366,9 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate, PyObject *result = PyObject_CallObject(callback, args); Py_DECREF(args); if (result == NULL) { +#if PY_VERSION_HEX >= 0x030C0000 + Internal_PyEvalFrameClearAndPop(tstate, frame); +#endif return NULL; } code = PyObject_GetAttrString(result, "code"); From 13c0bd3cdafa2808c2ed422e3b48774a2fb738bd Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 7 Mar 2024 10:18:59 +0800 Subject: [PATCH 035/114] [PIR+CINN]Add SimplifyDimExpr for +-*/ min max broadcast (#62449) * [PIR+CINN]Add SimplifyDimExpr for +-*/ min max broadcast * fix ut * fix ut * fix UT * fix ut --- paddle/pir/src/dialect/shape/utils/dim_expr.cc | 13 +++++++++---- .../pir/src/dialect/shape/utils/dim_expr_builder.cc | 7 ++++--- test/cpp/pir/shape_dialect/symbol_dim_expr_test.cc | 8 ++++---- .../cinn/symbolic/test_unary_op_infer_sym_shape.py | 4 ++-- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/paddle/pir/src/dialect/shape/utils/dim_expr.cc b/paddle/pir/src/dialect/shape/utils/dim_expr.cc index 
618cb6914553c..9be0e894fe015 100644 --- a/paddle/pir/src/dialect/shape/utils/dim_expr.cc +++ b/paddle/pir/src/dialect/shape/utils/dim_expr.cc @@ -14,6 +14,7 @@ #include "paddle/pir/include/dialect/shape/utils/dim_expr.h" #include "paddle/pir/include/core/utils.h" +#include "paddle/pir/include/dialect/shape/utils/dim_expr_simplify.h" namespace symbol { @@ -21,7 +22,8 @@ DimExpr DimExpr::operator+(const DimExpr& other) const { if (this->isa() && other.isa()) { return this->dyn_cast() + other.dyn_cast(); } - return Add{List{*this, other}}; + DimExpr add_expr = Add{List{*this, other}}; + return SimplifyDimExpr(add_expr); } DimExpr DimExpr::operator-(const DimExpr& other) const { @@ -29,14 +31,16 @@ DimExpr DimExpr::operator-(const DimExpr& other) const { return this->dyn_cast() - other.dyn_cast(); } const DimExpr& neg = Negative(other); - return Add{List{*this, neg}}; + DimExpr sub_expr = Add{List{*this, neg}}; + return SimplifyDimExpr(sub_expr); } DimExpr DimExpr::operator*(const DimExpr& other) const { if (this->isa() && other.isa()) { return this->dyn_cast() * other.dyn_cast(); } - return Mul{List{*this, other}}; + DimExpr mul_expr = Mul{List{*this, other}}; + return SimplifyDimExpr(mul_expr); } DimExpr DimExpr::operator/(const DimExpr& other) const { @@ -48,7 +52,8 @@ DimExpr DimExpr::operator/(const DimExpr& other) const { } } const DimExpr& reciprocal = Reciprocal(other); - return Mul{List{*this, reciprocal}}; + DimExpr div_expr = Mul{List{*this, reciprocal}}; + return SimplifyDimExpr(div_expr); } namespace { diff --git a/paddle/pir/src/dialect/shape/utils/dim_expr_builder.cc b/paddle/pir/src/dialect/shape/utils/dim_expr_builder.cc index cb49cdbf326fd..3278a9eb2681b 100644 --- a/paddle/pir/src/dialect/shape/utils/dim_expr_builder.cc +++ b/paddle/pir/src/dialect/shape/utils/dim_expr_builder.cc @@ -14,6 +14,7 @@ #include "paddle/pir/include/dialect/shape/utils/dim_expr_builder.h" #include "paddle/common/enforce.h" +#include "paddle/pir/include/dialect/shape/utils/dim_expr_simplify.h" namespace symbol { @@ -44,15 +45,15 @@ DimExpr DimExprBuilder::Div(const DimExpr& lhs, const DimExpr& rhs) { } DimExpr DimExprBuilder::Max(const DimExpr& lhs, const DimExpr& rhs) { - return MaxDimExpr{List{lhs, rhs}}; + return SimplifyDimExpr(MaxDimExpr{List{lhs, rhs}}); } DimExpr DimExprBuilder::Min(const DimExpr& lhs, const DimExpr& rhs) { - return MinDimExpr{List{lhs, rhs}}; + return SimplifyDimExpr(MinDimExpr{List{lhs, rhs}}); } DimExpr DimExprBuilder::Broadcast(const DimExpr& lhs, const DimExpr& rhs) { - return BroadcastDimExpr{List{lhs, rhs}}; + return SimplifyDimExpr(BroadcastDimExpr{List{lhs, rhs}}); } std::vector DimExprBuilder::ConstShape( diff --git a/test/cpp/pir/shape_dialect/symbol_dim_expr_test.cc b/test/cpp/pir/shape_dialect/symbol_dim_expr_test.cc index a8665f73cff8a..5bfc8b5393fc6 100644 --- a/test/cpp/pir/shape_dialect/symbol_dim_expr_test.cc +++ b/test/cpp/pir/shape_dialect/symbol_dim_expr_test.cc @@ -114,13 +114,13 @@ TEST(DimExpr, Equal) { DimExpr sym1 = DimExpr("S1"); DimExpr constant1 = DimExpr(1); ASSERT_EQ(sym0 + sym1, sym0 + sym1); - ASSERT_NE(sym0 + sym1, sym1 + sym0); + ASSERT_EQ(sym0 + sym1, sym1 + sym0); ASSERT_EQ(sym0 + constant1, DimExpr("S0") + constant1); ASSERT_EQ(sym0 - sym1, sym0 - sym1); ASSERT_NE(sym0 - sym1, sym1 - sym0); ASSERT_EQ(sym0 - constant1, DimExpr("S0") - constant1); ASSERT_EQ(sym0 * sym1, sym0 * sym1); - ASSERT_NE(sym0 * sym1, sym1 * sym0); + ASSERT_EQ(sym0 * sym1, sym1 * sym0); ASSERT_EQ(sym0 * constant1, DimExpr("S0") * constant1); ASSERT_EQ(sym0 / sym1, 
sym0 / sym1); ASSERT_NE(sym0 / sym1, sym1 / sym0); @@ -134,7 +134,7 @@ TEST(DimExpr, Equal) { ASSERT_EQ(builder.Min(sym0, constant1), builder.Min(DimExpr("S0"), constant1)); ASSERT_EQ(builder.Broadcast(sym0, sym1), builder.Broadcast(sym0, sym1)); - ASSERT_NE(builder.Broadcast(sym0, sym1), builder.Broadcast(sym1, sym0)); + ASSERT_EQ(builder.Broadcast(sym0, sym1), builder.Broadcast(sym1, sym0)); ASSERT_EQ(builder.Broadcast(sym0, constant1), builder.Broadcast(DimExpr("S0"), constant1)); } @@ -158,7 +158,7 @@ TEST(DimExpr, Hash) { DimExpr sym1 = DimExpr("S1"); ASSERT_EQ((std::hash()(sym0 + sym1)), (std::hash()(sym0 + sym1))); - ASSERT_NE((std::hash()(sym0 + sym1)), + ASSERT_EQ((std::hash()(sym0 + sym1)), (std::hash()(sym1 + sym0))); ASSERT_NE((std::hash()(sym0 + sym1)), (std::hash()(sym0 - sym1))); diff --git a/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py b/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py index be6741661295a..4f666b64f7bc3 100644 --- a/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py +++ b/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py @@ -175,7 +175,7 @@ def prepare_data(self): self.cases = [np.random.rand(4, 5, 6)] self.expected = [ [ - 'shape[Mul(Mul(Mul(1, S0), S1), S2)], data[NULL]', + 'shape[Mul(S0, S1, S2)], data[NULL]', 'shape[S0, S1, S2], data[NULL]', ] ] @@ -229,7 +229,7 @@ def prepare_data(self): self.cases = [np.random.rand(4, 5, 6)] self.expected = [ [ - 'shape[Mul(Mul(Mul(Mul(1, S0), S1), S2), 1 / (20)), 4, 5], data[NULL]', + 'shape[Mul(S0, S1, S2, 1 / (20)), 4, 5], data[NULL]', 'shape[S0, S1, 12], data[NULL]', ] ] From 03bf7c4f891f194be4a49d9b23cbcaf73df1d8d9 Mon Sep 17 00:00:00 2001 From: Frank Lin Date: Thu, 7 Mar 2024 10:27:32 +0800 Subject: [PATCH 036/114] disable cuda malloc async when CUDA < 11.2 (#62264) --- paddle/fluid/platform/device/gpu/gpu_info.cc | 21 +++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 211f937faa75c..068243b61fae0 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -256,6 +256,7 @@ class RecordedGpuMallocHelper { * would be clear. */ gpuError_t MallocAsync(void **ptr, size_t size, gpuStream_t stream) { +#if defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) LockGuardPtr lock(mtx_); if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { return gpuErrorOutOfMemory; @@ -298,6 +299,10 @@ class RecordedGpuMallocHelper { // return cudaErrorMemoryAllocation directly here. return gpuErrorOutOfMemory; } +#else + PADDLE_THROW(phi::errors::Unavailable( + "MallocAsync is not supported in this version of CUDA.")); +#endif } /** @@ -338,6 +343,7 @@ class RecordedGpuMallocHelper { } void FreeAsync(void *ptr, size_t size, gpuStream_t stream) { +#if defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. 
This happens only if the @@ -379,6 +385,11 @@ class RecordedGpuMallocHelper { "testing, should not use for release.")); return nullptr; #endif + +#else + PADDLE_THROW(phi::errors::Unavailable( + "FreeAsync is not supported in this version of CUDA.")); +#endif } bool GetMemInfo(size_t *avail, @@ -445,18 +456,22 @@ class RecordedGpuMallocHelper { const int dev_id_; const uint64_t limit_size_; std::atomic cur_size_{0}; + +#if defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) cudaMemPool_t memPool_; + static std::once_flag set_cudamempoolattr_once_flag_; +#endif mutable std::unique_ptr mtx_; - static std::once_flag once_flag_; - static std::once_flag set_cudamempoolattr_once_flag_; - std::set gpu_ptrs; // just for testing }; // NOLINT std::once_flag RecordedGpuMallocHelper::once_flag_; + +#if defined(PADDLE_WITH_CUDA) && (CUDA_VERSION >= 11020) std::once_flag RecordedGpuMallocHelper::set_cudamempoolattr_once_flag_; +#endif gpuError_t RecordedGpuMalloc(void **ptr, size_t size, From 2c34d763d36dbe62b1640a119eee591ab9aff02a Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Thu, 7 Mar 2024 10:30:17 +0800 Subject: [PATCH 037/114] Adjust the search path for libnccl.so (#62492) * adpate libnccl.so in pdc * adpate libnccl.so in pdc --- paddle/phi/backends/dynload/dynamic_loader.cc | 2 +- python/paddle/__init__.py | 6 +++++- python/setup.py.in | 4 +++- setup.py | 4 ++++ 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 101f156e1f488..9399cc6ab61ff 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -587,7 +587,7 @@ void* GetNCCLDsoHandle() { #else #ifdef WITH_PIP_CUDA_LIBRARIES return GetDsoHandleFromSearchPath( - FLAGS_nccl_dir, "libnccl.so.2", true, {}, warning_msg); + FLAGS_nccl_dir, "libnccl.so;libnccl.so.2", true, {}, warning_msg); #else return GetDsoHandleFromSearchPath( FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ed05ddeaf8ca6..7da75b5d6d6d4 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -565,7 +565,11 @@ import os import platform - if platform.system() == 'Linux' and platform.machine() == 'x86_64': + if ( + platform.system() == 'Linux' + and platform.machine() == 'x86_64' + and paddle.version.with_pip_cuda_libraries == 'ON' + ): package_dir = os.path.dirname(os.path.abspath(__file__)) cublas_lib_path = package_dir + "/.." 
+ "/nvidia/cublas/lib" set_flags({"FLAGS_cublas_dir": cublas_lib_path}) diff --git a/python/setup.py.in b/python/setup.py.in index 5c2f941a65c80..b0bb259384967 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -135,6 +135,7 @@ is_tagged = %(is_tagged)s commit = '%(commit)s' with_mkl = '%(with_mkl)s' cinn_version = '%(cinn)s' +with_pip_cuda_libraries = '%(with_pip_cuda_libraries)s' __all__ = ['cuda', 'cudnn', 'nccl', 'show', 'xpu', 'xpu_xccl', 'xpu_xhpc'] @@ -357,7 +358,8 @@ def cinn(): 'commit': commit, 'is_tagged': is_tagged(), 'with_mkl': '@WITH_MKL@', - 'cinn': get_cinn_version()}) + 'cinn': get_cinn_version(), + 'with_pip_cuda_libraries': '@WITH_PIP_CUDA_LIBRARIES@'}) write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version/__init__.py') diff --git a/setup.py b/setup.py index 5550a3ee66f4f..309ebee69dde1 100644 --- a/setup.py +++ b/setup.py @@ -458,6 +458,7 @@ def write_version_py(filename='paddle/version/__init__.py'): commit = '%(commit)s' with_mkl = '%(with_mkl)s' cinn_version = '%(cinn)s' +with_pip_cuda_libraries = '%(with_pip_cuda_libraries)s' __all__ = ['cuda', 'cudnn', 'nccl', 'show', 'xpu', 'xpu_xccl', 'xpu_xhpc'] @@ -682,6 +683,9 @@ def cinn(): 'is_tagged': is_tagged(), 'with_mkl': env_dict.get("WITH_MKL"), 'cinn': get_cinn_version(), + 'with_pip_cuda_libraries': env_dict.get( + "with_pip_cuda_libraries" + ), } ) From c448d2898ebbf8f342fcb381edd6430aa130d39f Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Thu, 7 Mar 2024 10:33:32 +0800 Subject: [PATCH 038/114] [PIR][DynamicShape] Add nullary_infer_sym and binary nullary_infer_sym (#62383) * add nullary_infer_sym * add infer --- .../infer_symbolic_shape/cinn_op_infer_sym.cc | 5 +- .../infer_symbolic_shape/cinn_op_infer_sym.h | 1 - .../infer_sym_element_wise_binary.h | 1 - .../infer_symbolic_shape.h | 1 + .../infer_symbolic_shape/nullary_infer_sym.cc | 74 ++++++++ .../infer_symbolic_shape/nullary_infer_sym.h | 22 +++ .../paddle_op_infer_sym.cc | 79 +++------ .../paddle_op_infer_sym.h | 9 - .../same_operands_and_result.cc | 8 + .../same_operands_and_result.h | 3 +- .../infer_symbolic_shape/unary_infer_sym.cc | 115 ++++++++++++ .../infer_symbolic_shape/unary_infer_sym.h | 5 +- .../fluid/pir/dialect/operator/utils/utils.h | 4 - .../dialect/shape/utils/shape_analysis.h | 4 + .../test_binary_op_infer_sym_shape.py | 112 ++++++++++++ .../test_nullary_op_infer_sym_shape.py | 156 ++++++++++++++++ .../symbolic/test_unary_op_infer_sym_shape.py | 166 ++++++++++++++++++ 17 files changed, 692 insertions(+), 73 deletions(-) create mode 100644 paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.cc create mode 100644 paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.h create mode 100644 test/ir/pir/cinn/symbolic/test_binary_op_infer_sym_shape.py create mode 100644 test/ir/pir/cinn/symbolic/test_nullary_op_infer_sym_shape.py diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc index d52270e5b3b66..d5da282de676b 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc @@ -76,7 +76,7 @@ bool ConcatOpInferSymbolicShape( out_dims[axis] = out_dims[axis] + operand_shape_or_data.shape()[axis]; } - for (size_t i = 1; i < rank; ++i) { + for (size_t i = 0; i < 
rank; ++i) { if (i == static_cast(axis)) continue; paddle::dialect::details::BuildCstrEqForTensorListAlongAxis( shape_analysis, input_values, i); @@ -85,6 +85,9 @@ bool ConcatOpInferSymbolicShape( return out_dims; }; + VLOG(3) << "constraints size:" + << shape_analysis->CreateDimExprBuilder().constraints().size(); + symbol::ShapeOrDataDimExprs shape_data{ symbol::TensorShapeOrDataDimExprs(GetOutDimExprs())}; diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h index dc2794ac6f90b..b3cc2232a1f91 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" namespace cinn::dialect { diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h index e392023aa0c33..65fa20c8e63e7 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" namespace paddle::dialect { diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h index 515eaaca1b348..c44f6c70fe33b 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h @@ -16,6 +16,7 @@ #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h" +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h" diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.cc new file mode 100644 index 0000000000000..d3e4b38b57a5b --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.h" +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h" + +namespace paddle::dialect { + +bool EmptyOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis) { + const auto &shape_gen_op = op->operand_source(0).defining_op(); + if (shape_gen_op->isa()) { + std::vector shape = details::GetVectorAttr( + shape_gen_op->dyn_cast(), "value"); + std::vector sym_dims; + sym_dims.reserve(shape.size()); + for (const int64_t &dim : shape) { + sym_dims.emplace_back(symbol::DimExpr(dim)); + } + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(sym_dims)}; + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + return true; + + } else { + pir::Value operand_source = op->operand_source(0); + const symbol::ShapeOrDataDimExprs &operand_shape_or_data = + shape_analysis->GetShapeOrDataForValue(operand_source); + + shape_analysis->SetShapeOrDataForValue(op->result(0), + operand_shape_or_data); + return true; + } +} + +bool GaussianOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + const auto &shape_gen_op = op->operand_source(0).defining_op(); + + if (shape_gen_op->isa()) { + std::vector shape = details::GetVectorAttr( + shape_gen_op->dyn_cast(), "value"); + std::vector sym_dims; + sym_dims.reserve(shape.size()); + for (const int64_t &dim : shape) { + sym_dims.emplace_back(symbol::DimExpr(dim)); + } + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(sym_dims)}; + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + return true; + + } else { + PADDLE_THROW(phi::errors::Unimplemented( + op->name() + + " 's InferSymbolicShape interface is NOT implemented now.")); + return true; + } +} + +} // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.h new file mode 100644 index 0000000000000..7e706bf942f83 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/nullary_infer_sym.h @@ -0,0 +1,22 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" + +namespace paddle::dialect { +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Empty) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Gaussian) +} // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index 9003b88c18fd3..9192478548d51 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -97,7 +97,6 @@ bool StackOpInferSymbolicShape(pir::Operation *op, static_cast(shape_data_list.size())); } else { for (int i = 0; i < rank; ++i) { - if (i == axis) continue; details::BuildCstrEqForTensorListAlongAxis( shape_analysis, shape_data_list, i); } @@ -931,26 +930,6 @@ bool SplitOpInferSymbolicShape(pir::Operation *op, } // Not Implemented Ops. - -bool DiagEmbedOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} -bool DiagonalOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} -bool DirichletOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - bool GatherOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { const auto &input_shape_or_data = @@ -1020,17 +999,33 @@ bool GatherOpInferSymbolicShape( bool KronOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} -bool KthvalueOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); + const auto &x_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(0)).shape(); + const auto &y_shape_or_data = + shape_analysis->GetShapeOrDataForValue(op->operand_source(1)).shape(); + const int rank_x = x_shape_or_data.size(); + const int rank_y = y_shape_or_data.size(); + const int rank = (rank_x > rank_y) ? rank_x : rank_y; + + std::vector dim_out; + dim_out.reserve(rank); + const auto one = symbol::DimExpr{1}; + const auto minus_one = symbol::DimExpr{-1}; + for (int i = 0; i < rank; i++) { + symbol::DimExpr dim_xi = + (i < rank - rank_x) ? one : x_shape_or_data.at(i - (rank - rank_x)); + symbol::DimExpr dim_yi = + (i < rank - rank_y) ? one : y_shape_or_data.at(i - (rank - rank_y)); + dim_out.push_back(dim_xi * dim_yi); + } + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(dim_out)}; + pir::Value res = op->result(0); + shape_analysis->SetShapeOrDataForValue(res, shape_data); return true; } +// Not Impelmented Ops. 
bool LogcumsumexpOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( @@ -1095,32 +1090,6 @@ bool UniqueConsecutiveOpInferSymbolicShape( op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); return true; } - -bool EinsumOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} -bool EmptyOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} -bool Exponential_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} -bool GaussianOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - bool LinspaceOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h index 9ad13dd02933e..a84d71815549b 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" namespace paddle::dialect { @@ -51,12 +50,8 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Split) // Not Impelmented Ops. 
-OP_DECLARE_INFER_SYMBOLIC_SHAPE(DiagEmbed) -OP_DECLARE_INFER_SYMBOLIC_SHAPE(Diagonal) -OP_DECLARE_INFER_SYMBOLIC_SHAPE(Dirichlet) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Gather) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Kron) -OP_DECLARE_INFER_SYMBOLIC_SHAPE(Kthvalue) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Logcumsumexp) OP_DECLARE_INFER_SYMBOLIC_SHAPE(MaskedSelect) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Poisson) @@ -67,10 +62,6 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(TakeAlongAxis) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Topk) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Unbind) OP_DECLARE_INFER_SYMBOLIC_SHAPE(UniqueConsecutive) -OP_DECLARE_INFER_SYMBOLIC_SHAPE(Einsum) -OP_DECLARE_INFER_SYMBOLIC_SHAPE(Empty) -OP_DECLARE_INFER_SYMBOLIC_SHAPE(Exponential_) -OP_DECLARE_INFER_SYMBOLIC_SHAPE(Gaussian) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Linspace) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Logspace) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Logsumexp) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc index bb540647d0219..f6d45dad1956a 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc @@ -154,6 +154,10 @@ bool Digamma_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return SameOperandsAndResultShape(op, shape_analysis); } +bool DirichletOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return SameOperandsAndResultShape(op, shape_analysis); +} bool EqualOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return SameOperandsAndResultShape(op, shape_analysis); @@ -194,6 +198,10 @@ bool Expm1_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return SameOperandsAndResultShape(op, shape_analysis); } +bool Exponential_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return SameOperandsAndResultShape(op, shape_analysis); +} bool FetchOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return SameOperandsAndResultShape(op, shape_analysis); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h index dc77d9cd70bb4..6afe08d753a55 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" namespace paddle::dialect { @@ -50,6 +49,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Cosh) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Cosh_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Digamma) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Digamma_) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Dirichlet) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Equal) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Equal_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Erf) @@ -60,6 +60,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Exp) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Exp_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Expm1) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Expm1_) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Exponential_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Fetch) 
OP_DECLARE_INFER_SYMBOLIC_SHAPE(Flip) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Floor) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc index c2e17f1f8f8c6..42067e28e310a 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc @@ -165,6 +165,121 @@ bool Cumsum_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return CumsumOpInferSymbolicShape(op, shape_analysis); } +bool DiagEmbedOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + pir::Value operand_source = op->operand_source(0); + const symbol::ShapeOrDataDimExprs &operand_shape_or_data = + shape_analysis->GetShapeOrDataForValue(operand_source); + const auto &attributes = op->attributes(); + int dim1 = attributes.at("dim1").dyn_cast().data(); + int dim2 = attributes.at("dim2").dyn_cast().data(); + int offset = attributes.at("offset").dyn_cast().data(); + + const auto &x_dims = operand_shape_or_data.shape(); + int dim1_ = dim1 < 0 ? x_dims.size() + dim1 + 1 : dim1; + int dim2_ = dim2 < 0 ? x_dims.size() + dim2 + 1 : dim2; + int64_t offset_ = static_cast(std::abs(offset)); + symbol::DimExpr new_dim_len = + symbol::DimExpr(offset_) + x_dims[x_dims.size() - 1]; + + const auto &out_dims = [&] { + std::vector out_dims = x_dims; + out_dims.pop_back(); + out_dims.insert(out_dims.begin() + std::min(dim1_, dim2_), new_dim_len); + out_dims.insert(out_dims.begin() + std::max(dim1_, dim2_), new_dim_len); + return out_dims; + }(); + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(out_dims)}; + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + return true; +} +bool DiagonalOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + pir::Value operand_source = op->operand_source(0); + const symbol::ShapeOrDataDimExprs &operand_shape_or_data = + shape_analysis->GetShapeOrDataForValue(operand_source); + const auto &attributes = op->attributes(); + int axis1 = attributes.at("axis1").dyn_cast().data(); + int axis2 = attributes.at("axis2").dyn_cast().data(); + int offset = attributes.at("offset").dyn_cast().data(); + + const auto &x_dims = operand_shape_or_data.shape(); + int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; + int axis2_ = axis2 < 0 ? x_dims.size() + axis2 : axis2; + + auto out_dims = x_dims; + auto axis1_size = out_dims[axis1_]; + auto axis2_size = out_dims[axis2_]; + out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); + out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); + + symbol::DimExprBuilder builder{nullptr}; + symbol::DimExpr zero{0}; + symbol::DimExpr res_shape; + symbol::DimExpr offset_sym{offset}; + if (offset == 0) { + res_shape = builder.Min(axis1_size, axis2_size); + } else if (offset > 0) { + if (axis2_size.isa()) { + res_shape = (axis2_size.dyn_cast() - offset) > 0 + ? builder.Min(axis1_size, axis2_size - offset_sym) + : zero; + } else { + res_shape = shape_analysis->GetNextSymName(); + } + } else { + if (axis1_size.isa()) { + res_shape = (axis1_size.dyn_cast() + offset) > 0 + ? 
builder.Min(axis1_size + offset_sym, axis2_size) + : zero; + } else { + res_shape = shape_analysis->GetNextSymName(); + } + } + out_dims.push_back(res_shape); + + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(out_dims)}; + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + return true; +} + +bool EinsumOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + PADDLE_THROW(phi::errors::Unimplemented( + op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); + return true; +} + +bool KthvalueOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + pir::Value operand_source = op->operand_source(0); + const symbol::ShapeOrDataDimExprs &operand_shape_or_data = + shape_analysis->GetShapeOrDataForValue(operand_source); + const auto &attributes = op->attributes(); + int axis = attributes.at("axis").dyn_cast().data(); + bool keepdim = GetBoolAttr(op, "keepdim"); + + const auto &input_dims = operand_shape_or_data.shape(); + const int &dim_size = input_dims.size(); + if (axis < 0) axis += dim_size; + std::vector out_dims; + for (int i = 0; i < axis; i++) { + out_dims.emplace_back(input_dims[i]); + } + if (keepdim && dim_size > 0) { + out_dims.emplace_back(symbol::DimExpr(1)); + } + for (int i = axis + 1; i < dim_size; i++) { + out_dims.emplace_back(input_dims[i]); + } + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(out_dims)}; + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + shape_analysis->SetShapeOrDataForValue(op->result(1), shape_data); + return true; +} bool ReshapeOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { pir::Value operand_source = op->operand_source(0); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h index 8d47e5a5fd91e..aeeb03713f481 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" namespace paddle::dialect { @@ -29,6 +28,10 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Cumprod) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Cumprod_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Cumsum) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Cumsum_) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(DiagEmbed) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Diagonal) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Einsum) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Kthvalue) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Reshape) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Reshape_) diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.h b/paddle/fluid/pir/dialect/operator/utils/utils.h index a0248993caaaf..fd8ec68401b08 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.h +++ b/paddle/fluid/pir/dialect/operator/utils/utils.h @@ -28,10 +28,6 @@ namespace dialect { using VariantType = phi::Attribute; -#define OP_DECLARE_INFER_SYMBOLIC_SHAPE(name) \ - bool name##OpInferSymbolicShape( \ - pir::Operation* op, pir::ShapeConstraintIRAnalysis* shape_analysis); - // TODO(zhangbo): The builtin type needs to cover all data types of // phi::DataType. 
static inline phi::DataType TransToPhiDataType(pir::Type dtype) { diff --git a/paddle/pir/include/dialect/shape/utils/shape_analysis.h b/paddle/pir/include/dialect/shape/utils/shape_analysis.h index 284487b7210c5..04625f3047e40 100644 --- a/paddle/pir/include/dialect/shape/utils/shape_analysis.h +++ b/paddle/pir/include/dialect/shape/utils/shape_analysis.h @@ -100,4 +100,8 @@ class IR_API ShapeAnalysisManager { std::unordered_map tables_; }; +#define OP_DECLARE_INFER_SYMBOLIC_SHAPE(name) \ + bool name##OpInferSymbolicShape( \ + pir::Operation* op, pir::ShapeConstraintIRAnalysis* shape_analysis); + } // namespace pir diff --git a/test/ir/pir/cinn/symbolic/test_binary_op_infer_sym_shape.py b/test/ir/pir/cinn/symbolic/test_binary_op_infer_sym_shape.py new file mode 100644 index 0000000000000..ab190bf57476e --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_binary_op_infer_sym_shape.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.static import InputSpec + + +def get_sym_shape_str_for_op(net, input_spec, op_name='builtin.shadow_output'): + forward_program = net.forward.get_concrete_program(*input_spec)[ + 1 + ].infer_program.forward_program + all_sym_shape_str = [] + for op in forward_program.global_block().ops: + if op.name() == op_name: + all_sym_shape_str.append(op.attrs()['sym_shape_str']) + + return all_sym_shape_str + + +def apply_to_static(net, use_cinn, input_spec=None): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static( + net, + input_spec=input_spec, + build_strategy=build_strategy, + full_graph=True, + ) + + +class TestBase(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + self.prepare_data() + + def prepare_data(self): + pass + + def test_eval_symbolic(self): + pass + + +class KronNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + y = paddle.empty(shape=[2, 2]) + z = paddle.empty(shape=[3, 3]) + out = paddle.kron(x, y) + out = paddle.kron(y, z) + return out + + +class TestKronOpInferSymbolicShape(TestBase): + def prepare_data(self): + self.cases = [np.random.rand(4, 5, 6)] + + self.expected = [ + [ + 'shape[Mul(S0, 1), Mul(S1, 2), Mul(S2, 2)], data[NULL]', + 'shape[6, 6], data[NULL]', + ] + ] + + def test_eval_symbolic(self): + net = KronNet() + + for i in range(len(self.cases)): + x = self.cases[i] + x_spec = InputSpec( + shape=[None for index in range(len(x.shape))], dtype='float32' + ) + + input_spec = [x_spec] + net = apply_to_static(net, False, input_spec) + net.eval() + + # check the infer result + sym_shape_str_list = get_sym_shape_str_for_op( + net, input_spec, 'pd_op.kron' + ) + np.testing.assert_equal( + len(sym_shape_str_list), len(self.expected[i]) + ) + for j in range(len(sym_shape_str_list)): + np.testing.assert_equal( + sym_shape_str_list[j].find(self.expected[i][j]), + 0, + f'in case i,j = 
{i},{j}: output shape ({sym_shape_str_list[0]}) is not expected {(self.expected[i][j])}', + ) + + return True + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_nullary_op_infer_sym_shape.py b/test/ir/pir/cinn/symbolic/test_nullary_op_infer_sym_shape.py new file mode 100644 index 0000000000000..1df40d9bcb4af --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_nullary_op_infer_sym_shape.py @@ -0,0 +1,156 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.static import InputSpec + + +def get_sym_shape_str_for_op(net, input_spec, op_name='builtin.shadow_output'): + forward_program = net.forward.get_concrete_program(*input_spec)[ + 1 + ].infer_program.forward_program + all_sym_shape_str = [] + for op in forward_program.global_block().ops: + if op.name() == op_name: + all_sym_shape_str.append(op.attrs()['sym_shape_str']) + + return all_sym_shape_str + + +def apply_to_static(net, use_cinn, input_spec=None): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static( + net, + input_spec=input_spec, + build_strategy=build_strategy, + full_graph=True, + ) + + +class TestBase(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + self.prepare_data() + + def prepare_data(self): + pass + + def test_eval_symbolic(self): + pass + + +class EmptyNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + out = paddle.empty(shape=[128, 32]) + out = paddle.empty(shape=x) + return out + + +class TestEmptyOpInferSymbolicShape(TestBase): + def prepare_data(self): + self.cases = [np.random.rand(4, 5, 6)] + self.expected = [ + [ + 'shape[128, 32], data[NULL]', + 'shape[S0, S1, S2], data[NULL]', + ] + ] + + def test_eval_symbolic(self): + net = EmptyNet() + + for i in range(len(self.cases)): + x = self.cases[i] + x_spec = InputSpec( + shape=[None for index in range(len(x.shape))], dtype='float32' + ) + + input_spec = [x_spec] + net = apply_to_static(net, False, input_spec) + net.eval() + + # check the infer result + sym_shape_str_list = get_sym_shape_str_for_op( + net, input_spec, 'pd_op.empty' + ) + np.testing.assert_equal( + len(sym_shape_str_list), len(self.expected[i]) + ) + for j in range(len(sym_shape_str_list)): + np.testing.assert_equal( + sym_shape_str_list[j].find(self.expected[i][j]), + 0, + f'in case i,j = {i},{j}: output shape ({sym_shape_str_list[0]}) is not expected {(self.expected[i][j])}', + ) + + return True + + +class GaussianNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + out = paddle.tensor.random.gaussian(shape=[12, 32], mean=1.0, std=2.0) + return out + + +class TestGaussianOpInferSymbolicShape(TestBase): + def prepare_data(self): + self.cases = [np.random.rand(4, 5, 6)] + self.expected = [ + [ + 'shape[12, 32], data[NULL]', + ] + ] + + def test_eval_symbolic(self): + net = 
GaussianNet() + + for i in range(len(self.cases)): + x = self.cases[i] + x_spec = InputSpec(shape=[None, None, 2], dtype='float32') + + input_spec = [x_spec] + net = apply_to_static(net, False, input_spec) + net.eval() + + # check the infer result + sym_shape_str_list = get_sym_shape_str_for_op( + net, input_spec, 'pd_op.gaussian' + ) + + np.testing.assert_equal( + len(sym_shape_str_list), len(self.expected[i]) + ) + for j in range(len(sym_shape_str_list)): + np.testing.assert_equal( + sym_shape_str_list[j].find(self.expected[i][j]), + 0, + f'in case i,j = {i},{j}: output shape ({sym_shape_str_list[0]}) is not expected {(self.expected[i][j])}', + ) + + return True + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py b/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py index 4f666b64f7bc3..a740b47542ccf 100644 --- a/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py +++ b/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py @@ -265,5 +265,171 @@ def test_eval_symbolic(self): return True +class DiagEmbedNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + data = paddle.empty([6]) + out = paddle.diag_embed(data) + out = paddle.diag_embed(data, offset=-1, dim1=0, dim2=1) + out = paddle.diag_embed(x) + out = paddle.diag_embed(x, offset=-1, dim1=0, dim2=1) + return out + + +class TestDiagEmbedOpInferSymbolicShape(TestBase): + def prepare_data(self): + self.cases = [np.random.rand(4, 5, 6)] + self.expected = [ + [ + 'shape[6, 6], data[NULL]', + 'shape[7, 7], data[NULL]', + 'shape[S0, S1, Add(0, S2), Add(0, S2)], data[NULL]', + 'shape[Add(1, S2), Add(1, S2), S0, S1], data[NULL]', + ] + ] + + def test_eval_symbolic(self): + net = DiagEmbedNet() + + for i in range(len(self.cases)): + x = self.cases[i] + x_spec = InputSpec( + shape=[None for index in range(len(x.shape))], dtype='float32' + ) + + input_spec = [x_spec] + net = apply_to_static(net, False, input_spec) + net.eval() + + # check the infer result + sym_shape_str_list = get_sym_shape_str_for_op( + net, input_spec, 'pd_op.diag_embed' + ) + + np.testing.assert_equal( + len(sym_shape_str_list), len(self.expected[i]) + ) + for j in range(len(sym_shape_str_list)): + np.testing.assert_equal( + sym_shape_str_list[j].find(self.expected[i][j]), + 0, + f'in case i,j = {i},{j}: output shape ({sym_shape_str_list[0]}) is not expected {(self.expected[i][j])}', + ) + + return True + + +class DiagonalNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + data = paddle.empty([2, 2, 3], 'float32') + out = paddle.diagonal(data) + out = paddle.diagonal(data, offset=0, axis1=2, axis2=1) + out = paddle.diagonal(x) + out = paddle.diagonal(x, offset=0, axis1=2, axis2=1) + out = paddle.diagonal(x, offset=1, axis1=2, axis2=1) + out = paddle.diagonal(x, offset=-1, axis1=2, axis2=1) + return out + + +class TestDiagonalOpInferSymbolicShape(TestBase): + def prepare_data(self): + self.cases = [np.random.rand(4, 5, 6)] + self.expected = [ + [ + 'shape[3, Min(2, 2)], data[NULL]', + 'shape[2, Min(3, 2)], data[NULL]', + 'shape[S2, Min(S0, S1)], data[NULL]', + 'shape[S0, Min(S2, S1)], data[NULL]', + 'shape[S0, S3], data[NULL]', + 'shape[S0, S4], data[NULL]', + ] + ] + + def test_eval_symbolic(self): + net = DiagonalNet() + + for i in range(len(self.cases)): + x = self.cases[i] + x_spec = InputSpec( + shape=[None for index in range(len(x.shape))], dtype='float32' + ) + + input_spec = [x_spec] + net = 
apply_to_static(net, False, input_spec) + net.eval() + + # check the infer result + sym_shape_str_list = get_sym_shape_str_for_op( + net, input_spec, 'pd_op.diagonal' + ) + + np.testing.assert_equal( + len(sym_shape_str_list), len(self.expected[i]) + ) + for j in range(len(sym_shape_str_list)): + np.testing.assert_equal( + sym_shape_str_list[j].find(self.expected[i][j]), + 0, + f'in case i,j = {i},{j}: output shape ({sym_shape_str_list[0]}) is not expected {(self.expected[i][j])}', + ) + + return True + + +class KthvalueNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + data = paddle.empty([2, 3, 3], 'float32') + out = paddle.kthvalue(data, 2, 1) + return out + + +class TestKthvalueOpInferSymbolicShape(TestBase): + def prepare_data(self): + self.cases = [np.random.rand(4, 5, 6)] + self.expected = [ + [ + 'shape[2, 3], data[NULL]', + ] + ] + + def test_eval_symbolic(self): + net = KthvalueNet() + + for i in range(len(self.cases)): + x = self.cases[i] + x_spec = InputSpec( + shape=[None for index in range(len(x.shape))], dtype='float32' + ) + + input_spec = [x_spec] + net = apply_to_static(net, False, input_spec) + net.eval() + + # check the infer result + sym_shape_str_list = get_sym_shape_str_for_op( + net, input_spec, 'pd_op.kthvalue' + ) + + np.testing.assert_equal( + len(sym_shape_str_list), len(self.expected[i]) + ) + for j in range(len(sym_shape_str_list)): + np.testing.assert_equal( + sym_shape_str_list[j].find(self.expected[i][j]), + 0, + f'in case i,j = {i},{j}: output shape ({sym_shape_str_list[0]}) is not expected {(self.expected[i][j])}', + ) + + return True + + if __name__ == '__main__': unittest.main() From cc1be3e84beb72f5450168b4fefd9d2b0e5fefb6 Mon Sep 17 00:00:00 2001 From: Reese Wang Date: Thu, 7 Mar 2024 10:50:28 +0800 Subject: [PATCH 039/114] Enhance several unit tests (#62477) * Manually release predictor_tuned Signed-off-by: rewang * Add indices to no_cast_list to keep it as fp32 Signed-off-by: rewang * Set both atol and rtol for the fp16 test_trt_convert_solve Signed-off-by: rewang * Merge branch 'rewang/fix_test_sparse_fused_attention_seed' into 'nv-2.6.0' Fix test_sparse_fused_attention random seed See merge request dl/paddle/paddle!312 --------- Signed-off-by: rewang Co-authored-by: Ryan Jeng --- test/cpp/inference/api/trt_dynamic_shape_test.cc | 1 + test/ir/inference/test_trt_convert_lookup_table.py | 1 + test/ir/inference/test_trt_convert_solve.py | 2 +- test/legacy_test/test_sparse_fused_attention_op.py | 5 +++++ 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/test/cpp/inference/api/trt_dynamic_shape_test.cc b/test/cpp/inference/api/trt_dynamic_shape_test.cc index bbfdc0a2cd228..c6f6f8b16d358 100644 --- a/test/cpp/inference/api/trt_dynamic_shape_test.cc +++ b/test/cpp/inference/api/trt_dynamic_shape_test.cc @@ -191,6 +191,7 @@ void TestTunedDynamic() { output_t->copy_to_cpu(out_data.data()); }; check_func(predictor_tuned.get()); + predictor_tuned.reset(nullptr); // check tuned_dynamic_shape AnalysisConfig config; diff --git a/test/ir/inference/test_trt_convert_lookup_table.py b/test/ir/inference/test_trt_convert_lookup_table.py index e1fb64bcdf545..b7cf7d657d7a0 100644 --- a/test/ir/inference/test_trt_convert_lookup_table.py +++ b/test/ir/inference/test_trt_convert_lookup_table.py @@ -80,6 +80,7 @@ def generate_input2(dims, attrs: List[Dict[str, Any]]): ) }, outputs=["out_data"], + no_cast_list=["indices"], ) yield program_config diff --git a/test/ir/inference/test_trt_convert_solve.py 
b/test/ir/inference/test_trt_convert_solve.py index c3117ee335740..f12fb453a48f6 100644 --- a/test/ir/inference/test_trt_convert_solve.py +++ b/test/ir/inference/test_trt_convert_solve.py @@ -89,7 +89,7 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-3 + yield self.create_inference_config(), (1, 3), (1e-3, 1e-3) def test(self): self.run_test() diff --git a/test/legacy_test/test_sparse_fused_attention_op.py b/test/legacy_test/test_sparse_fused_attention_op.py index 68cdd16d4bd12..098f4815b85f3 100644 --- a/test/legacy_test/test_sparse_fused_attention_op.py +++ b/test/legacy_test/test_sparse_fused_attention_op.py @@ -42,6 +42,7 @@ def get_cuda_version(): ) class TestSparseAttentionAPI1(unittest.TestCase): def setUp(self): + paddle.seed(0) self.batch_size = 16 self.num_heads = 16 self.seq_len = 128 @@ -134,6 +135,7 @@ def test_dygraph(self): class TestSparseAttentionAPI2(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 128 @@ -144,6 +146,7 @@ def setUp(self): class TestSparseAttentionAPI3(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 512 @@ -154,6 +157,7 @@ def setUp(self): class TestSparseAttentionAPI4(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 512 @@ -164,6 +168,7 @@ def setUp(self): class TestSparseAttentionAPI5(TestSparseAttentionAPI1): def setUp(self): + super().setUp() self.batch_size = 16 self.num_heads = 16 self.seq_len = 512 From 1128c78b68d6c41043e0052dbd1d5f6837a09728 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 7 Mar 2024 10:59:21 +0800 Subject: [PATCH 040/114] [PIR] refine onednn add_n (#62471) * refine onednn add_n * refine --- .../ir_adaptor/translator/op_translator.cc | 17 ++++------------- .../fluid/pir/dialect/operator/ir/onednn.yaml | 10 ---------- paddle/fluid/pir/dialect/operator/ir/ops.yaml | 12 ++++++++---- .../dialect/operator/ir/ops_onednn_extra.yaml | 2 +- 4 files changed, 13 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index 3f60f63266b93..6a7e8a4dd5b44 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -1367,19 +1367,10 @@ struct ShadowOutputOpTranscriber : public OpTranscriber { struct AddNOpTranscriber : public OpTranscriber { pir::OpInfo LookUpOpInfo(pir::IrContext* ctx, const OpDesc& op_desc) override { - auto prefix = GetPrefix(ctx, op_desc); - std::string target_op_name; -#ifdef PADDLE_WITH_DNNL - if (prefix == kOneDNNTargetDialectPrefix) { - target_op_name = std::string(kOneDNNTargetDialectPrefix) + "add_n_onednn"; - } else // NOLINT -#endif - { - target_op_name = - GetPrefix(ctx, op_desc) + OpNameCompatibleMapping(op_desc.Type()); - if (IsInplace(op_desc)) { - target_op_name += "_"; - } + std::string target_op_name = + GetPrefix(ctx, op_desc) + OpNameCompatibleMapping(op_desc.Type()); + if (IsInplace(op_desc)) { + target_op_name += "_"; } const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name); diff --git a/paddle/fluid/pir/dialect/operator/ir/onednn.yaml b/paddle/fluid/pir/dialect/operator/ir/onednn.yaml index 18a799dfb28a9..282dd35cb3453 100644 --- 
a/paddle/fluid/pir/dialect/operator/ir/onednn.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/onednn.yaml @@ -1,13 +1,3 @@ -- op : add_n_onednn - args : (Tensor[] inputs) - output : Tensor(out) - infer_meta: - func: AddNInferMeta - param: [inputs] - kernel: - func: add_n - param: [inputs] - - op : dequantize args : (Tensor input, float scale=1.0, float shift=0.0) output : Tensor(output) diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml index 6a655d9851ec5..616695fad5149 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -28,12 +28,16 @@ support_trans_dtype : x, y interfaces : paddle::dialect::InferSymbolicShapeInterface -# this add_n is only for ops_api_gen.py +# this add_n is only for ops_api_gen.py and onednn - op : add_n args : (Tensor[] inputs) - output : Tensor - invoke : add_n_impl(inputs) - backward : add_n_grad + output : Tensor(out) + infer_meta: + func: AddNInferMeta + param: [inputs] + kernel: + func: add_n + param: [inputs] - op : all args : (Tensor x, int64_t[] axis={}, bool keepdim=false) diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml index 39ae6203cfd43..2e16dfce8cacf 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml @@ -15,7 +15,7 @@ - op : abs_grad -- op : add_n_onednn +- op : add_n extra_args : str mkldnn_data_type="float32" - op : batch_norm From be55c7b6aa03bcacf818f4a4373312539832f4fe Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 7 Mar 2024 10:59:55 +0800 Subject: [PATCH 041/114] Fix axies -> axes (#62481) --- .../hlir/framework/pir/op_lowering_util.cc | 2 +- paddle/cinn/hlir/pe/ir_schedule_pe.cc | 2 +- paddle/cinn/hlir/pe/schedule.cc | 2 +- paddle/cinn/ir/tensor.cc | 6 ++-- paddle/cinn/poly/isl_utils.cc | 32 +++++++++---------- paddle/cinn/poly/isl_utils.h | 19 ++++++----- paddle/cinn/poly/stage.cc | 22 ++++++------- 7 files changed, 42 insertions(+), 43 deletions(-) diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_util.cc b/paddle/cinn/hlir/framework/pir/op_lowering_util.cc index 038908ff1ab99..d493f0a99b67d 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_util.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_util.cc @@ -727,7 +727,7 @@ void LoopAssignReduceWithoutLast(ir::IRSchedule& ir_sch, // NOLINT // the loop size at axis is 1, need remove axes_shift_num[j] = -1; } else if (axes[j] > idx) { - // the axies value need left shift + // the axes value need left shift axes_shift_num[j]++; } } diff --git a/paddle/cinn/hlir/pe/ir_schedule_pe.cc b/paddle/cinn/hlir/pe/ir_schedule_pe.cc index 36052d25f8a44..71b52d12493e9 100644 --- a/paddle/cinn/hlir/pe/ir_schedule_pe.cc +++ b/paddle/cinn/hlir/pe/ir_schedule_pe.cc @@ -200,7 +200,7 @@ std::vector IRCudaScheduleMatMul( ir_sch.MergeExprs(); // Generally, there are 2 ScheduleBlocks in the lowered function, // the first is for reduce_init and the second is the real compute block, - // here we use loops of the first block to Bind GPU index in top spatial axies + // here we use loops of the first block to Bind GPU index in top spatial axes auto init_block = ir_sch.GetAllBlocks().front(); VLOG(3) << "Matmul lowered expr:\n" << ir_sch.GetModule().GetExprs().front(); diff --git a/paddle/cinn/hlir/pe/schedule.cc b/paddle/cinn/hlir/pe/schedule.cc index 3c3067ce436ab..aea041783114a 100644 --- a/paddle/cinn/hlir/pe/schedule.cc 
+++ b/paddle/cinn/hlir/pe/schedule.cc @@ -290,7 +290,7 @@ void MatmulScheduleCPU(poly::StageMap stages, for (int i = 0; i < all_axes_inner.size(); ++i) { all_axes.push_back(all_axes_inner[i]); } - // int axies + // int axes CHECK_EQ(all_axes.size(), out_axis_dims); if (is_k_splited) { if (is_m_splited || is_n_splited) { diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc index 5224a2172ac5c..c2ba20487e2a8 100644 --- a/paddle/cinn/ir/tensor.cc +++ b/paddle/cinn/ir/tensor.cc @@ -359,7 +359,7 @@ ir::Tensor _Tensor_::InitReduction(poly::StageMap stages, std::vector reduce_axis_input = stages[this]->origin_reduce_axis_names(); auto origin_domain = stages[this]->domain(); - auto reduce_axis_output = poly::GetRelatedOutputAxies( + auto reduce_axis_output = poly::GetRelatedOutputAxes( temp_transform, origin_domain, reduce_axis_input); std::set reduce_axis_output_set; for (auto &i : reduce_axis_output) { @@ -374,7 +374,7 @@ ir::Tensor _Tensor_::InitReduction(poly::StageMap stages, } } - temp_transform = poly::RemoveAxiesByOutputNames( + temp_transform = poly::RemoveAxesByOutputNames( temp_transform, origin_domain, reduce_axis_output); //! When the first axis is not reduce axis, do ComputeAt. @@ -386,7 +386,7 @@ ir::Tensor _Tensor_::InitReduction(poly::StageMap stages, init_tensor->shape = shape; return init_tensor; } - //! When reduce axies are reordered to front, ComputeAt is illegal. + //! When reduce axes are reordered to front, ComputeAt is illegal. //! So we just copy transform and forloopInfo. isl_map_set_tuple_name( temp_transform.get(), isl_dim_in, init_reduce_tensor_name.c_str()); diff --git a/paddle/cinn/poly/isl_utils.cc b/paddle/cinn/poly/isl_utils.cc index ed3a9b7f86e15..8262db4f14e29 100644 --- a/paddle/cinn/poly/isl_utils.cc +++ b/paddle/cinn/poly/isl_utils.cc @@ -422,14 +422,14 @@ isl::set isl_set_dim_name_if_null( return isl::manage(set); } -isl::map RemoveAxiesByInputNames(const isl::map &x, - const isl::set &origin_domain, - const std::vector &dim_in_names) { +isl::map RemoveAxesByInputNames(const isl::map &x, + const isl::set &origin_domain, + const std::vector &dim_in_names) { std::string map_str = isl_map_to_str(x.get()); isl::ctx this_ctx = x.ctx(); isl::map temp_transform(this_ctx, map_str); auto related_output_names = - GetRelatedOutputAxies(x, origin_domain, dim_in_names); + GetRelatedOutputAxes(x, origin_domain, dim_in_names); if (dim_in_names.empty()) return temp_transform; for (auto &i : dim_in_names) { temp_transform = isl::manage(isl_remove_axis_by_name( @@ -442,7 +442,7 @@ isl::map RemoveAxiesByInputNames(const isl::map &x, return temp_transform; } -isl::map RemoveAxiesByOutputNames( +isl::map RemoveAxesByOutputNames( const isl::map &x, const isl::set &origin_domain, const std::vector &dim_out_names) { @@ -450,7 +450,7 @@ isl::map RemoveAxiesByOutputNames( isl::ctx this_ctx = x.ctx(); isl::map temp_transform(this_ctx, map_str); auto related_input_names = - GetRelatedInputAxies(x, origin_domain, dim_out_names); + GetRelatedInputAxes(x, origin_domain, dim_out_names); if (dim_out_names.empty()) return temp_transform; for (auto &i : dim_out_names) { temp_transform = isl::manage(isl_remove_axis_by_name( @@ -463,24 +463,24 @@ isl::map RemoveAxiesByOutputNames( return temp_transform; } -std::vector GetRelatedOutputAxies( +std::vector GetRelatedOutputAxes( const isl::map &x, const isl::set &origin_domain, const std::vector &dim_in_names) { std::string map_str = isl_map_to_str(x.get()); - VLOG(1) << "GetRelatedOutputAxies map_str is : " << map_str; + VLOG(1) << 
"GetRelatedOutputAxes map_str is : " << map_str; isl::ctx this_ctx = x.ctx(); isl::map temp_transform(this_ctx, map_str); auto dim_out_names = isl_get_dim_names(temp_transform, isl_dim_out); std::set dim_in_set; for (auto &i : dim_in_names) { - VLOG(1) << "GetRelatedOutputAxies dim_in_names is : " << i; + VLOG(1) << "GetRelatedOutputAxes dim_in_names is : " << i; dim_in_set.insert(i); } std::set res_set; for (auto &i : dim_out_names) { auto related_in_dim = - GetRelatedInputAxies(temp_transform, origin_domain, {i}); + GetRelatedInputAxes(temp_transform, origin_domain, {i}); for (auto &j : related_in_dim) { if (dim_in_set.count(j) > 0) { res_set.insert(i); @@ -489,24 +489,24 @@ std::vector GetRelatedOutputAxies( } std::vector res; for (auto &i : res_set) { - VLOG(1) << "GetRelatedOutputAxies res is : " << i; + VLOG(1) << "GetRelatedOutputAxes res is : " << i; res.push_back(i); } return res; } -std::vector GetRelatedInputAxies( +std::vector GetRelatedInputAxes( const isl::map &x, const isl::set &origin_domain, const std::vector &dim_out_names, bool strict) { std::string map_str = isl_map_to_str(x.get()); - VLOG(1) << "GetRelatedInputAxies map_str is : " << map_str; + VLOG(1) << "GetRelatedInputAxes map_str is : " << map_str; isl::ctx this_ctx = x.ctx(); isl::map temp_transform(this_ctx, map_str); auto dim_in_names = isl_get_dim_names(temp_transform, isl_dim_in); for (auto &i : dim_out_names) { - VLOG(1) << "GetRelatedInputAxies dim_out_names is : " << i; + VLOG(1) << "GetRelatedInputAxes dim_out_names is : " << i; temp_transform = isl::manage(isl_remove_axis_by_name( temp_transform.release(), isl_dim_out, i.c_str())); } @@ -526,10 +526,10 @@ std::vector GetRelatedInputAxies( } for (auto &i : dim_in_names) { if (utils::Count(&map_str, i) != utils::Count(&deleted_map, i)) { - VLOG(1) << "GetRelatedInputAxies res is : " << i; + VLOG(1) << "GetRelatedInputAxes res is : " << i; res.push_back(i); } else if (out_set_without_suffix.count(i) > 0 && !strict) { - VLOG(1) << "GetRelatedInputAxies res is : " << i; + VLOG(1) << "GetRelatedInputAxes res is : " << i; res.push_back(i); } else if (out_set.count(i) > 0) { auto range1 = isl_set_get_axis_range_by_name(origin_domain.get(), i); diff --git a/paddle/cinn/poly/isl_utils.h b/paddle/cinn/poly/isl_utils.h index d9ae0ca65de82..6b74aadc73816 100644 --- a/paddle/cinn/poly/isl_utils.h +++ b/paddle/cinn/poly/isl_utils.h @@ -122,9 +122,9 @@ isl::set SetGetDims(isl::set set, const std::vector& dims); * @param dim_in_names The names of input dims to remove. * @return The edited map. */ -isl::map RemoveAxiesByInputNames(const isl::map& x, - const isl::set& origin_domain, - const std::vector& dim_in_names); +isl::map RemoveAxesByInputNames(const isl::map& x, + const isl::set& origin_domain, + const std::vector& dim_in_names); /** * Given an isl::map and a vector of names of dim_out, @@ -133,22 +133,21 @@ isl::map RemoveAxiesByInputNames(const isl::map& x, * @param dim_in_names The names of output dims to remove. * @return The edited map. */ -isl::map RemoveAxiesByOutputNames( - const isl::map& x, - const isl::set& origin_domain, - const std::vector& dim_out_names); +isl::map RemoveAxesByOutputNames(const isl::map& x, + const isl::set& origin_domain, + const std::vector& dim_out_names); /** * Given an isl::map and a vector of names of dim_out, * get the names of related input dims. * @param x The input map. * @param dim_out_names The names of output dims. - * @param strict Indicates whether computes the strictly related input axies. 
+ * @param strict Indicates whether computes the strictly related input axes. * For example, if strict == true, then input 'j' is related to output * 'j_outer_inner_outer' * @return The vector of names of related input dims. */ -std::vector GetRelatedInputAxies( +std::vector GetRelatedInputAxes( const isl::map& x, const isl::set& origin_domain, const std::vector& dim_out_names, @@ -161,7 +160,7 @@ std::vector GetRelatedInputAxies( * @param dim_in_names The names of input dims. * @return The vector of names of related output dims. */ -std::vector GetRelatedOutputAxies( +std::vector GetRelatedOutputAxes( const isl::map& x, const isl::set& origin_domain, const std::vector& dim_in_names); diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc index aca5e548f09fb..60ae01782770d 100644 --- a/paddle/cinn/poly/stage.cc +++ b/paddle/cinn/poly/stage.cc @@ -441,7 +441,7 @@ void Stage::EditTempTensor(Stage *other, int level) { } } // Iterators of loop within level will be erased. - auto related_dim_in = GetRelatedInputAxies( + auto related_dim_in = GetRelatedInputAxes( this->transform(), this->domain(), {transform_domain_names[i]}); for (auto &j : related_dim_in) { erase_var.insert(j); @@ -460,27 +460,27 @@ void Stage::EditTempTensor(Stage *other, int level) { if (bind_info[new_i].for_type == ir::ForType::GPUBlock && (this->scope() == ScopeKind::kShared || this->scope() == ScopeKind::kLocal)) { - auto related_dim_in = GetRelatedInputAxies( + auto related_dim_in = GetRelatedInputAxes( this->transform(), this->domain(), {transform_domain_names[i]}); for (auto &j : related_dim_in) { erase_var.insert(j); } } else if (bind_info[new_i].for_type == ir::ForType::GPUThread && (this->scope() == ScopeKind::kLocal)) { - auto related_dim_in = GetRelatedInputAxies( + auto related_dim_in = GetRelatedInputAxes( this->transform(), this->domain(), {transform_domain_names[i]}); for (auto &j : related_dim_in) { erase_var.insert(j); } } else { - auto related_dim_in = GetRelatedInputAxies( + auto related_dim_in = GetRelatedInputAxes( this->transform(), this->domain(), {transform_domain_names[i]}); for (auto &j : related_dim_in) { undo_erase_var.insert(j); } } } else { - auto related_dim_in = GetRelatedInputAxies( + auto related_dim_in = GetRelatedInputAxes( this->transform(), this->domain(), {transform_domain_names[i]}); for (auto &j : related_dim_in) { undo_erase_var.insert(j); @@ -608,9 +608,9 @@ void Stage::ComputeAt(Stage *other, int level) { level_out_dims.push_back(target_map_dims[i]); related_output_dims_set.insert(target_map_dims[i]); } - auto related_input_dims = GetRelatedInputAxies( + auto related_input_dims = GetRelatedInputAxes( new_target_transform, other->domain(), level_out_dims); - auto related_output_dims = GetRelatedOutputAxies( + auto related_output_dims = GetRelatedOutputAxes( new_target_transform, other->domain(), related_input_dims); for (auto &i : related_output_dims) { related_output_dims_set.insert(i); @@ -708,7 +708,7 @@ void Stage::ComputeAt(Stage *other, int level) { int max_iv = maxv.get_num_si(); int min_iv = minv.get_num_si(); auto related_input_dims = - GetRelatedInputAxies(trans_res, domain_, {trans_dim_out[i]}, true); + GetRelatedInputAxes(trans_res, domain_, {trans_dim_out[i]}, true); if (max_iv != min_iv && related_input_dims.empty()) { trans_res = isl::manage(isl_remove_axis_by_name( trans_res.release(), isl_dim_out, trans_dim_out[i].c_str())); @@ -1627,7 +1627,7 @@ void Stage::AddForloopInfo(int level, const StageForloopInfo &info) { } void Stage::CopyTransform(Stage 
*other, int level) { - auto target_transform = RemoveAxiesByInputNames( + auto target_transform = RemoveAxesByInputNames( other->transform(), other->domain(), other->origin_reduce_axis_names()); isl::set target_origin_domain(other->domain().ctx(), isl_set_to_str(other->domain().get())); @@ -1654,9 +1654,9 @@ void Stage::CopyTransform(Stage *other, int level) { dim_out_level.push_back( isl_map_get_dim_name(temp_target_trans.get(), isl_dim_out, i)); } - auto related_dim_in = GetRelatedInputAxies( + auto related_dim_in = GetRelatedInputAxes( temp_target_trans, target_origin_domain, dim_out_level); - auto related_dim_out = GetRelatedOutputAxies( + auto related_dim_out = GetRelatedOutputAxes( temp_target_trans, target_origin_domain, related_dim_in); for (auto &i : related_dim_out) { if (i == pivot_dim_out) { From 928c35add0a8046cb0e76ab2db51aaadad9811c2 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 7 Mar 2024 11:00:28 +0800 Subject: [PATCH 042/114] Update alterlayout.cc (#62465) --- paddle/cinn/hlir/pass/alterlayout.cc | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/paddle/cinn/hlir/pass/alterlayout.cc b/paddle/cinn/hlir/pass/alterlayout.cc index 4e7df28e7994a..438a7e997d3f9 100644 --- a/paddle/cinn/hlir/pass/alterlayout.cc +++ b/paddle/cinn/hlir/pass/alterlayout.cc @@ -323,7 +323,7 @@ void AlterLayoutPass(Graph* graph) { src_input_layout, dst_input_layout, cinn::common::UniqName(node->op()->name + - "_input_layout_tranform")); + "_input_layout_transform")); UpdateInferInfos(input_trans_node, {input_shape}, {input_type}, @@ -371,7 +371,7 @@ void AlterLayoutPass(Graph* graph) { src_kernel_layout, dst_kernel_layout, cinn::common::UniqName(node->op()->name + - "_weight_layout_tranform")); + "_weight_layout_transform")); UpdateInferInfos(weight_trans_node, {weight_shape}, {weight_type}, @@ -512,7 +512,8 @@ void AlterLayoutPass(Graph* graph) { layout_dict[source->id()] = src_layout; auto input_data = source->safe_as(); CHECK(input_data); - VLOG(3) << source->id() << " do layout_tranform from C to NCHW"; + VLOG(3) << source->id() + << " do layout_transform from C to NCHW"; std::string op_type = "broadcast_to"; auto trans_node = new Node( Operator::Get(op_type), @@ -543,7 +544,7 @@ void AlterLayoutPass(Graph* graph) { NodeData* new_output_data; Node* new_trans_node; VLOG(3) << new_input_data->id() - << " do layout_tranform from NCHW to NCHWxc"; + << " do layout_transform from NCHW to NCHWxc"; std::tie(new_trans_node, new_output_data) = InsertLayoutTransformNodeAfter( graph, @@ -553,7 +554,7 @@ void AlterLayoutPass(Graph* graph) { new_src_layout, new_input_layouts[i], cinn::common::UniqName(new_input_data->id() + - "_layout_tranform")); + "_layout_transform")); UpdateInferInfos(new_trans_node, {shape_dict[new_input_data->id()]}, {input_types[i]}, @@ -577,7 +578,7 @@ void AlterLayoutPass(Graph* graph) { NodeData* output_data; Node* trans_node; VLOG(3) << source->id() - << " do layout_tranform from NCHW to NCHWxc"; + << " do layout_transform from NCHW to NCHWxc"; std::tie(trans_node, output_data) = InsertLayoutTransformNodeAfter( graph, @@ -587,7 +588,7 @@ void AlterLayoutPass(Graph* graph) { src_layout, new_input_layouts[i], cinn::common::UniqName(source->id() + - "_layout_tranform")); + "_layout_transform")); UpdateInferInfos(trans_node, {input_shapes[i]}, {input_types[i]}, @@ -611,7 +612,7 @@ void AlterLayoutPass(Graph* graph) { NodeData* output_data; Node* trans_node; VLOG(3) << source->id() - << " do layout_tranform from NCHWxc to NCHW"; + << " do 
layout_transform from NCHWxc to NCHW"; std::tie(trans_node, output_data) = InsertLayoutTransformNodeAfter( graph, @@ -621,7 +622,7 @@ void AlterLayoutPass(Graph* graph) { src_layout, new_input_layouts[i], cinn::common::UniqName(source->id() + - "_layout_tranform")); + "_layout_transform")); UpdateInferInfos(trans_node, {input_shapes[i]}, {input_types[i]}, @@ -709,7 +710,7 @@ void AlterLayoutPass(Graph* graph) { src_layout, dst_layout, cinn::common::UniqName(node->op()->name + - "_final_layout_tranform")); + "_final_layout_transform")); shape_dict[temp_out->id()] = shape; type_dict[temp_out->id()] = type; layout_dict[temp_out->id()] = src_layout; From 2304692225aa8fbdd309ad93d1a64761bd9f3b98 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 7 Mar 2024 11:01:07 +0800 Subject: [PATCH 043/114] Update broadcast.cc (#62462) * Update broadcast.cc * Fix --- paddle/cinn/hlir/op/broadcast.cc | 12 ++++++------ paddle/cinn/hlir/op/elementwise.cc | 10 +++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/paddle/cinn/hlir/op/broadcast.cc b/paddle/cinn/hlir/op/broadcast.cc index d6df20f1a60eb..c6c7ee00a9449 100644 --- a/paddle/cinn/hlir/op/broadcast.cc +++ b/paddle/cinn/hlir/op/broadcast.cc @@ -545,16 +545,16 @@ StrategyForBinary(logical_right_shift, LogicalRightShift); } // namespace cinn CINN_REGISTER_HELPER(broadcast_ops) { -#define CINN_REGISTER_BINARY(op__, op_stragegy__) \ +#define CINN_REGISTER_BINARY(op__, op_strategy__) \ CINN_REGISTER_OP(op__) \ .describe(#op__ " function") \ .set_num_inputs(1) \ .set_num_outputs(1) \ .set_attr( \ - "CINNStrategy", cinn::hlir::op::StrategyFor##op_stragegy__) \ + "CINNStrategy", cinn::hlir::op::StrategyFor##op_strategy__) \ .set_attr( \ "CINNStrategySymbolic", \ - cinn::hlir::op::StrategyFor##op_stragegy__##Symbolic) \ + cinn::hlir::op::StrategyFor##op_strategy__##Symbolic) \ .set_attr("infershape", \ MakeOpFunction(cinn::hlir::op::InferShapeForBroadcast)) \ .set_attr("inferdtype", \ @@ -567,16 +567,16 @@ CINN_REGISTER_HELPER(broadcast_ops) { "OpPattern", cinn::hlir::framework::OpPatternKind::kBroadcast) \ .set_support_level(4); -#define CINN_REGISTER_BINARY_CMP(op__, op_stragegy__) \ +#define CINN_REGISTER_BINARY_CMP(op__, op_strategy__) \ CINN_REGISTER_OP(op__) \ .describe(#op__ " function") \ .set_num_inputs(1) \ .set_num_outputs(1) \ .set_attr( \ - "CINNStrategy", cinn::hlir::op::StrategyFor##op_stragegy__) \ + "CINNStrategy", cinn::hlir::op::StrategyFor##op_strategy__) \ .set_attr( \ "CINNStrategySymbolic", \ - cinn::hlir::op::StrategyFor##op_stragegy__##Symbolic) \ + cinn::hlir::op::StrategyFor##op_strategy__##Symbolic) \ .set_attr("infershape", \ MakeOpFunction(cinn::hlir::op::InferShapeForBroadcast)) \ .set_attr("inferdtype", \ diff --git a/paddle/cinn/hlir/op/elementwise.cc b/paddle/cinn/hlir/op/elementwise.cc index e547b7833a75f..0f39d26b49d92 100644 --- a/paddle/cinn/hlir/op/elementwise.cc +++ b/paddle/cinn/hlir/op/elementwise.cc @@ -1326,16 +1326,16 @@ std::vector InferDtypeForLogicalNot(const std::vector &inputs_type, } // namespace cinn CINN_REGISTER_HELPER(elementwise_ops) { -#define CINN_REGISTER_UNARY(op__, op_stragegy__) \ +#define CINN_REGISTER_UNARY(op__, op_strategy__) \ CINN_REGISTER_OP(op__) \ .describe(#op__ " function") \ .set_num_inputs(1) \ .set_num_outputs(1) \ .set_attr( \ - "CINNStrategy", cinn::hlir::op::StrategyFor##op_stragegy__) \ + "CINNStrategy", cinn::hlir::op::StrategyFor##op_strategy__) \ .set_attr( \ "CINNStrategySymbolic", \ - cinn::hlir::op::StrategyFor##op_stragegy__##Symbolic) \ + 
cinn::hlir::op::StrategyFor##op_strategy__##Symbolic) \ .set_attr("infershape", \ MakeOpFunction(cinn::hlir::op::InferShapeForElementwise)) \ .set_attr("inferdtype", \ @@ -1385,13 +1385,13 @@ CINN_REGISTER_HELPER(elementwise_ops) { #undef CINN_REGISTER_UNARY -#define CINN_REGISTER_COMPARE(op__, op_stragegy__) \ +#define CINN_REGISTER_COMPARE(op__, op_strategy__) \ CINN_REGISTER_OP(op__) \ .describe(#op__ " function") \ .set_num_inputs(1) \ .set_num_outputs(1) \ .set_attr( \ - "CINNStrategy", cinn::hlir::op::StrategyFor##op_stragegy__) \ + "CINNStrategy", cinn::hlir::op::StrategyFor##op_strategy__) \ .set_attr("infershape", \ MakeOpFunction(cinn::hlir::op::InferShapeForElementwise)) \ .set_attr("inferdtype", \ From 2b7c7ff7fa2f221405a81a26447ad30b3c9b8164 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 7 Mar 2024 11:01:39 +0800 Subject: [PATCH 044/114] Fix fellowing following, etc (#62453) --- .../group_merge/check_infer_symbolic_pass.cc | 2 +- .../convert_dynamic_to_static_dim_pass.cc | 8 ++++---- .../convert_static_dim_to_dynamic_pass.cc | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.cc index 3ab2e8c7c7a3d..953e268b27a80 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/check_infer_symbolic_pass.cc @@ -118,7 +118,7 @@ void CompareStaticAndDynamicValueShape( std::vector> dynamic_value_shape = GetDynamicValueShape(value, shape_analysis); if (static_value_shape != dynamic_value_shape) { - VLOG(4) << "CheckInferSymbolic failed, in the fellowing program, the " + VLOG(4) << "CheckInferSymbolic failed, in the following program, the " << op_index << "th op : the shape is not equal\nthe static shape is: " << SprintShape(static_value_shape) << ", and the dynamic shape is: " diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_dynamic_to_static_dim_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_dynamic_to_static_dim_pass.cc index 21c5047c998c9..4a6458e8729b2 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_dynamic_to_static_dim_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_dynamic_to_static_dim_pass.cc @@ -32,7 +32,7 @@ PD_DECLARE_string(cinn_convert_dynamic_dim_to_static_dim); namespace { template -void ForEachRawDyanmicToStaticDimPair(const DoEachT& DoEach) { +void ForEachRawDynamicToStaticDimPair(const DoEachT& DoEach) { const std::string& env_var = FLAGS_cinn_convert_dynamic_dim_to_static_dim; size_t start = 0; while (true) { @@ -43,7 +43,7 @@ void ForEachRawDyanmicToStaticDimPair(const DoEachT& DoEach) { } } -std::optional> ParseRawDyanmicToStaticDimPair( +std::optional> ParseRawDynamicToStaticDimPair( const std::string& raw_pair) { size_t pos = raw_pair.find(":", 0); if (pos == std::string::npos) return std::nullopt; @@ -70,8 +70,8 @@ std::optional> ParseRawDyanmicToStaticDimPair( std::unordered_map GetDynamicToStaticDimFlag() { std::unordered_map map; - ForEachRawDyanmicToStaticDimPair([&](const std::string& raw_pair) { - if (auto pair = ParseRawDyanmicToStaticDimPair(raw_pair)) { + ForEachRawDynamicToStaticDimPair([&](const std::string& raw_pair) { + if (auto pair = ParseRawDynamicToStaticDimPair(raw_pair)) { map.insert(pair.value()); } }); diff --git 
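The two passes in this commit read their conversion table from a gflag string. Judging from the parsers above, each entry is an "lhs:rhs" pair split on ':' (for example a symbolic dim name paired with a constant extent), and entries appear to be concatenated in one flag value; the exact separator handling is elided in the hunks shown. A hedged sketch of the per-entry split, with an illustrative function name that is not a CINN API:

#include <iostream>
#include <optional>
#include <string>
#include <utility>

// Sketch only: splits one raw entry such as "S0:128" into its two halves,
// as ParseRawDynamicToStaticDimPair above does before converting the value.
std::optional<std::pair<std::string, std::string>> SplitPair(
    const std::string& raw_pair) {
  const auto pos = raw_pair.find(':');
  if (pos == std::string::npos) return std::nullopt;
  return std::make_pair(raw_pair.substr(0, pos), raw_pair.substr(pos + 1));
}

int main() {
  if (auto p = SplitPair("S0:128")) {  // hypothetical flag entry
    std::cout << p->first << " -> " << p->second << "\n";
  }
  return 0;
}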
a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_static_dim_to_dynamic_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_static_dim_to_dynamic_pass.cc index dd6c2d2e74905..c38aeb9c03070 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_static_dim_to_dynamic_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_static_dim_to_dynamic_pass.cc @@ -30,7 +30,7 @@ namespace cinn::dialect::ir { namespace { template -void ForEachRawStaticDimToDyanmicPair(const DoEachT& DoEach) { +void ForEachRawStaticDimToDynamicPair(const DoEachT& DoEach) { const std::string& env_var = FLAGS_cinn_convert_static_dim_to_dynamic_dim; size_t start = 0; while (true) { @@ -41,7 +41,7 @@ void ForEachRawStaticDimToDyanmicPair(const DoEachT& DoEach) { } } -std::optional> ParseRawStaticDimToDyanmicPair( +std::optional> ParseRawStaticDimToDynamicPair( const std::string& raw_pair) { size_t pos = raw_pair.find(":", 0); if (pos == std::string::npos) return std::nullopt; @@ -66,10 +66,10 @@ std::optional> ParseRawStaticDimToDyanmicPair( return std::pair{int64_t{constant}, symbol}; } -std::unordered_map GetStaticDimToDyanmicFromFlag() { +std::unordered_map GetStaticDimToDynamicFromFlag() { std::unordered_map map; - ForEachRawStaticDimToDyanmicPair([&](const std::string& raw_pair) { - if (auto pair = ParseRawStaticDimToDyanmicPair(raw_pair)) { + ForEachRawStaticDimToDynamicPair([&](const std::string& raw_pair) { + if (auto pair = ParseRawStaticDimToDynamicPair(raw_pair)) { map.insert(pair.value()); } }); @@ -81,7 +81,7 @@ using GlobalStaticDimToDynamicMapT = std::optional CalcGlobalStaticDimToDynamicMap() { std::unordered_map map = - GetStaticDimToDyanmicFromFlag(); + GetStaticDimToDynamicFromFlag(); if (map.empty()) return std::nullopt; auto DividedByOther = [&](int64_t constant) { for (const auto& [other_constant, _] : map) { From 1813177fd5fc2029301ef67f30008b1cc816bb55 Mon Sep 17 00:00:00 2001 From: co63oc Date: Thu, 7 Mar 2024 11:03:13 +0800 Subject: [PATCH 045/114] Fix uitls -> utils (#62496) --- .../interface/infer_symbolic_shape/cinn_op_infer_sym.cc | 2 +- .../interface/infer_symbolic_shape/infer_sym_slice_utils.h | 4 ++-- .../interface/infer_symbolic_shape/paddle_op_infer_sym.cc | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc index d5da282de676b..f55dc321cefec 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc @@ -209,7 +209,7 @@ bool SliceOpInferSymbolicShape(pir::Operation *op, shape_analysis->SetShapeOrDataForValue( op->result(0), - paddle::dialect::slice_uitls::SliceRawInferSymbolicShape( + paddle::dialect::slice_utils::SliceRawInferSymbolicShape( shape_analysis->GetShapeOrDataForValue(op->operand_source(0)), starts, ends, diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h index 4e6a026748196..860cca51bcc96 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_slice_utils.h @@ -16,7 +16,7 @@ #include 
"paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h" -namespace paddle::dialect::slice_uitls { +namespace paddle::dialect::slice_utils { inline ExprVec GetExprVecFromData(const ShapeOrData &shapeordata) { if (shapeordata.isa()) { @@ -188,4 +188,4 @@ inline ShapeOrData SliceRawInferSymbolicShape( return in_shapeordata.data().has_value() ? GetDataDimExprs() : GetShapeDimExprs(); } -} // namespace paddle::dialect::slice_uitls +} // namespace paddle::dialect::slice_utils diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index 9192478548d51..eaa25c5d73dde 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -202,8 +202,8 @@ bool SliceOpInferSymbolicShape(pir::Operation *op, std::vector axes_vec = details::GetVectorAttr(op, "axes"); // // Currently, we DO NOT support any element in `starts` is a Symbol. - ExprVec starts = slice_uitls::GetExprVecFromData(starts_shape_data); - ExprVec ends = slice_uitls::GetExprVecFromData(ends_shape_data); + ExprVec starts = slice_utils::GetExprVecFromData(starts_shape_data); + ExprVec ends = slice_utils::GetExprVecFromData(ends_shape_data); std::vector infer_flags = details::GetVectorAttr(op, "infer_flags"); @@ -212,7 +212,7 @@ bool SliceOpInferSymbolicShape(pir::Operation *op, shape_analysis->SetShapeOrDataForValue( res, - slice_uitls::SliceRawInferSymbolicShape(operand_shape_or_data, + slice_utils::SliceRawInferSymbolicShape(operand_shape_or_data, starts, ends, axes_vec, From 21f4074a2905b8a47a2543fa3c016c6dcf06b1e3 Mon Sep 17 00:00:00 2001 From: Omri Alon <34627614+omri-alon24@users.noreply.github.com> Date: Thu, 7 Mar 2024 05:08:41 +0200 Subject: [PATCH 046/114] Fix CWE 502 (#62345) * change pickle load behavior * remove * f * change to raise instead of print * fix * remove try catch --------- Co-authored-by: Omri Alon --- python/paddle/static/io.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index 3d3d4f30fa2d4..f4b61001a9fb6 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -142,6 +142,11 @@ def _clone_var_in_block(block, var): ) +def _safe_load_pickle(file, encoding="ASCII"): + load_dict = pickle.Unpickler(file, encoding=encoding).load() + return load_dict + + def prepend_feed_ops( inference_program, feed_target_names, feed_holder_name='feed' ): @@ -1697,7 +1702,7 @@ def set_var(var, ndarray): if sys.platform == 'darwin' and sys.version_info.major == 3: load_dict = _pickle_loads_mac(parameter_file_name, f) else: - load_dict = pickle.load(f, encoding='latin1') + load_dict = _safe_load_pickle(f, encoding='latin1') load_dict = _pack_loaded_dict(load_dict) for v in parameter_list: assert ( @@ -1721,7 +1726,7 @@ def set_var(var, ndarray): ) with open(opt_file_name, 'rb') as f: - load_dict = pickle.load(f, encoding='latin1') + load_dict = _safe_load_pickle(f, encoding='latin1') for v in optimizer_var_list: assert ( v.name in load_dict @@ -2015,13 +2020,13 @@ def _load_vars_with_try_catch( if sys.platform == 'darwin' and sys.version_info.major == 3: para_dict = _pickle_loads_mac(parameter_file_name, f) else: - para_dict = pickle.load(f, encoding='latin1') + para_dict = _safe_load_pickle(f, encoding='latin1') para_dict = 
_pack_loaded_dict(para_dict) opt_file_name = model_prefix + ".pdopt" if os.path.exists(opt_file_name): with open(opt_file_name, 'rb') as f: - opti_dict = pickle.load(f, encoding='latin1') + opti_dict = _safe_load_pickle(f, encoding='latin1') para_dict.update(opti_dict) From 88c79f1121bba6c8fe1a2a7000d17c94a5690e42 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Thu, 7 Mar 2024 11:18:08 +0800 Subject: [PATCH 047/114] [clang-tidy] NO.12 modernize-loop-convert (#61725) * clangtidy 12 * fix * fix * fix --- ...ete_remove_padding_recover_padding_pass.cc | 10 ++- paddle/fluid/framework/ir/quantize_helper.cc | 4 +- paddle/fluid/framework/program_desc.cc | 27 +++---- .../operator/utils/op_yaml_info_parser.cc | 10 +-- paddle/fluid/pir/transforms/inplace_pass.cc | 4 +- .../pir/transforms/pd_op_to_kernel_pass.cc | 6 +- .../profiler/dump/serialization_logger.cc | 24 +++--- .../dump/test_serialization_logger.cc | 76 +++++++++---------- paddle/fluid/platform/profiler/event_node.cc | 6 +- .../fluid/platform/profiler/event_python.cc | 26 +++---- paddle/fluid/pybind/eval_frame_tools.cc | 8 +- .../core/distributed/comm_context_manager.cc | 8 +- paddle/phi/infermeta/spmd_rules/reduction.cc | 6 +- paddle/phi/infermeta/spmd_rules/reshape.cc | 3 +- paddle/phi/infermeta/spmd_rules/slice.cc | 12 +-- paddle/phi/infermeta/spmd_rules/unsqueeze.cc | 12 +-- paddle/phi/kernels/stride/slice_kernel.cc | 3 +- .../kernels/stride/strided_slice_kernel.cc | 4 +- test/cpp/fluid/save_load_combine_op_test.cc | 4 +- test/cpp/fluid/save_load_op_test.cc | 6 +- 20 files changed, 124 insertions(+), 135 deletions(-) diff --git a/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc index 7cea0e9f30ce8..48332f10094fa 100644 --- a/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc +++ b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc @@ -66,14 +66,16 @@ void DeleteRemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph *graph) const { std::unordered_set del_node_set; bool delete_recover_padding = true; - for (size_t i = 0; i < recover_padding_out->outputs.size(); ++i) { + for (size_t i = 0; i < recover_padding_out->outputs.size(); + ++i) { // NOLINT if (recover_padding_out->outputs[i]->Name() == "remove_padding") { // op_node auto *remove_padding_out_node = - recover_padding_out->outputs[i]->outputs[0]; // var_node - auto *out_op_node = remove_padding_out_node->outputs[0]; // op_node + recover_padding_out->outputs[i]->outputs[0]; // NOLINT // var_node + auto *out_op_node = + remove_padding_out_node->outputs[0]; // NOLINT // op_node IR_NODE_LINK_TO(recover_padding_input, out_op_node); - del_node_set.insert(recover_padding_out->outputs[i]); + del_node_set.insert(recover_padding_out->outputs[i]); // NOLINT del_node_set.insert(remove_padding_out_node); out_op_node->Op()->RenameInput(remove_padding_out_node->Name(), recover_padding_input->Name()); diff --git a/paddle/fluid/framework/ir/quantize_helper.cc b/paddle/fluid/framework/ir/quantize_helper.cc index fa72f4caf4433..c4b06651f1bbb 100644 --- a/paddle/fluid/framework/ir/quantize_helper.cc +++ b/paddle/fluid/framework/ir/quantize_helper.cc @@ -27,8 +27,8 @@ void SaveQuantInfoInTheGraph( if (!graph->Has(flag)) { graph->Set(flag, new bool(true)); } - for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { - graph->Set(iter->first + suffix, new std::vector(iter->second)); + for (const auto& iter : info_map) { + graph->Set(iter.first + suffix, 
new std::vector(iter.second)); } } diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index baf50d275c89f..512cdd9b38769 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -78,8 +78,8 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { // record all block desc's ptr from origin program old_block_desc.emplace_back(o.blocks_[i].get()); } - for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) { - auto all_ops = blocks_[block_id]->AllOps(); + for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) { // NOLINT + auto all_ops = blocks_[block_id]->AllOps(); // NOLINT for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) { auto &op = all_ops[op_id]; @@ -92,7 +92,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { block_desc) != old_block_desc.end()) { // The block is owned by the origin program. Just use id to get // the corresponding block. - int sub_block_id = o.Block(block_id) + int sub_block_id = o.Block(block_id) // NOLINT .Op(static_cast(op_id)) ->GetBlockAttrId(attr_name); op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); @@ -103,7 +103,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { op->SetBlockAttr(attr_name, block_desc); } } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) { - std::vector sub_block_ids = o.Block(block_id) + std::vector sub_block_ids = o.Block(block_id) // NOLINT .Op(static_cast(op_id)) ->GetBlocksAttrIds(attr_name); std::vector block_descs; @@ -114,19 +114,20 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { } else if (op->GetAttrType(attr_name, true) == proto::AttrType::VAR) { VarDesc *var_desc = PADDLE_GET_CONST(VarDesc *, op->GetAttr(attr_name, true)); - op->SetVarAttr(attr_name, - o.Block(block_id).FindVarRecursive(var_desc->Name())); + op->SetVarAttr( + attr_name, + o.Block(block_id).FindVarRecursive(var_desc->Name())); // NOLINT } else if (op->GetAttrType(attr_name, true) == proto::AttrType::VARS) { std::vector vars_desc = PADDLE_GET_CONST( std::vector, op->GetAttr(attr_name, true)); std::vector new_vars_desc; - std::transform( - vars_desc.begin(), - vars_desc.end(), - std::back_inserter(new_vars_desc), - [&](VarDesc *var_desc) { - return o.Block(block_id).FindVarRecursive(var_desc->Name()); - }); + std::transform(vars_desc.begin(), + vars_desc.end(), + std::back_inserter(new_vars_desc), + [&](VarDesc *var_desc) { + return o.Block(block_id).FindVarRecursive( + var_desc->Name()); // NOLINT + }); op->SetVarsAttr(attr_name, new_vars_desc); } } diff --git a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc index 41140053a22f0..aeecd67bcf920 100644 --- a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc +++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc @@ -153,8 +153,8 @@ std::unordered_map OpYamlInfoParser::GetInplaceIdMap() bool OpYamlInfoParser::HasView(const std::string& out_name) const { auto& view_info = std::get<3>(op_info_tuple_).view; - for (size_t i = 0; i < view_info.size(); i++) { - if (out_name == view_info[i].first) { + for (const auto& i : view_info) { + if (out_name == i.first) { return true; } } @@ -164,9 +164,9 @@ bool OpYamlInfoParser::HasView(const std::string& out_name) const { const std::string& OpYamlInfoParser::ViewName( const std::string& out_name) const { auto& view_info = std::get<3>(op_info_tuple_).view; - for (size_t i = 0; i < view_info.size(); i++) { - if (out_name == view_info[i].first) 
{ - return view_info[i].second; + for (const auto& i : view_info) { + if (out_name == i.first) { + return i.second; } } PADDLE_THROW(phi::errors::PreconditionNotMet( diff --git a/paddle/fluid/pir/transforms/inplace_pass.cc b/paddle/fluid/pir/transforms/inplace_pass.cc index b5574685bd113..5c9905a6bf75b 100644 --- a/paddle/fluid/pir/transforms/inplace_pass.cc +++ b/paddle/fluid/pir/transforms/inplace_pass.cc @@ -184,8 +184,8 @@ bool IsNoNeedBuffer(pir::Operation* op, pir::Value value) { info_interface->get_op_info_(op_name), paddle::dialect::IsLegacyOp(op_name)); auto& no_need_buffer_ids = info_parser.NoNeedBufferIds(); - for (size_t id = 0; id < no_need_buffer_ids.size(); id++) { - if (value == op->operand_source(no_need_buffer_ids[id])) { + for (auto no_need_buffer_id : no_need_buffer_ids) { + if (value == op->operand_source(no_need_buffer_id)) { return true; } } diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index c05e5de0daafa..53f259807fc38 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -643,8 +643,7 @@ static phi::DataType GetKernelDtypeByYaml( auto& data_type_info = op_info_parser->OpRuntimeInfo().kernel_key_dtype; phi::DataType kernel_data_type = phi::DataType::UNDEFINED; - for (size_t i = 0; i < data_type_info.size(); ++i) { - auto slot_name = data_type_info[i]; + for (auto slot_name : data_type_info) { auto& input_map = op_info_parser->InputName2Id(); bool is_complex_tag = false; @@ -729,8 +728,7 @@ static phi::Backend GetKernelBackendByYaml( auto& backend_info = op_info_parser->OpRuntimeInfo().kernel_key_backend; phi::Backend kernel_backend = phi::Backend::UNDEFINED; - for (size_t i = 0; i < backend_info.size(); ++i) { - auto slot_name = backend_info[i]; + for (auto slot_name : backend_info) { auto& input_map = op_info_parser->InputName2Id(); if (input_map.count(slot_name)) { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index 17c3d42ec5e86..e7889a6727199 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -103,37 +103,33 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { current_thread_node_tree_proto_ = node_trees_proto_->add_thread_trees(); // add ThreadNodeTreeProto current_thread_node_tree_proto_->set_thread_id(event_node.first); - for (auto hostnode = event_node.second.begin(); - hostnode != event_node.second.end(); - ++hostnode) { + for (auto hostnode : event_node.second) { HostTraceEventNodeProto* host_node_proto = current_thread_node_tree_proto_ ->add_host_nodes(); // add HostTraceEventNodeProto - host_node_proto->set_id(node_index_map[(*hostnode)]); - host_node_proto->set_parentid(node_parent_map[(*hostnode)]); + host_node_proto->set_id(node_index_map[hostnode]); + host_node_proto->set_parentid(node_parent_map[hostnode]); current_host_trace_event_node_proto_ = - host_node_proto; // set current HostTraceEventNodeProto - (*hostnode)->LogMe(this); // fill detail information + host_node_proto; // set current HostTraceEventNodeProto + hostnode->LogMe(this); // fill detail information - for (auto runtimenode : (*hostnode)->GetRuntimeTraceEventNodes()) { + for (auto runtimenode : hostnode->GetRuntimeTraceEventNodes()) { CudaRuntimeTraceEventNodeProto* runtime_node_proto = current_host_trace_event_node_proto_ ->add_runtime_nodes(); 
// add CudaRuntimeTraceEventNodeProto current_runtime_trace_event_node_proto_ = runtime_node_proto; // set current CudaRuntimeTraceEventNodeProto runtimenode->LogMe(this); // fill detail information - for (auto devicenode = runtimenode->GetDeviceTraceEventNodes().begin(); - devicenode != runtimenode->GetDeviceTraceEventNodes().end(); - ++devicenode) { + for (auto devicenode : runtimenode->GetDeviceTraceEventNodes()) { DeviceTraceEventNodeProto* device_node_proto = current_runtime_trace_event_node_proto_ ->add_device_nodes(); // add DeviceTraceEventNodeProto current_device_trace_event_node_proto_ = - device_node_proto; // set current DeviceTraceEventNodeProto - (*devicenode)->LogMe(this); // fill detail information + device_node_proto; // set current DeviceTraceEventNodeProto + devicenode->LogMe(this); // fill detail information } } - for (auto memnode : (*hostnode)->GetMemTraceEventNodes()) { + for (auto memnode : hostnode->GetMemTraceEventNodes()) { MemTraceEventNodeProto* mem_node_proto = current_host_trace_event_node_proto_->add_mem_nodes(); current_mem_trace_event_node_proto_ = mem_node_proto; diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index bc9407684bcd8..4872d7bb42353 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -152,21 +152,21 @@ TEST(SerializationLoggerTest, dump_case0) { EXPECT_EQ(nodes[11].size(), 2u); std::vector thread1_nodes = nodes[10]; std::vector thread2_nodes = nodes[11]; - for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { - if ((*it)->Name() == "root node") { - EXPECT_EQ((*it)->GetChildren().size(), 3u); + for (auto& thread1_node : thread1_nodes) { + if (thread1_node->Name() == "root node") { + EXPECT_EQ(thread1_node->GetChildren().size(), 3u); } - if ((*it)->Name() == "op1") { - EXPECT_EQ((*it)->GetChildren().size(), 0u); - EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); - EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); - EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); + if (thread1_node->Name() == "op1") { + EXPECT_EQ(thread1_node->GetChildren().size(), 0u); + EXPECT_EQ(thread1_node->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ(thread1_node->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE(thread1_node->GetOperatorSupplementEventNode(), nullptr); } } - for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { - if ((*it)->Name() == "op3") { - EXPECT_EQ((*it)->GetChildren().size(), 0u); - EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + for (auto& thread2_node : thread2_nodes) { + if (thread2_node->Name() == "op3") { + EXPECT_EQ(thread2_node->GetChildren().size(), 0u); + EXPECT_EQ(thread2_node->GetRuntimeTraceEventNodes().size(), 2u); } } tree.LogMe(&logger); @@ -247,15 +247,15 @@ TEST(SerializationLoggerTest, dump_case1) { EXPECT_EQ(nodes[11].size(), 1u); std::vector thread1_nodes = nodes[10]; std::vector thread2_nodes = nodes[11]; - for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { - if ((*it)->Name() == "root node") { - EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u); + for (auto& thread1_node : thread1_nodes) { + if (thread1_node->Name() == "root node") { + EXPECT_EQ(thread1_node->GetRuntimeTraceEventNodes().size(), 3u); } } - for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { - if ((*it)->Name() == "root node") { - 
EXPECT_EQ((*it)->GetChildren().size(), 0u); - EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + for (auto& thread2_node : thread2_nodes) { + if (thread2_node->Name() == "root node") { + EXPECT_EQ(thread2_node->GetChildren().size(), 0u); + EXPECT_EQ(thread2_node->GetRuntimeTraceEventNodes().size(), 2u); } } tree.LogMe(&logger); @@ -272,21 +272,21 @@ TEST(DeserializationReaderTest, restore_case0) { EXPECT_EQ(nodes[11].size(), 2u); std::vector thread1_nodes = nodes[10]; std::vector thread2_nodes = nodes[11]; - for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { - if ((*it)->Name() == "root node") { - EXPECT_EQ((*it)->GetChildren().size(), 3u); + for (auto& thread1_node : thread1_nodes) { + if (thread1_node->Name() == "root node") { + EXPECT_EQ(thread1_node->GetChildren().size(), 3u); } - if ((*it)->Name() == "op1") { - EXPECT_EQ((*it)->GetChildren().size(), 0u); - EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); - EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); - EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); + if (thread1_node->Name() == "op1") { + EXPECT_EQ(thread1_node->GetChildren().size(), 0u); + EXPECT_EQ(thread1_node->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ(thread1_node->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE(thread1_node->GetOperatorSupplementEventNode(), nullptr); } } - for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { - if ((*it)->Name() == "op3") { - EXPECT_EQ((*it)->GetChildren().size(), 0u); - EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + for (auto& thread2_node : thread2_nodes) { + if (thread2_node->Name() == "op3") { + EXPECT_EQ(thread2_node->GetChildren().size(), 0u); + EXPECT_EQ(thread2_node->GetRuntimeTraceEventNodes().size(), 2u); } } } @@ -301,15 +301,15 @@ TEST(DeserializationReaderTest, restore_case1) { EXPECT_EQ(nodes[11].size(), 1u); std::vector thread1_nodes = nodes[10]; std::vector thread2_nodes = nodes[11]; - for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { - if ((*it)->Name() == "root node") { - EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u); + for (auto& thread1_node : thread1_nodes) { + if (thread1_node->Name() == "root node") { + EXPECT_EQ(thread1_node->GetRuntimeTraceEventNodes().size(), 3u); } } - for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { - if ((*it)->Name() == "root node") { - EXPECT_EQ((*it)->GetChildren().size(), 0u); - EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + for (auto& thread2_node : thread2_nodes) { + if (thread2_node->Name() == "root node") { + EXPECT_EQ(thread2_node->GetChildren().size(), 0u); + EXPECT_EQ(thread2_node->GetRuntimeTraceEventNodes().size(), 2u); } } } diff --git a/paddle/fluid/platform/profiler/event_node.cc b/paddle/fluid/platform/profiler/event_node.cc index c92ae133814f3..3c37dbf39fef4 100644 --- a/paddle/fluid/platform/profiler/event_node.cc +++ b/paddle/fluid/platform/profiler/event_node.cc @@ -434,10 +434,8 @@ void NodeTrees::HandleTrees( } for (auto event_node : (*hostnode)->GetRuntimeTraceEventNodes()) { runtime_event_node_handle(event_node); - for (auto devicenode = event_node->GetDeviceTraceEventNodes().begin(); - devicenode != event_node->GetDeviceTraceEventNodes().end(); - ++devicenode) { - device_event_node_handle(*devicenode); + for (auto devicenode : event_node->GetDeviceTraceEventNodes()) { + device_event_node_handle(devicenode); } } for (auto event_node : (*hostnode)->GetMemTraceEventNodes()) { diff --git 
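The loop rewrites in this commit all follow the same modernize-loop-convert pattern: traversal of a single container becomes a range-based for, while loops that index two containers in lockstep keep their counter and are marked "// NOLINT" to suppress the check. A small self-contained sketch with illustrative names:

#include <cstddef>
#include <vector>

// Sketch only: the two loop shapes clang-tidy's modernize-loop-convert
// distinguishes in this commit. Assumes expect and actual have equal length.
long CompareSums(const std::vector<int>& nodes,
                 const std::vector<int>& expect,
                 const std::vector<int>& actual) {
  long sum = 0;
  // Single-container traversal: the iterator/index form becomes range-for,
  // replacing (*it) or nodes[i] with a named element.
  for (const auto& node : nodes) {
    sum += node;
  }
  // Lockstep indexing into two containers has no range-for equivalent,
  // so the index loop stays and is annotated to silence the check.
  for (std::size_t i = 0; i < expect.size(); ++i) {  // NOLINT
    sum += expect[i] - actual[i];
  }
  return sum;
}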
a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index c01b4abcfbbd3..551cdd2182323 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -63,20 +63,18 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { runtime_python_node->correlation_id = runtimenode->CorrelationId(); host_python_node->runtime_node_ptrs.push_back(runtime_python_node); // copy DeviceTraceEventNode - for (auto devicenode = runtimenode->GetDeviceTraceEventNodes().begin(); - devicenode != runtimenode->GetDeviceTraceEventNodes().end(); - ++devicenode) { + for (auto devicenode : runtimenode->GetDeviceTraceEventNodes()) { DevicePythonNode* device_python_node = new DevicePythonNode(); - device_python_node->name = (*devicenode)->Name(); - device_python_node->type = (*devicenode)->Type(); - device_python_node->start_ns = (*devicenode)->StartNs(); - device_python_node->end_ns = (*devicenode)->EndNs(); - device_python_node->device_id = (*devicenode)->DeviceId(); - device_python_node->context_id = (*devicenode)->ContextId(); - device_python_node->stream_id = (*devicenode)->StreamId(); - device_python_node->correlation_id = (*devicenode)->CorrelationId(); + device_python_node->name = devicenode->Name(); + device_python_node->type = devicenode->Type(); + device_python_node->start_ns = devicenode->StartNs(); + device_python_node->end_ns = devicenode->EndNs(); + device_python_node->device_id = devicenode->DeviceId(); + device_python_node->context_id = devicenode->ContextId(); + device_python_node->stream_id = devicenode->StreamId(); + device_python_node->correlation_id = devicenode->CorrelationId(); if (device_python_node->type == TracerEventType::Kernel) { - KernelEventInfo kernel_info = (*devicenode)->KernelInfo(); + KernelEventInfo kernel_info = devicenode->KernelInfo(); device_python_node->block_x = kernel_info.block_x; device_python_node->block_y = kernel_info.block_y; device_python_node->block_z = kernel_info.block_z; @@ -91,10 +89,10 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { device_python_node->warps_per_sm = kernel_info.warps_per_sm; device_python_node->occupancy = kernel_info.occupancy; } else if (device_python_node->type == TracerEventType::Memcpy) { - MemcpyEventInfo memcpy_info = (*devicenode)->MemcpyInfo(); + MemcpyEventInfo memcpy_info = devicenode->MemcpyInfo(); device_python_node->num_bytes = memcpy_info.num_bytes; } else if (device_python_node->type == TracerEventType::Memset) { - MemsetEventInfo memset_info = (*devicenode)->MemsetInfo(); + MemsetEventInfo memset_info = devicenode->MemsetInfo(); device_python_node->num_bytes = memset_info.num_bytes; device_python_node->value = memset_info.value; } diff --git a/paddle/fluid/pybind/eval_frame_tools.cc b/paddle/fluid/pybind/eval_frame_tools.cc index 504dbc5b9fa01..f0209f90610ee 100644 --- a/paddle/fluid/pybind/eval_frame_tools.cc +++ b/paddle/fluid/pybind/eval_frame_tools.cc @@ -38,8 +38,8 @@ class TreeNode { }; void TreeNode::clear() { - for (int i = 0; i < 256; i++) { - if (children[i] != nullptr) delete children[i]; + for (auto& i : children) { + if (i != nullptr) delete i; } } @@ -200,8 +200,8 @@ void CodeStatus::add_with_graph_code(PyCodeObject* code) { } void CodeStatus::clear() { - for (auto iter = code_map.begin(); iter != code_map.end(); iter++) { - delete iter->second; + for (auto& iter : code_map) { + delete iter.second; } code_map.clear(); } diff --git 
a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc index 01ffd15f79d28..9e3be85222c61 100644 --- a/paddle/phi/core/distributed/comm_context_manager.cc +++ b/paddle/phi/core/distributed/comm_context_manager.cc @@ -234,12 +234,10 @@ CommContext* CommContextManager::Get(const std::string& unique_comm_key) const { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int CommContextManager::GetRingId(const ncclComm_t& comm) const { - for (auto iter = id_to_comm_context_.begin(); - iter != id_to_comm_context_.end(); - ++iter) { - if (static_cast(iter->second.get()) + for (const auto& iter : id_to_comm_context_) { + if (static_cast(iter.second.get()) ->GetNcclComm() == comm) { - return std::stoi(iter->first); + return std::stoi(iter.first); } } return -1; diff --git a/paddle/phi/infermeta/spmd_rules/reduction.cc b/paddle/phi/infermeta/spmd_rules/reduction.cc index ef5d93a04533e..96e9230fb9182 100644 --- a/paddle/phi/infermeta/spmd_rules/reduction.cc +++ b/paddle/phi/infermeta/spmd_rules/reduction.cc @@ -238,9 +238,9 @@ SpmdInfo ReductionGradInferSpmd(const DistMetaTensor& x, auto dims_mapping = x_dist_attr.dims_mapping(); auto axis_value = axis.GetData(); - for (size_t i = 0; i < axis_value.size(); ++i) { - if (axis_value[i] < 0) { - axis_value[i] += x_dim.size(); // NOLINT + for (auto& i : axis_value) { + if (i < 0) { + i += x_dim.size(); } } std::sort(axis_value.begin(), axis_value.end()); diff --git a/paddle/phi/infermeta/spmd_rules/reshape.cc b/paddle/phi/infermeta/spmd_rules/reshape.cc index 2e8d79e14bf49..9ca886f0dc637 100644 --- a/paddle/phi/infermeta/spmd_rules/reshape.cc +++ b/paddle/phi/infermeta/spmd_rules/reshape.cc @@ -122,8 +122,7 @@ std::vector> MakeReshapeDimTrans( if (!tgt_splitted_shape.empty()) { std::vector> input_dims; - for (int i = 0, n = static_cast(src_dims.size()); i < n; i++) { - int64_t in_dim = src_dims[i]; + for (auto in_dim : src_dims) { if (src_shape[in_dim] > 1) { input_dims.emplace_back(std::make_shared(in_dim)); } diff --git a/paddle/phi/infermeta/spmd_rules/slice.cc b/paddle/phi/infermeta/spmd_rules/slice.cc index 3615e57340a0d..9daed3ce8c764 100644 --- a/paddle/phi/infermeta/spmd_rules/slice.cc +++ b/paddle/phi/infermeta/spmd_rules/slice.cc @@ -77,8 +77,8 @@ SpmdInfo SliceInferSpmdBase(const DistMetaTensor& input, // cannot be sharded, if it is sharded, set it to replicated. TensorDistAttr input_dist_attr_dst = CopyTensorDistAttrForOutput(input_dist_attr_src); - for (int i = 0; i < static_cast(axes.size()); i++) { - int axis = axes[i] < 0 ? axes[i] + input_ndim : axes[i]; // NOLINT + for (auto axe : axes) { + int axis = axe < 0 ? axe + input_ndim : axe; input_dims_mapping[axis] = -1; } input_dist_attr_dst.set_dims_mapping(input_dims_mapping); @@ -164,8 +164,8 @@ SpmdInfo SliceInferSpmdReverseBase(const DistMetaTensor& input, out_axes[i] = input_axes[input_axis]; } - for (int i = 0; i < static_cast(axes.size()); i++) { - int axis = axes[i] < 0 ? axes[i] + input_ndim : axes[i]; // NOLINT + for (auto axe : axes) { + int axis = axe < 0 ? axe + input_ndim : axe; // the sliced axis cannot be sharded, set its notation // with the special '1' to set its dim mapping to -1. input_axes[axis] = '1'; @@ -190,8 +190,8 @@ SpmdInfo SliceInferSpmdReverseBase(const DistMetaTensor& input, // step2.3 get new dist attribute for output. the sliced // cannot be sharded, if it is sharded, set it to replicated. 
out_dims_mapping = GetDimsMappingForAxes(out_axes, axis_to_dim_map, true); - for (int i = 0; i < static_cast(axes.size()); i++) { - int axis = axes[i] < 0 ? axes[i] + input_ndim : axes[i]; + for (auto axe : axes) { + int axis = axe < 0 ? axe + input_ndim : axe; out_dims_mapping[axis] = -1; } auto out_dist_attr_dst = CopyTensorDistAttrForOutput(out_dist_attr); diff --git a/paddle/phi/infermeta/spmd_rules/unsqueeze.cc b/paddle/phi/infermeta/spmd_rules/unsqueeze.cc index 5521e1ba2a137..f7e16d4bb33da 100644 --- a/paddle/phi/infermeta/spmd_rules/unsqueeze.cc +++ b/paddle/phi/infermeta/spmd_rules/unsqueeze.cc @@ -110,9 +110,9 @@ SpmdInfo UnsqueezeInferSpmd(const DistMetaTensor& x, std::vector out_shape; std::vector axis_copy(axis); - for (int64_t i = 0; i < static_cast(axis_copy.size()); i++) { - if (axis_copy[i] < 0) { - axis_copy[i] += x_ndim + 1; + for (auto& i : axis_copy) { + if (i < 0) { + i += x_ndim + 1; } } @@ -183,9 +183,9 @@ SpmdInfo UnsqueezeInferSpmdReverse(const DistMetaTensor& x, std::vector axis_copy(axis); - for (int64_t i = 0; i < static_cast(axis_copy.size()); i++) { - if (axis_copy[i] < 0) { - axis_copy[i] += x_ndim + 1; + for (auto& i : axis_copy) { + if (i < 0) { + i += x_ndim + 1; } } diff --git a/paddle/phi/kernels/stride/slice_kernel.cc b/paddle/phi/kernels/stride/slice_kernel.cc index 3e21360ce09d0..132fb30c314aa 100644 --- a/paddle/phi/kernels/stride/slice_kernel.cc +++ b/paddle/phi/kernels/stride/slice_kernel.cc @@ -59,8 +59,7 @@ void SliceStridedKernel(const Context& ctx, std::vector decrease_flag(output_dims.size(), 0); if (!decrease_axis.empty()) { - for (int i = 0; i < static_cast(decrease_axis.size()); ++i) { - int64_t axis = decrease_axis[i]; + for (auto axis : decrease_axis) { decrease_flag[axis] = 1; } diff --git a/paddle/phi/kernels/stride/strided_slice_kernel.cc b/paddle/phi/kernels/stride/strided_slice_kernel.cc index f3b36565def3e..e40a094573ab1 100644 --- a/paddle/phi/kernels/stride/strided_slice_kernel.cc +++ b/paddle/phi/kernels/stride/strided_slice_kernel.cc @@ -93,8 +93,8 @@ void StridedSliceRawStridedKernel(const Context& dev_ctx, if (!decrease_axis.empty()) { std::vector new_out_shape; std::vector new_out_stride; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - output_dims[decrease_axis[i]] = 0; + for (auto de_axis : decrease_axis) { + output_dims[de_axis] = 0; } for (size_t i = 0; i < output_dims.size(); ++i) { diff --git a/test/cpp/fluid/save_load_combine_op_test.cc b/test/cpp/fluid/save_load_combine_op_test.cc index f97409d6535ab..a559ed077cb62 100644 --- a/test/cpp/fluid/save_load_combine_op_test.cc +++ b/test/cpp/fluid/save_load_combine_op_test.cc @@ -72,7 +72,7 @@ void CheckValues(T* expect, EXPECT_EQ(expect[i], static_cast(actual[i])); } EXPECT_EQ(expect_lod.size(), actual_lod.size()); - for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t i = 0; i < expect_lod.size(); ++i) { // NOLINT for (size_t j = 0; j < expect_lod[i].size(); ++j) { EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); } @@ -362,7 +362,7 @@ TEST(SaveLoadTestWithCombineOp, CPU) { } auto& actual_lod = target->lod(); EXPECT_EQ(expect_lod.size(), actual_lod.size()); - for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t i = 0; i < expect_lod.size(); ++i) { // NOLINT for (size_t j = 0; j < expect_lod[i].size(); ++j) { EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); } diff --git a/test/cpp/fluid/save_load_op_test.cc b/test/cpp/fluid/save_load_op_test.cc index 5ddb0afb03616..abd7548f81e6f 100644 --- a/test/cpp/fluid/save_load_op_test.cc +++ 
b/test/cpp/fluid/save_load_op_test.cc @@ -58,7 +58,7 @@ TEST(SaveLoadOp, CPU) { } auto& actual_lod = target->lod(); EXPECT_EQ(expect_lod.size(), actual_lod.size()); - for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t i = 0; i < expect_lod.size(); ++i) { // NOLINT for (size_t j = 0; j < expect_lod[i].size(); ++j) { EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); } @@ -141,7 +141,7 @@ TEST(SaveFP16Op, CPU) { } auto& actual_lod = target->lod(); EXPECT_EQ(expect_lod.size(), actual_lod.size()); - for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t i = 0; i < expect_lod.size(); ++i) { // NOLINT for (size_t j = 0; j < expect_lod[i].size(); ++j) { EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); } @@ -191,7 +191,7 @@ TEST(LoadFP16Op, CPU) { auto& actual_lod = target.lod(); EXPECT_EQ(expect_lod.size(), actual_lod.size()); - for (size_t i = 0; i < expect_lod.size(); ++i) { + for (size_t i = 0; i < expect_lod.size(); ++i) { // NOLINT for (size_t j = 0; j < expect_lod[i].size(); ++j) { EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]); } From 3cb3f4dbdea8457a48b535524b98ba8fceb953f6 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Thu, 7 Mar 2024 11:33:46 +0800 Subject: [PATCH 048/114] [PIR] Remove duplicate error message in executor log warning (#62479) --- paddle/fluid/framework/new_executor/pir_interpreter.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index 52608af201d1e..3e5f491986971 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -1789,13 +1789,13 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { framework::InsertCallStackInfo(op->name(), op_callstack_attr, &ex); LOG(WARNING) << " OP id:" << instr_node->Id() << " " << instr_node->Name() << " raises an EnforceNotMet exception " - << platform::demangle(typeid(ex).name()) << ", " << ex.what(); + << platform::demangle(typeid(ex).name()); exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); } catch (platform::EOFException&) { exception_holder_.Catch(std::current_exception()); } catch (std::exception& ex) { LOG(WARNING) << instr_node->Name() << " raises an exception " - << platform::demangle(typeid(ex).name()) << ", " << ex.what(); + << platform::demangle(typeid(ex).name()); exception_holder_.Catch(std::current_exception()); } catch (...) 
{ LOG(WARNING) << instr_node->Name() << " raises an unknown exception"; From b90de4d2596b954cfbc43df012fd01e360ebe049 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 7 Mar 2024 12:14:47 +0800 Subject: [PATCH 049/114] [PIR] pir onednn support conv2d_transpose (#61165) * pir onednn support conv2d_transpose --- .../fluid/inference/api/analysis_predictor.cc | 4 + paddle/fluid/pir/dialect/operator/ir/ops.yaml | 10 + .../dialect/operator/ir/ops_onednn_extra.yaml | 8 + .../fluid/pir/drr/src/ir_operation_factory.cc | 111 +++++++++++ .../transforms/onednn/conv_bias_fuse_pass.cc | 186 ++++++++++++++++-- .../test_convtranspose_bias_fuse_pass.py | 163 +++++++++++++++ .../test_conv2d_transpose_bf16_mkldnn_op.py | 2 +- 7 files changed, 466 insertions(+), 18 deletions(-) create mode 100644 test/ir/pir/fused_pass/onednn/test_convtranspose_bias_fuse_pass.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 08e3193ce4365..ef576b3527c3b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -80,6 +80,7 @@ #ifdef PADDLE_WITH_DNNL #include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#include "paddle/fluid/pir/transforms/onednn/batch_norm_act_fuse_pass.h" #include "paddle/fluid/pir/transforms/onednn/conv_bias_fuse_pass.h" #endif @@ -979,6 +980,9 @@ bool AnalysisPredictor::PrepareExecutor() { ::pir::PassManager mkldnn_pm(::pir::IrContext::Instance(), 2); mkldnn_pm.AddPass(::pir::CreateConv2dBiasFusePass()); + mkldnn_pm.AddPass(::pir::CreateConv2dTransposeBiasFusePass()); + mkldnn_pm.AddPass(::pir::CreateConv3dBiasFusePass()); + mkldnn_pm.AddPass(::pir::CreateBatchNormActFusePass()); auto constant_folding_pass = ::pir::CreateConstantFoldingPass(); constant_folding_pass->SetNotOwned(pir::kPlaceAttr, &place_); diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml index 616695fad5149..9cc328dbe24fb 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -355,6 +355,16 @@ data_type : x backward : conv2d_transpose_grad +- op : conv2d_transpose_bias + args : (Tensor x, Tensor filter, Tensor bias, int[] strides={1, 1}, int[] paddings={0, 0}, int[] output_padding={}, IntArray output_size={}, str padding_algorithm="EXPLICIT", int groups=1, int[] dilations={1, 1}, str data_format="NCHW") + output : Tensor(out) + infer_meta : + func : Conv2dTransposeInferMeta + param: [x, filter, strides, paddings, output_padding, output_size, padding_algorithm, groups, dilations, data_format] + kernel : + func : conv2d_transpose_bias + data_type : x + - op : copy_to args : (Tensor x, Place place, bool blocking) output : Tensor(out) diff --git a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml index 2e16dfce8cacf..f13b066d335be 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops_onednn_extra.yaml @@ -52,6 +52,14 @@ extra_args : bool is_test=false data_format_tensors : input, out_grad +- op : conv2d_transpose + extra_args : bool is_test=false + data_format_tensors : x + +- op : conv2d_transpose_bias + extra_args : bool is_test=false, bool force_fp32_output = false, str mkldnn_data_type = "float32", bool fuse_relu = false, str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f + data_format_tensors : x + - op : conv3d extra_args : bool 
is_test=false data_format_tensors : input diff --git a/paddle/fluid/pir/drr/src/ir_operation_factory.cc b/paddle/fluid/pir/drr/src/ir_operation_factory.cc index bfe97d45592f7..de796c50e67d3 100644 --- a/paddle/fluid/pir/drr/src/ir_operation_factory.cc +++ b/paddle/fluid/pir/drr/src/ir_operation_factory.cc @@ -23,6 +23,9 @@ #include "paddle/pir/include/core/builtin_op.h" #include "paddle/pir/include/core/operation.h" #include "paddle/pir/include/core/value.h" +#ifdef PADDLE_WITH_DNNL +#include "paddle/fluid/pir/dialect/operator/ir/onednn_op.h" +#endif namespace paddle { namespace drr { @@ -61,6 +64,114 @@ void OperationFactory::RegisterManualOpCreator() { attrs.at("bias").dyn_cast().data(), attrs.at("bias_after_scale").dyn_cast().data()); }); + +#ifdef PADDLE_WITH_DNNL + op_creator_map["onednn_op.conv2d_transpose_bias"] = + [](const std::vector& inputs, + const pir::AttributeMap& attrs, + pir::PatternRewriter& rewriter) { + if (inputs.size() == 4) { + IR_ENFORCE( + attrs.find("strides") != attrs.end(), + "'strides' Attribute is expected for Conv2dTransposeBiasOp. "); + std::vector strides; + for (size_t i = 0; + i < attrs.at("strides").dyn_cast().size(); + i++) { + strides.push_back(attrs.at("strides") + .dyn_cast() + .at(i) + .dyn_cast() + .data()); + } + + IR_ENFORCE( + attrs.find("paddings") != attrs.end(), + "'paddings' Attribute is expected for Conv2dTransposeBiasOp. "); + std::vector paddings; + for (size_t i = 0; + i < attrs.at("paddings").dyn_cast().size(); + i++) { + paddings.push_back(attrs.at("paddings") + .dyn_cast() + .at(i) + .dyn_cast() + .data()); + } + + IR_ENFORCE(attrs.find("output_padding") != attrs.end(), + "'output_padding' Attribute is expected for " + "Conv2dTransposeBiasOp. "); + std::vector output_padding; + for (size_t i = 0; i < attrs.at("output_padding") + .dyn_cast() + .size(); + i++) { + output_padding.push_back(attrs.at("output_padding") + .dyn_cast() + .at(i) + .dyn_cast() + .data()); + } + + IR_ENFORCE(attrs.find("padding_algorithm") != attrs.end(), + "'padding_algorithm' Attribute is expected for " + "Conv2dTransposeBiasOp. "); + std::string padding_algorithm = attrs.at("padding_algorithm") + .dyn_cast() + .AsString(); + + IR_ENFORCE( + attrs.find("groups") != attrs.end(), + "'groups' Attribute is expected for Conv2dTransposeBiasOp. "); + int groups = + attrs.at("groups").dyn_cast().data(); + + IR_ENFORCE( + attrs.find("dilations") != attrs.end(), + "'dilations' Attribute is expected for Conv2dTransposeBiasOp. "); + std::vector dilations; + for (size_t i = 0; + i < attrs.at("dilations").dyn_cast().size(); + i++) { + dilations.push_back(attrs.at("dilations") + .dyn_cast() + .at(i) + .dyn_cast() + .data()); + } + + IR_ENFORCE(attrs.find("data_format") != attrs.end(), + "'data_format' Attribute is expected for " + "Conv2dTransposeBiasOp. "); + std::string data_format = + attrs.at("data_format").dyn_cast().AsString(); + + IR_ENFORCE( + attrs.find("is_test") != attrs.end(), + "'is_test' Attribute is expected for Conv2dTransposeBiasOp. 
"); + bool is_test = + attrs.at("is_test").dyn_cast().data(); + + return rewriter.Build( + inputs[0], + inputs[1], + inputs[2], + inputs[3], + strides, + paddings, + output_padding, + padding_algorithm, + groups, + dilations, + data_format, + is_test); + } + + return rewriter.Build( + inputs[0], inputs[1], inputs[2], attrs); + }; +#endif } pir::Attribute CreateIrAttribute(const std::any& obj) { diff --git a/paddle/fluid/pir/transforms/onednn/conv_bias_fuse_pass.cc b/paddle/fluid/pir/transforms/onednn/conv_bias_fuse_pass.cc index 67177d9cee390..bd60a9302f1d6 100644 --- a/paddle/fluid/pir/transforms/onednn/conv_bias_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/onednn/conv_bias_fuse_pass.cc @@ -222,6 +222,157 @@ class FusedConvAddFusePattern : public paddle::drr::DrrPatternBase { } }; +class ConvTransposeBiasFusePattern : public paddle::drr::DrrPatternBase { + std::string name() const override { return "ConvTransposeBiasFusePattern"; } + + uint32_t benefit() const override { return 2; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = ctx->SourcePattern(); + + const auto &conv = + pat.Op(paddle::dialect::Conv2dTransposeOp::name(), + {{"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"output_padding", pat.Attr("output_padding")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}}); + + const auto &add = pat.Op(paddle::dialect::AddOp::name()); + conv({&pat.Tensor("input"), + &pat.Tensor("filter"), + &pat.Tensor("output_size")}, + {&pat.Tensor("conv_out")}); + const auto ¶meter_bias = pat.Op( + pir::ParameterOp::name(), {{"parameter_name", pat.Attr("param_name")}}); + pat.Tensor("bias") = parameter_bias(); + pat.Tensor("add_out") = add(pat.Tensor("conv_out"), pat.Tensor("bias")); + + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + std::set padding_algorithm = {"EXPLICIT", "SAME", "VALID"}; + std::set data_format = {"NCHW", "NHWC", "AnyLayout"}; + if (padding_algorithm.count( + match_ctx.Attr("padding_algorithm")) == 0 || + data_format.count(match_ctx.Attr("data_format")) == 0 || + match_ctx.Attr("groups") < 1) { + return false; + } + return true; + }); + + paddle::drr::ResultPattern res = pat.ResultPattern(); + + const auto &fused_conv = + res.Op(paddle::onednn::dialect::Conv2dTransposeBiasOp::name(), + {{ + {"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"output_padding", pat.Attr("output_padding")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"force_fp32_output", res.BoolAttr(false)}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"fuse_relu", res.BoolAttr(false)}, + {"fuse_activation", res.StrAttr("")}, + {"fuse_alpha", res.Float32Attr(0.0f)}, + {"fuse_beta", res.Float32Attr(0.0f)}, + {"is_test", res.BoolAttr(true)}, + }}); + + fused_conv({&res.Tensor("input"), + &res.Tensor("filter"), + &res.Tensor("bias"), + &res.Tensor("output_size")}, + {&res.Tensor("add_out")}); + } +}; + +class FusedConvTransposeAddFusePattern : public paddle::drr::DrrPatternBase { + std::string name() const override { + return "FusedConvTransposeAddFusePattern"; + } + + uint32_t benefit() const override { return 3; } + + void operator()(paddle::drr::DrrPatternContext *ctx) const override { + paddle::drr::SourcePattern pat = 
ctx->SourcePattern(); + const auto &conv = + pat.Op(paddle::dialect::Conv2dTransposeOp::name(), + {{"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"output_padding", pat.Attr("output_padding")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}}); + + const auto &add = pat.Op(paddle::dialect::AddOp::name()); + const auto &add2 = pat.Op(paddle::dialect::AddOp::name()); + conv({&pat.Tensor("input"), + &pat.Tensor("filter"), + &pat.Tensor("output_size")}, + {&pat.Tensor("conv_out")}); + const auto ¶meter_bias = pat.Op( + pir::ParameterOp::name(), {{"parameter_name", pat.Attr("param_name")}}); + pat.Tensor("bias") = parameter_bias(); + + pat.Tensor("add_out") = add(pat.Tensor("conv_out"), pat.Tensor("bias")); + + const auto ¶meter = pat.Op( + pir::ParameterOp::name(), {{"parameter_name", pat.Attr("param_name")}}); + pat.Tensor("other_param") = parameter(); + pat.Tensor("result") = + add2(pat.Tensor("add_out"), pat.Tensor("other_param")); + + pat.RequireNativeCall([&](const paddle::drr::MatchContext &match_ctx) { + std::set padding_algorithm = {"EXPLICIT", "SAME", "VALID"}; + std::set data_format = {"NCHW", "NHWC", "AnyLayout"}; + if (padding_algorithm.count( + match_ctx.Attr("padding_algorithm")) == 0 || + data_format.count(match_ctx.Attr("data_format")) == 0 || + match_ctx.Attr("groups") < 1) { + return false; + } + return true; + }); + + paddle::drr::ResultPattern res = pat.ResultPattern(); + + const auto &fused_add = res.Op(paddle::dialect::AddOp::name()); + res.Tensor("bias2") = + fused_add(res.Tensor("bias"), res.Tensor("other_param")); + + const auto &fused_conv = + res.Op(paddle::onednn::dialect::Conv2dTransposeBiasOp::name(), + {{ + {"strides", pat.Attr("strides")}, + {"paddings", pat.Attr("paddings")}, + {"output_padding", pat.Attr("output_padding")}, + {"padding_algorithm", pat.Attr("padding_algorithm")}, + {"dilations", pat.Attr("dilations")}, + {"groups", pat.Attr("groups")}, + {"data_format", pat.Attr("data_format")}, + {"force_fp32_output", res.BoolAttr(false)}, + {"mkldnn_data_type", res.StrAttr("float32")}, + {"fuse_relu", res.BoolAttr(false)}, + {"fuse_activation", res.StrAttr("")}, + {"fuse_alpha", res.Float32Attr(0.0f)}, + {"fuse_beta", res.Float32Attr(0.0f)}, + {"is_test", res.BoolAttr(true)}, + }}); + + fused_conv({&res.Tensor("input"), + &res.Tensor("filter"), + &res.Tensor("bias2"), + &res.Tensor("output_size")}, + {&res.Tensor("result")}); + } +}; + class Conv2dBiasFusePass : public pir::PatternRewritePass { public: Conv2dBiasFusePass() : pir::PatternRewritePass("conv2d_bias_fuse_pass", 2) {} @@ -240,18 +391,18 @@ class Conv2dBiasFusePass : public pir::PatternRewritePass { } }; -// class Conv2dTransposeBiasFusePass : public pir::PatternRewritePass { -// public: -// Conv2dTransposeBiasFusePass() -// : pir::PatternRewritePass("conv2d_transpose_bias_fuse_pass", 2) {} +class Conv2dTransposeBiasFusePass : public pir::PatternRewritePass { + public: + Conv2dTransposeBiasFusePass() + : pir::PatternRewritePass("conv2d_transpose_bias_fuse_pass", 2) {} -// pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override -// { -// pir::RewritePatternSet ps(context); -// ps.Add(paddle::drr::Create(context)); -// return ps; -// } -// }; + pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override { + pir::RewritePatternSet ps(context); + ps.Add(paddle::drr::Create(context)); + 
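+    // NOTE(editor): two DRR patterns are registered for this pass. The fused
+    // variant below carries benefit 3 (vs. 2 for the plain pattern), so the
+    // longer conv2d_transpose + add + add chain is matched first.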
ps.Add(paddle::drr::Create(context)); + return ps; + } +}; class Conv3dBiasFusePass : public pir::PatternRewritePass { public: @@ -281,10 +432,12 @@ std::unique_ptr CreateConv2dBiasFusePass() { return std::make_unique(); } -// std::unique_ptr CreateConv2dTransposeBiasFusePass() { -// // pd_op.conv2d_transpose + pd_op.add -> onednn_op.fused_conv2d -// return std::make_unique(); -// } +std::unique_ptr CreateConv2dTransposeBiasFusePass() { + // pd_op.conv2d_transpose + pd_op.add -> onednn_op.conv2d_transpose_bias + // onednn_op.conv2d_transpose_bias + pd_op.add -> + // onednn_op.conv2d_transpose_bias + pd_op.add + return std::make_unique(); +} std::unique_ptr CreateConv3dBiasFusePass() { // pd_op.conv3d + pd_op.add -> onednn_op.fused_conv3d @@ -294,6 +447,5 @@ std::unique_ptr CreateConv3dBiasFusePass() { } // namespace pir REGISTER_IR_PASS(conv2d_bias_fuse_pass, Conv2dBiasFusePass); -// REGISTER_IR_PASS(conv2d_transpose_bias_fuse_pass, -// Conv2dTransposeBiasFusePass); +REGISTER_IR_PASS(conv2d_transpose_bias_fuse_pass, Conv2dTransposeBiasFusePass); REGISTER_IR_PASS(conv3d_bias_fuse_pass, Conv3dBiasFusePass); diff --git a/test/ir/pir/fused_pass/onednn/test_convtranspose_bias_fuse_pass.py b/test/ir/pir/fused_pass/onednn/test_convtranspose_bias_fuse_pass.py new file mode 100644 index 0000000000000..5f5bf774a8373 --- /dev/null +++ b/test/ir/pir/fused_pass/onednn/test_convtranspose_bias_fuse_pass.py @@ -0,0 +1,163 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
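+# NOTE(editor): this new test drives the conv2d_transpose_bias_fuse_pass
+# enabled above; fusion is verified through the op counts in valid_op_map.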
+
+import unittest
+
+import numpy as np
+from pass_test import PassTest
+
+import paddle
+
+paddle.enable_static()
+
+
+@unittest.skipIf(
+    not paddle.base.core.is_compiled_with_mkldnn(),
+    "Test case only for OneDNN pass.",
+)
+class TestConv2dTransposeAddFusePass(PassTest):
+    def is_program_valid(self, program=None):
+        return True
+
+    def build_ir_program(self):
+        with paddle.pir_utils.IrGuard():
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.pir.core.program_guard(main_prog, start_prog):
+                x = paddle.static.data(
+                    name='x', shape=[5, 5, 5, 5], dtype='float32'
+                )
+                bias_attr = paddle.ParamAttr(
+                    learning_rate=0.0,
+                    initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0),
+                )
+                bias = paddle.static.create_parameter(
+                    shape=[1], dtype='float32', attr=bias_attr, is_bias=False
+                )
+                w_attr = paddle.ParamAttr(
+                    learning_rate=0.0,
+                    initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0),
+                )
+                conv2d = paddle.nn.Conv2DTranspose(
+                    in_channels=5,
+                    out_channels=1,
+                    kernel_size=[1, 1],
+                    groups=1,
+                    stride=[1, 1],
+                    padding=[1, 1, 1, 1],
+                    dilation=[1, 1],
+                    data_format='NCHW',
+                    bias_attr=False,
+                    weight_attr=w_attr,
+                )
+
+                out = paddle.add(conv2d(x), bias)
+                out = paddle.assign(out)
+                self.pass_list = ['conv2d_transpose_bias_fuse_pass']
+                self.feeds = {
+                    "x": np.random.random((5, 5, 5, 5)).astype("float32"),
+                    "bias": np.random.random(1).astype("float32"),
+                }
+                self.fetch_list = [out]
+                self.valid_op_map = {
+                    "onednn_op.conv2d_transpose_bias": 1,
+                    "pd_op.conv2d_transpose": 0,
+                    "pd_op.add": 0,
+                }
+                return [main_prog, start_prog]
+
+    def sample_program(self):
+        yield self.build_ir_program(), False
+
+    def setUp(self):
+        self.places.append(paddle.CPUPlace())
+
+    def test_check_output(self):
+        self.check_pass_correct()
+
+
+@unittest.skipIf(
+    not paddle.base.core.is_compiled_with_mkldnn(),
+    "Test case only for OneDNN pass.",
+)
+class TestConv2dTransposeAddFusePassWithAddParam(PassTest):
+    def is_program_valid(self, program=None):
+        return True
+
+    def build_ir_program(self):
+        with paddle.pir_utils.IrGuard():
+            main_prog = paddle.static.Program()
+            start_prog = paddle.static.Program()
+            with paddle.pir.core.program_guard(main_prog, start_prog):
+                x = paddle.static.data(
+                    name='x', shape=[5, 5, 5, 5], dtype='float32'
+                )
+                bias_attr = paddle.ParamAttr(
+                    learning_rate=0.0,
+                    initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0),
+                )
+                bias = paddle.static.create_parameter(
+                    shape=[1], dtype='float32', attr=bias_attr, is_bias=False
+                )
+                w_attr = paddle.ParamAttr(
+                    learning_rate=0.0,
+                    initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0),
+                )
+                conv2d = paddle.nn.Conv2DTranspose(
+                    in_channels=5,
+                    out_channels=1,
+                    kernel_size=[1, 1],
+                    groups=1,
+                    stride=[1, 1],
+                    padding=[1, 1, 1, 1],
+                    dilation=[1, 1],
+                    data_format='NCHW',
+                    bias_attr=False,
+                    weight_attr=w_attr,
+                )
+                add_out = paddle.add(conv2d(x), bias)
+                other_param_attr = paddle.ParamAttr(
+                    learning_rate=0.0,
+                    initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0),
+                )
+                other_param = paddle.static.create_parameter(
+                    shape=[1],
+                    dtype='float32',
+                    attr=other_param_attr,
+                    is_bias=False,
+                )
+                out = paddle.add(add_out, other_param)
+                out = paddle.assign(out)
+                self.pass_list = ['conv2d_transpose_bias_fuse_pass']
+                self.feeds = {
+                    "x": np.random.random((5, 5, 5, 5)).astype("float32"),
+                    "bias": np.random.random(1).astype("float32"),
+                }
+                self.fetch_list = [out]
+                self.valid_op_map = {
+                    "onednn_op.conv2d_transpose_bias": 1,
+                    "pd_op.conv2d_transpose": 0,
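+                    # NOTE(editor): one pd_op.add is expected to survive:
+                    # FusedConvTransposeAddFusePattern first sums the two
+                    # parameters (bias + other_param) with a fresh add before
+                    # fusing the conv2d_transpose with the combined bias.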
"pd_op.add": 1, + } + return [main_prog, start_prog] + + def sample_program(self): + yield self.build_ir_program(), False + + def setUp(self): + self.places.append(paddle.CPUPlace()) + + def test_check_output(self): + self.check_pass_correct() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py b/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py index 09c3c1172354f..53b9deb3d85b9 100644 --- a/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py +++ b/test/mkldnn/test_conv2d_transpose_bf16_mkldnn_op.py @@ -35,7 +35,7 @@ def conv2d_bias_naive(out, bias): ) class TestConv2DTransposeBF16MKLDNNOp(OpTest): def test_check_output(self): - self.check_output_with_place(core.CPUPlace()) + self.check_output_with_place(core.CPUPlace(), check_pir_onednn=True) def test_check_grad(self): pass From 68cb8d731b8ff81346ac65433260e822128b740f Mon Sep 17 00:00:00 2001 From: ronnywang Date: Thu, 7 Mar 2024 13:52:23 +0800 Subject: [PATCH 050/114] [CustomDevice] replace phi::ccl::CCLDataType with phi::DataType (#62464) --- .../collective/process_group_custom.cc | 11 ++- paddle/fluid/imperative/xccl_context.cc | 6 +- .../custom_device_common_op_registry.cc | 20 +++--- paddle/phi/backends/c_comm_lib.h | 56 --------------- paddle/phi/backends/custom/custom_device.cc | 68 ++++++------------- paddle/phi/backends/device_base.cc | 18 ++--- paddle/phi/backends/device_base.h | 18 ++--- paddle/phi/backends/device_manager.cc | 18 ++--- paddle/phi/backends/device_manager.h | 18 ++--- .../phi/core/distributed/xccl_comm_context.cc | 31 ++++----- paddle/phi/kernels/cpu/all_to_all_kernel.cc | 3 +- .../device/custom/custom_device_test.cc | 57 +++++----------- 12 files changed, 109 insertions(+), 215 deletions(-) diff --git a/paddle/fluid/distributed/collective/process_group_custom.cc b/paddle/fluid/distributed/collective/process_group_custom.cc index 33b2728bdc288..fd04bb9909f3e 100644 --- a/paddle/fluid/distributed/collective/process_group_custom.cc +++ b/paddle/fluid/distributed/collective/process_group_custom.cc @@ -236,7 +236,7 @@ std::shared_ptr ProcessGroupCustom::AllToAll( std::vector send_buf, recv_buf; std::vector send_count, recv_count; - std::vector send_dtype, recv_dtype; + std::vector send_dtype, recv_dtype; for (auto i = 0; i < size_; i++) { in_numel = in_size_each_rank[i] * in_row_size; input_partial = GetPartialTensor(tensor_tmp, in_offset, in_numel); @@ -248,8 +248,8 @@ std::shared_ptr ProcessGroupCustom::AllToAll( recv_buf.push_back(output_partial.data()); send_count.push_back(in_numel); recv_count.push_back(out_numel); - send_dtype.push_back(phi::ccl::ToCCLDataType(input_partial.dtype())); - recv_dtype.push_back(phi::ccl::ToCCLDataType(output_partial.dtype())); + send_dtype.push_back(input_partial.dtype()); + recv_dtype.push_back(output_partial.dtype()); } phi::DeviceManager::CCLAllToAll( @@ -992,9 +992,8 @@ std::shared_ptr ProcessGroupCustom::AllToAll( std::vector send_buf, recv_buf; std::vector send_count(size_, input.numel() / size_), recv_count(size_, input.numel() / size_); - std::vector send_dtype( - size_, phi::ccl::ToCCLDataType(input.dtype())), - recv_dtype(size_, phi::ccl::ToCCLDataType(input.dtype())); + std::vector send_dtype(size_, input.dtype()), + recv_dtype(size_, input.dtype()); for (auto i = 0; i < size_; i++) { send_buf.push_back( GetPointerByOffset(input.data(), offset, input.dtype())); diff --git a/paddle/fluid/imperative/xccl_context.cc b/paddle/fluid/imperative/xccl_context.cc index 1ed821d09c346..1eca9f9361419 100644 --- 
a/paddle/fluid/imperative/xccl_context.cc +++ b/paddle/fluid/imperative/xccl_context.cc @@ -50,13 +50,12 @@ static void XcclAllReduce(const phi::DenseTensor &src, auto *dst_ptr = phi::DeviceContextPool::Instance() .Get(src.place()) ->Alloc(dst, src.dtype()); - auto xccl_dtype = phi::ccl::ToCCLDataType(src.dtype()); phi::DeviceManager::CCLAllReduce(place.GetDeviceType(), src_ptr, dst_ptr, src.numel(), - xccl_dtype, + src.dtype(), phi::ccl::CCLReduceOp::SUM, comm, stream); @@ -201,12 +200,11 @@ void XCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { auto stream = comm->stream(); void *src_ptr = src_tensor->data(); - auto xccl_dtype = phi::ccl::ToCCLDataType(src_tensor->dtype()); phi::DeviceManager::CCLBroadcast(place_.GetDeviceType(), src_ptr, src_tensor->numel(), - xccl_dtype, + src_tensor->dtype(), 0, comm->comm(), *stream); diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc b/paddle/fluid/operators/custom_device_common_op_registry.cc index 950b7f0663658..d63197af754f2 100644 --- a/paddle/fluid/operators/custom_device_common_op_registry.cc +++ b/paddle/fluid/operators/custom_device_common_op_registry.cc @@ -120,7 +120,7 @@ class CConcatOpCustomDeviceKernel : public framework::OpKernel { reinterpret_cast(const_cast(send_buff)), recv_buff, send_numel, - phi::ccl::ToCCLDataType(x->dtype()), + x->dtype(), comm->GetXcclComm(), stream); } @@ -560,7 +560,7 @@ class CAllReduceOpCustomDeviceKernel : public framework::OpKernel { int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); - auto dtype = phi::ccl::ToCCLDataType(in->dtype()); + auto dtype = in->dtype(); int64_t numel = in->numel(); const void* sendbuff = in->data(); out->Resize(in->dims()); @@ -651,7 +651,7 @@ class CBroadcastOpCustomDeviceKernel : public framework::OpKernel { } int numel = x->numel(); - auto dtype = phi::ccl::ToCCLDataType(x->dtype()); + auto dtype = x->dtype(); if (root == comm->GetRank()) { phi::DeviceManager::CCLBroadcast(place.GetDeviceType(), const_cast(x->data()), @@ -712,7 +712,7 @@ class BarrierOpCustomDeviceKernel : public framework::OpKernel { const_cast(sendbuff), recvbuff, numel, - phi::ccl::ToCCLDataType(in->dtype()), + in->dtype(), phi::ccl::CCLReduceOp::SUM, comm->GetXcclComm(), *stream); @@ -1059,7 +1059,7 @@ class GlobalScatterOpCustomDeviceKernel : public framework::OpKernel { place.GetDeviceType(), reinterpret_cast(recv_buf + recv_ptr * in_feat), cpu_global_count_data[idx] * in_feat, - phi::ccl::ToCCLDataType(x->dtype()), + x->dtype(), j, comm->GetXcclComm(), *stream); @@ -1075,7 +1075,7 @@ class GlobalScatterOpCustomDeviceKernel : public framework::OpKernel { const_cast(reinterpret_cast( send_buf + expert_ptr[idx] * in_feat)), cpu_local_count_data[idx] * in_feat, - phi::ccl::ToCCLDataType(x->dtype()), + x->dtype(), j, comm->GetXcclComm(), *stream); @@ -1098,7 +1098,7 @@ class GlobalScatterOpCustomDeviceKernel : public framework::OpKernel { place.GetDeviceType(), reinterpret_cast(recv_buf + recv_ptr * in_feat), cpu_global_count_data[idx] * in_feat, - phi::ccl::ToCCLDataType(x->dtype()), + x->dtype(), j, comm->GetXcclComm(), *stream); @@ -1269,7 +1269,7 @@ class GlobalGatherOpCustomDeviceKernel : public framework::OpKernel { phi::DeviceManager::CCLRecv(place.GetDeviceType(), recv_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, - phi::ccl::ToCCLDataType(x->dtype()), + x->dtype(), j, comm->GetXcclComm(), *stream); @@ -1284,7 +1284,7 @@ class GlobalGatherOpCustomDeviceKernel : public framework::OpKernel { const_cast(reinterpret_cast( 
send_buf + send_ptr * in_feat)), cpu_global_count_data[idx] * in_feat, - phi::ccl::ToCCLDataType(x->dtype()), + x->dtype(), j, comm->GetXcclComm(), *stream); @@ -1305,7 +1305,7 @@ class GlobalGatherOpCustomDeviceKernel : public framework::OpKernel { phi::DeviceManager::CCLRecv(place.GetDeviceType(), recv_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, - phi::ccl::ToCCLDataType(x->dtype()), + x->dtype(), j, comm->GetXcclComm(), *stream); diff --git a/paddle/phi/backends/c_comm_lib.h b/paddle/phi/backends/c_comm_lib.h index 3405b2f33bb58..b21ad1b7fedfe 100644 --- a/paddle/phi/backends/c_comm_lib.h +++ b/paddle/phi/backends/c_comm_lib.h @@ -29,17 +29,6 @@ typedef void* CCLComm; typedef std::vector CCLRootId; enum CCLReduceOp { SUM = 0, AVG, MAX, MIN, PRODUCT }; -enum CCLDataType { - CCL_DATA_TYPE_FP64 = 0, - CCL_DATA_TYPE_FP32, - CCL_DATA_TYPE_FP16, - CCL_DATA_TYPE_BF16, - CCL_DATA_TYPE_INT64, - CCL_DATA_TYPE_INT32, - CCL_DATA_TYPE_INT16, - CCL_DATA_TYPE_INT8, - CCL_DATA_TYPE_UINT8 -}; inline CCLReduceOp ToXCCLReduceOp(int reduce_type) { phi::ccl::CCLReduceOp red_type = phi::ccl::CCLReduceOp::SUM; @@ -67,51 +56,6 @@ inline CCLReduceOp ToXCCLReduceOp(int reduce_type) { return red_type; } -inline CCLDataType ToCCLDataType(phi::DataType type) { - if (type == phi::DataType::FLOAT64) { - return CCL_DATA_TYPE_FP64; - } else if (type == phi::DataType::FLOAT32) { - return CCL_DATA_TYPE_FP32; - } else if (type == phi::DataType::FLOAT16) { - return CCL_DATA_TYPE_FP16; - } else if (type == phi::DataType::BFLOAT16) { - return CCL_DATA_TYPE_BF16; - } else if (type == phi::DataType::INT64) { - return CCL_DATA_TYPE_INT64; - } else if (type == phi::DataType::INT32) { - return CCL_DATA_TYPE_INT32; - } else if (type == phi::DataType::INT8) { - return CCL_DATA_TYPE_INT8; - } else if (type == phi::DataType::UINT8) { - return CCL_DATA_TYPE_UINT8; - } else { - PADDLE_THROW( - phi::errors::Unimplemented("This datatype %s in CCL is not supported.", - phi::DataTypeToString(type))); - } -} - -inline phi::DataType ToPhiDataType(CCLDataType type) { - if (type == CCLDataType::CCL_DATA_TYPE_FP64) { - return phi::DataType::FLOAT64; - } else if (type == CCLDataType::CCL_DATA_TYPE_FP32) { - return phi::DataType::FLOAT32; - } else if (type == CCLDataType::CCL_DATA_TYPE_FP16) { - return phi::DataType::FLOAT16; - } else if (type == CCLDataType::CCL_DATA_TYPE_BF16) { - return phi::DataType::BFLOAT16; - } else if (type == CCLDataType::CCL_DATA_TYPE_INT64) { - return phi::DataType::INT64; - } else if (type == CCLDataType::CCL_DATA_TYPE_INT32) { - return phi::DataType::INT32; - } else if (type == CCLDataType::CCL_DATA_TYPE_INT8) { - return phi::DataType::INT8; - } else { - PADDLE_THROW( - phi::errors::Unimplemented("This datatype in CCL is not supported.")); - } -} - inline std::string SerializeXCCLUniqueId(const phi::ccl::CCLRootId& ccl_id) { const uint8_t* bytes = ccl_id.data(); std::ostringstream oss; diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index e7f58bb39b25c..30282eac79afb 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -569,29 +569,6 @@ class CustomDevice : public DeviceInterface { return version; } - C_DataType ToXCCLDataType(ccl::CCLDataType data_type) { -#define return_result(in, ret) \ - case ccl::CCLDataType::in: \ - return C_DataType::ret - switch (data_type) { - return_result(CCL_DATA_TYPE_FP64, FLOAT64); - return_result(CCL_DATA_TYPE_FP32, FLOAT32); - 
return_result(CCL_DATA_TYPE_FP16, FLOAT16); - return_result(CCL_DATA_TYPE_BF16, BFLOAT16); - return_result(CCL_DATA_TYPE_INT64, INT64); - return_result(CCL_DATA_TYPE_INT32, INT32); - return_result(CCL_DATA_TYPE_INT16, INT16); - return_result(CCL_DATA_TYPE_INT8, INT8); - return_result(CCL_DATA_TYPE_UINT8, UINT8); - default: { - PADDLE_THROW(phi::errors::Unavailable( - "DataType is not supported on %s.", Type())); - return C_DataType::UNDEFINED; - } - } -#undef return_result - } - C_CCLReduceOp ToXCCLReduceOp(ccl::CCLReduceOp reduce_op) { #define return_result(in, ret) \ case ccl::CCLReduceOp::in: \ @@ -669,7 +646,7 @@ class CustomDevice : public DeviceInterface { void CCLAllReduce(void* send_buf, void* recv_buf, size_t count, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp op, const ccl::CCLComm& comm, const stream::Stream& stream) override { @@ -678,7 +655,7 @@ class CustomDevice : public DeviceInterface { send_buf, recv_buf, count, - ToXCCLDataType(data_type), + ToCDatatType(data_type), ToXCCLReduceOp(op), reinterpret_cast(comm), reinterpret_cast(stream.raw_stream()))); @@ -686,7 +663,7 @@ class CustomDevice : public DeviceInterface { void CCLBroadcast(void* buf, size_t count, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t root, const ccl::CCLComm& comm, const stream::Stream& stream) override { @@ -694,7 +671,7 @@ class CustomDevice : public DeviceInterface { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_broadcast( buf, count, - ToXCCLDataType(data_type), + ToCDatatType(data_type), root, reinterpret_cast(comm), reinterpret_cast(stream.raw_stream()))); @@ -703,7 +680,7 @@ class CustomDevice : public DeviceInterface { void CCLReduce(void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp reduce_op, size_t root_id, const ccl::CCLComm& comm, @@ -713,7 +690,7 @@ class CustomDevice : public DeviceInterface { pimpl_->xccl_reduce(in_data, out_data, num, - ToXCCLDataType(data_type), + ToCDatatType(data_type), ToXCCLReduceOp(reduce_op), root_id, reinterpret_cast(comm), @@ -723,7 +700,7 @@ class CustomDevice : public DeviceInterface { void CCLAllGather(void* send_buf, void* recv_buf, size_t count, - ccl::CCLDataType data_type, + phi::DataType data_type, const ccl::CCLComm& comm, const stream::Stream& stream) override { CHECK_PTR(pimpl_->xccl_all_gather); @@ -731,7 +708,7 @@ class CustomDevice : public DeviceInterface { send_buf, recv_buf, count, - ToXCCLDataType(data_type), + ToCDatatType(data_type), reinterpret_cast(comm), reinterpret_cast(stream.raw_stream()))); } @@ -739,7 +716,7 @@ class CustomDevice : public DeviceInterface { void CCLReduceScatter(void* send_buf, void* recv_buf, size_t count, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp reduce_op, const ccl::CCLComm& comm, const stream::Stream& stream) override { @@ -748,7 +725,7 @@ class CustomDevice : public DeviceInterface { send_buf, recv_buf, count, - ToXCCLDataType(data_type), + ToCDatatType(data_type), ToXCCLReduceOp(reduce_op), reinterpret_cast(comm), reinterpret_cast(stream.raw_stream()))); @@ -768,7 +745,7 @@ class CustomDevice : public DeviceInterface { void CCLSend(void* send_buf, size_t count, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t dest_rank, const ccl::CCLComm& comm, const stream::Stream& stream) override { @@ -776,7 +753,7 @@ class CustomDevice : public DeviceInterface { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->xccl_send(send_buf, count, - ToXCCLDataType(data_type), 
+ ToCDatatType(data_type), dest_rank, reinterpret_cast(comm), reinterpret_cast(stream.raw_stream()))); @@ -784,7 +761,7 @@ class CustomDevice : public DeviceInterface { void CCLRecv(void* recv_buf, size_t count, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t src_rank, const ccl::CCLComm& comm, const stream::Stream& stream) override { @@ -792,7 +769,7 @@ class CustomDevice : public DeviceInterface { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->xccl_recv(recv_buf, count, - ToXCCLDataType(data_type), + ToCDatatType(data_type), src_rank, reinterpret_cast(comm), reinterpret_cast(stream.raw_stream()))); @@ -800,10 +777,10 @@ class CustomDevice : public DeviceInterface { void CCLAllToAll(const void** send_buf, const size_t* send_count, - const ccl::CCLDataType* send_dtype, + const phi::DataType* send_dtype, void** recv_buf, const size_t* recv_count, - const ccl::CCLDataType* recv_dtype, + const phi::DataType* recv_dtype, size_t rank, size_t nranks, const ccl::CCLComm& comm, @@ -811,8 +788,8 @@ class CustomDevice : public DeviceInterface { if (pimpl_->xccl_all_to_all) { std::vector c_send_dtype, c_recv_dtype; for (size_t i = 0; i < nranks; ++i) { - c_send_dtype.push_back(ToXCCLDataType(send_dtype[i])); - c_recv_dtype.push_back(ToXCCLDataType(recv_dtype[i])); + c_send_dtype.push_back(ToCDatatType(send_dtype[i])); + c_recv_dtype.push_back(ToCDatatType(recv_dtype[i])); } PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_all_to_all( send_buf, @@ -832,7 +809,7 @@ class CustomDevice : public DeviceInterface { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->xccl_recv(recv_buf[i], recv_count[i], - ToXCCLDataType(recv_dtype[i]), + ToCDatatType(recv_dtype[i]), i, reinterpret_cast(comm), reinterpret_cast(stream.raw_stream()))); @@ -842,7 +819,7 @@ class CustomDevice : public DeviceInterface { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_send( const_cast(send_buf[i]), send_count[i], - ToXCCLDataType(send_dtype[i]), + ToCDatatType(send_dtype[i]), i, reinterpret_cast(comm), reinterpret_cast(stream.raw_stream()))); @@ -851,14 +828,13 @@ class CustomDevice : public DeviceInterface { MemoryCopyD2D(rank, recv_buf[rank], send_buf[rank], - send_count[rank] * - phi::SizeOf(phi::ccl::ToPhiDataType(send_dtype[rank])), + send_count[rank] * phi::SizeOf(send_dtype[rank]), &stream); for (size_t i = rank + 1; i < nranks; ++i) { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->xccl_recv(recv_buf[i], recv_count[i], - ToXCCLDataType(recv_dtype[i]), + ToCDatatType(recv_dtype[i]), i, reinterpret_cast(comm), reinterpret_cast(stream.raw_stream()))); diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc index 7860d322f1faa..44d506301fbbd 100644 --- a/paddle/phi/backends/device_base.cc +++ b/paddle/phi/backends/device_base.cc @@ -284,7 +284,7 @@ void DeviceInterface::CCLGetUniqueId(ccl::CCLRootId* root_id) { void DeviceInterface::CCLBroadcast(void* data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t root, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { @@ -294,7 +294,7 @@ void DeviceInterface::CCLBroadcast(void* data, void DeviceInterface::CCLAllReduce(void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp reduce_op, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { @@ -304,7 +304,7 @@ void DeviceInterface::CCLAllReduce(void* in_data, void DeviceInterface::CCLReduce(void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, 
ccl::CCLReduceOp reduce_op, size_t root_id, const ccl::CCLComm& ccl_comm, @@ -315,7 +315,7 @@ void DeviceInterface::CCLReduce(void* in_data, void DeviceInterface::CCLAllGather(void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { INTERFACE_UNIMPLEMENT; @@ -324,7 +324,7 @@ void DeviceInterface::CCLAllGather(void* in_data, void DeviceInterface::CCLReduceScatter(void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp op, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { @@ -337,7 +337,7 @@ void DeviceInterface::CCLGroupEnd() { INTERFACE_UNIMPLEMENT; } void DeviceInterface::CCLSend(void* sendbuf, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t dst_rank, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { @@ -346,7 +346,7 @@ void DeviceInterface::CCLSend(void* sendbuf, void DeviceInterface::CCLRecv(void* recvbuf, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t src_rank, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { @@ -355,10 +355,10 @@ void DeviceInterface::CCLRecv(void* recvbuf, void DeviceInterface::CCLAllToAll(const void** send_buf, const size_t* send_count, - const ccl::CCLDataType* send_dtype, + const phi::DataType* send_dtype, void** recv_buf, const size_t* recv_count, - const ccl::CCLDataType* recv_dtype, + const phi::DataType* recv_dtype, size_t rank, size_t nranks, const ccl::CCLComm& comm, diff --git a/paddle/phi/backends/device_base.h b/paddle/phi/backends/device_base.h index 855e77890348a..66d5b2ea511db 100644 --- a/paddle/phi/backends/device_base.h +++ b/paddle/phi/backends/device_base.h @@ -180,7 +180,7 @@ class DeviceInterface { // Driver / Runtime virtual void CCLBroadcast(void* data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t root, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); @@ -188,14 +188,14 @@ class DeviceInterface { // Driver / Runtime virtual void CCLAllReduce(void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp reduce_op, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); virtual void CCLReduce(void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp reduce_op, size_t root_id, const ccl::CCLComm& ccl_comm, @@ -203,13 +203,13 @@ class DeviceInterface { // Driver / Runtime virtual void CCLAllGather(void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); virtual void CCLReduceScatter(void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp op, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); @@ -217,23 +217,23 @@ class DeviceInterface { // Driver / Runtime virtual void CCLGroupEnd(); virtual void CCLSend(void* sendbuf, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t dst_rank, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); virtual void CCLRecv(void* recvbuf, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t src_rank, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); virtual void CCLAllToAll(const void** send_buf, const size_t* send_count, - const ccl::CCLDataType* send_dtype, + const 
phi::DataType* send_dtype, void** recv_buf, const size_t* recv_count, - const ccl::CCLDataType* recv_dtype, + const phi::DataType* recv_dtype, size_t rank, size_t nranks, const ccl::CCLComm& comm, diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index e3ec68e7f9182..b030ba00e90f9 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -533,7 +533,7 @@ void DeviceManager::CCLGetUniqueId(const std::string& device_type, void DeviceManager::CCLBroadcast(const std::string& device_type, void* data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t root_id, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { @@ -545,7 +545,7 @@ void DeviceManager::CCLAllReduce(const std::string& device_type, void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp reduce_op, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { @@ -558,7 +558,7 @@ void DeviceManager::CCLReduce(const std::string& device_type, void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp reduce_op, size_t root_id, const ccl::CCLComm& ccl_comm, @@ -572,7 +572,7 @@ void DeviceManager::CCLAllGather(const std::string& device_type, void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { auto dev_impl = GetDeviceInterfaceWithType(device_type); @@ -583,7 +583,7 @@ void DeviceManager::CCLReduceScatter(const std::string& device_type, void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp op, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { @@ -605,7 +605,7 @@ void DeviceManager::CCLGroupEnd(const std::string& device_type) { void DeviceManager::CCLSend(const std::string& device_type, void* sendbuf, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t dst_rank, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { @@ -616,7 +616,7 @@ void DeviceManager::CCLSend(const std::string& device_type, void DeviceManager::CCLRecv(const std::string& device_type, void* recvbuf, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t src_rank, const ccl::CCLComm& ccl_comm, const stream::Stream& stream) { @@ -627,10 +627,10 @@ void DeviceManager::CCLRecv(const std::string& device_type, void DeviceManager::CCLAllToAll(const std::string& device_type, const void** send_buf, const size_t* send_count, - const ccl::CCLDataType* send_dtype, + const phi::DataType* send_dtype, void** recv_buf, const size_t* recv_count, - const ccl::CCLDataType* recv_dtype, + const phi::DataType* recv_dtype, size_t rank, size_t nranks, const ccl::CCLComm& comm, diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index 58a9e6ebe7ab8..ba173601e1a88 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -190,7 +190,7 @@ class DeviceManager { static void CCLBroadcast(const std::string& device_type, void* data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t root, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); @@ -198,7 +198,7 @@ class DeviceManager { void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp reduce_op, const ccl::CCLComm& 
ccl_comm, const stream::Stream& stream); @@ -206,7 +206,7 @@ class DeviceManager { void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp reduce_op, size_t root_id, const ccl::CCLComm& ccl_comm, @@ -215,14 +215,14 @@ class DeviceManager { void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); static void CCLReduceScatter(const std::string& device_type, void* in_data, void* out_data, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, ccl::CCLReduceOp op, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); @@ -231,14 +231,14 @@ class DeviceManager { static void CCLSend(const std::string& device_type, void* sendbuf, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t dst_rank, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); static void CCLRecv(const std::string& device_type, void* recvbuf, size_t num, - ccl::CCLDataType data_type, + phi::DataType data_type, size_t src_rank, const ccl::CCLComm& ccl_comm, const stream::Stream& stream); @@ -246,10 +246,10 @@ class DeviceManager { static void CCLAllToAll(const std::string& device_type, const void** send_buf, const size_t* send_count, - const ccl::CCLDataType* send_dtype, + const phi::DataType* send_dtype, void** recv_buf, const size_t* recv_count, - const ccl::CCLDataType* recv_dtype, + const phi::DataType* recv_dtype, size_t rank, size_t nranks, const ccl::CCLComm& comm, diff --git a/paddle/phi/core/distributed/xccl_comm_context.cc b/paddle/phi/core/distributed/xccl_comm_context.cc index 3e3608e4d88a5..4dd2bcc48857c 100644 --- a/paddle/phi/core/distributed/xccl_comm_context.cc +++ b/paddle/phi/core/distributed/xccl_comm_context.cc @@ -81,7 +81,7 @@ void XCCLCommContext::Broadcast(phi::DenseTensor* out_tensor, phi::DeviceManager::CCLBroadcast(place_.GetDeviceType(), const_cast(in_tensor.data()), in_tensor.numel(), - phi::ccl::ToCCLDataType(in_tensor.dtype()), + in_tensor.dtype(), root, xccl_comm_, stream); @@ -89,7 +89,7 @@ void XCCLCommContext::Broadcast(phi::DenseTensor* out_tensor, phi::DeviceManager::CCLBroadcast(place_.GetDeviceType(), out_tensor->data(), out_tensor->numel(), - phi::ccl::ToCCLDataType(in_tensor.dtype()), + in_tensor.dtype(), root, xccl_comm_, stream); @@ -110,7 +110,7 @@ void XCCLCommContext::AllGather(phi::DenseTensor* out_tensor, const_cast(in_tensor.data()), out_tensor->data(), in_tensor.numel(), - phi::ccl::ToCCLDataType(in_tensor.dtype()), + in_tensor.dtype(), xccl_comm_, stream); } @@ -125,15 +125,14 @@ void XCCLCommContext::ReduceScatter(phi::DenseTensor* out_tensor, /*cur_rank*/ rank_, size_, phi::AllocationType::CUSTOM); - phi::DeviceManager::CCLReduceScatter( - place_.GetDeviceType(), - const_cast(in_tensor.data()), - out_tensor->data(), - out_tensor->numel(), - phi::ccl::ToCCLDataType(in_tensor.type()), - reduce_type, - xccl_comm_, - stream); + phi::DeviceManager::CCLReduceScatter(place_.GetDeviceType(), + const_cast(in_tensor.data()), + out_tensor->data(), + out_tensor->numel(), + in_tensor.dtype(), + reduce_type, + xccl_comm_, + stream); } void XCCLCommContext::Send(const phi::DenseTensor& in_tensor, @@ -145,7 +144,7 @@ void XCCLCommContext::Send(const phi::DenseTensor& in_tensor, phi::DeviceManager::CCLSend(place_.GetDeviceType(), const_cast(in_tensor.data()), count, - phi::ccl::ToCCLDataType(in_tensor.type()), + in_tensor.dtype(), peer, xccl_comm_, stream); @@ -162,7 +161,7 @@ void 
XCCLCommContext::Recv(phi::DenseTensor* out_tensor, phi::DeviceManager::CCLRecv(place_.GetDeviceType(), out_tensor->data(), count, - phi::ccl::ToCCLDataType(out_tensor->type()), + out_tensor->dtype(), peer, xccl_comm_, stream); @@ -184,7 +183,7 @@ void XCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, const_cast(in_tensor.data()), out_tensor->data(), in_tensor.numel(), - phi::ccl::ToCCLDataType(in_tensor.type()), + in_tensor.dtype(), reduce_type, xccl_comm_, stream); @@ -205,7 +204,7 @@ void XCCLCommContext::Reduce(phi::DenseTensor* out_tensor, const_cast(in_tensor.data()), out_tensor->data(), in_tensor.numel(), - phi::ccl::ToCCLDataType(in_tensor.type()), + in_tensor.dtype(), reduce_type, root, xccl_comm_, diff --git a/paddle/phi/kernels/cpu/all_to_all_kernel.cc b/paddle/phi/kernels/cpu/all_to_all_kernel.cc index 3407a1828e208..5df84c5360de7 100644 --- a/paddle/phi/kernels/cpu/all_to_all_kernel.cc +++ b/paddle/phi/kernels/cpu/all_to_all_kernel.cc @@ -45,8 +45,7 @@ void AllToAllKernel(const phi::CustomContext& dev_ctx, std::vector sendbuf, recvbuf; std::vector sendsize(send_numel, nranks); - std::vector sendtype( - phi::ccl::ToCCLDataType(x.dtype()), nranks); + std::vector sendtype(x.dtype(), nranks); for (auto i = 0; i < nranks; ++i) { sendbuf.push_back(x.data() + i * send_numel); recvbuf.push_back(out->data() + i * send_numel); diff --git a/test/cpp/fluid/platform/device/custom/custom_device_test.cc b/test/cpp/fluid/platform/device/custom/custom_device_test.cc index b36355b2386be..4f0ce796ad66b 100644 --- a/test/cpp/fluid/platform/device/custom/custom_device_test.cc +++ b/test/cpp/fluid/platform/device/custom/custom_device_test.cc @@ -183,18 +183,13 @@ void TestCustomCCL(const paddle::platform::Place& place) { phi::DeviceManager::CCLDestroyComm(dev_type, nullptr); phi::DeviceManager::CCLGetUniqueId(dev_type, &root_id); phi::DeviceManager::CCLCommInitRank(dev_type, 0, &root_id, 0, nullptr); - phi::DeviceManager::CCLBroadcast(dev_type, - nullptr, - 0, - phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, - 0, - comm, - stream); + phi::DeviceManager::CCLBroadcast( + dev_type, nullptr, 0, phi::DataType::FLOAT32, 0, comm, stream); phi::DeviceManager::CCLAllReduce(dev_type, nullptr, nullptr, 0, - phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + phi::DataType::FLOAT32, phi::ccl::CCLReduceOp::SUM, comm, stream); @@ -202,43 +197,27 @@ void TestCustomCCL(const paddle::platform::Place& place) { nullptr, nullptr, 0, - phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + phi::DataType::FLOAT32, phi::ccl::CCLReduceOp::SUM, 0, comm, stream); - phi::DeviceManager::CCLAllGather(dev_type, - nullptr, - nullptr, - 0, - phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, - comm, - stream); - phi::DeviceManager::CCLReduceScatter( - dev_type, - nullptr, - nullptr, - 0, - phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, - phi::ccl::CCLReduceOp::SUM, - comm, - stream); + phi::DeviceManager::CCLAllGather( + dev_type, nullptr, nullptr, 0, phi::DataType::FLOAT32, comm, stream); + phi::DeviceManager::CCLReduceScatter(dev_type, + nullptr, + nullptr, + 0, + phi::DataType::FLOAT32, + phi::ccl::CCLReduceOp::SUM, + comm, + stream); phi::DeviceManager::CCLGroupStart(dev_type); phi::DeviceManager::CCLGroupEnd(dev_type); - phi::DeviceManager::CCLSend(dev_type, - nullptr, - 0, - phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, - 0, - comm, - stream); - phi::DeviceManager::CCLRecv(dev_type, - nullptr, - 0, - phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, - 0, - comm, - stream); + phi::DeviceManager::CCLSend( + dev_type, nullptr, 0, phi::DataType::FLOAT32, 0, comm, 
stream); + phi::DeviceManager::CCLRecv( + dev_type, nullptr, 0, phi::DataType::FLOAT32, 0, comm, stream); } TEST(CustomDevice, Tensor) { From 046d70a52d079c9076b2dc709159ab7204057337 Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Thu, 7 Mar 2024 14:06:21 +0800 Subject: [PATCH 051/114] fix grid dim error when launching kernel (#62483) --- paddle/cinn/common/integer_set.cc | 44 ++++++++++--------- .../tactic/tile_first_general_tactic.cc | 22 ++++++++++ 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/paddle/cinn/common/integer_set.cc b/paddle/cinn/common/integer_set.cc index f6d6446b9bb24..8c9998122373f 100644 --- a/paddle/cinn/common/integer_set.cc +++ b/paddle/cinn/common/integer_set.cc @@ -44,6 +44,9 @@ cas_intervals_t CollectVarIntervalsOfExprs(const std::vector& exprs, if (var->upper_bound.defined()) { upper_bound = var->upper_bound; } + if (var->is_symbolic_constant) { + lower_bound = ir::Expr(1); + } var_intervals.insert( {var->name, CasInterval(lower_bound, upper_bound)}); } @@ -118,25 +121,20 @@ std::optional SymbolicExprAnalyzer::ProveGE(const ir::Expr& lhs, if (lhs == rhs) { return true; } - if (lhs == SymbolicExprLimit::positive_inf || - rhs == SymbolicExprLimit::negative_inf) { - return true; - } if (rhs == SymbolicExprLimit::positive_inf || lhs == SymbolicExprLimit::negative_inf) { return false; } - ir::Expr diff = AutoSimplify(ir::Sub::Make(lhs, rhs), var_intervals_); - VLOG(6) << "diff of " << ir::Sub::Make(lhs, rhs) << " = " << diff; - if (diff.is_constant() && diff.get_constant() >= 0) { + if (lhs == SymbolicExprLimit::positive_inf || + rhs == SymbolicExprLimit::negative_inf) { return true; } + ir::Expr diff = AutoSimplify(ir::Sub::Make(lhs, rhs), var_intervals_); + VLOG(6) << "diff of " << ir::Sub::Make(lhs, rhs) << " = " << diff; if (diff.is_constant() && diff.get_constant() < 0) { return false; } - ir::Expr diff_lower_bound = LowerBound(diff); - VLOG(6) << "lower bound of " << diff << " = " << diff_lower_bound; - if (diff_lower_bound.is_constant() && diff_lower_bound.get_constant() >= 0) { + if (diff.is_constant() && diff.get_constant() >= 0) { return true; } ir::Expr diff_upper_bound = UpperBound(diff); @@ -144,6 +142,11 @@ std::optional SymbolicExprAnalyzer::ProveGE(const ir::Expr& lhs, if (diff_upper_bound.is_constant() && diff_upper_bound.get_constant() < 0) { return false; } + ir::Expr diff_lower_bound = LowerBound(diff); + VLOG(6) << "lower bound of " << diff << " = " << diff_lower_bound; + if (diff_lower_bound.is_constant() && diff_lower_bound.get_constant() >= 0) { + return true; + } return std::nullopt; } @@ -157,25 +160,20 @@ std::optional SymbolicExprAnalyzer::ProveGT(const ir::Expr& lhs, if (lhs == rhs) { return false; } - if (lhs == SymbolicExprLimit::positive_inf || - rhs == SymbolicExprLimit::negative_inf) { - return true; - } if (rhs == SymbolicExprLimit::positive_inf || lhs == SymbolicExprLimit::negative_inf) { return false; } - ir::Expr diff = AutoSimplify(ir::Sub::Make(lhs, rhs), var_intervals_); - VLOG(6) << "diff of " << ir::Sub::Make(lhs, rhs) << " = " << diff; - if (diff.is_constant() && diff.get_constant() > 0) { + if (lhs == SymbolicExprLimit::positive_inf || + rhs == SymbolicExprLimit::negative_inf) { return true; } + ir::Expr diff = AutoSimplify(ir::Sub::Make(lhs, rhs), var_intervals_); + VLOG(6) << "diff of " << ir::Sub::Make(lhs, rhs) << " = " << diff; if (diff.is_constant() && diff.get_constant() <= 0) { return false; } - ir::Expr diff_lower_bound = LowerBound(diff); - VLOG(6) << "lower 
bound of " << diff << " = " << diff_lower_bound; - if (diff_lower_bound.is_constant() && diff_lower_bound.get_constant() > 0) { + if (diff.is_constant() && diff.get_constant() > 0) { return true; } ir::Expr diff_upper_bound = UpperBound(diff); @@ -183,6 +181,12 @@ std::optional SymbolicExprAnalyzer::ProveGT(const ir::Expr& lhs, if (diff_upper_bound.is_constant() && diff_upper_bound.get_constant() <= 0) { return false; } + ir::Expr diff_lower_bound = LowerBound(diff); + VLOG(6) << "lower bound of " << diff << " = " << diff_lower_bound; + if (diff_lower_bound.is_constant() && diff_lower_bound.get_constant() > 0) { + return true; + } + return std::nullopt; } diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc index 95805490493ca..165242258ef1b 100644 --- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc @@ -89,14 +89,36 @@ void TileFirstGeneralTactic::Apply(ir::IRSchedule* sch, const std::string& block_id) { if (ir::IsReduceInitTensorName(block_id)) return; MergeFlattenAxis(sch, block_id); + VLOG(6) << "After MergeFlattenAxis on block: [" << block_id + << "], loop nest:\n" + << sch->GetLoops(block_id)[0]; MergeReduceAxis(sch, block_id); + VLOG(6) << "After MergeReduceAxis on block: [" << block_id + << "], loop nest:\n" + << sch->GetLoops(block_id)[0]; SplitFlattenInner(sch, block_id); + VLOG(6) << "After SplitFlattenInner on block: [" << block_id + << "], loop nest:\n" + << sch->GetLoops(block_id)[0]; SplitReduceInner(sch, block_id); + VLOG(6) << "After SplitReduceInner on block: [" << block_id + << "], loop nest:\n" + << sch->GetLoops(block_id)[0]; ReorderFlattenInnerWithReduceAxis(sch, block_id); + VLOG(6) << "After ReorderFlattenInnerWithReduceAxis on block: [" << block_id + << "], loop nest:\n" + << sch->GetLoops(block_id)[0]; SplitWarpNumber(sch, block_id); + VLOG(6) << "After SplitWarpNumber on block: [" << block_id + << "], loop nest:\n" + << sch->GetLoops(block_id)[0]; BindCudaInfo(sch, block_id); + VLOG(6) << "After BindCudaInfo on block: [" << block_id << "], loop nest:\n" + << sch->GetLoops(block_id)[0]; VariableTypeAssignment(sch, block_id); Unroll(sch, block_id); + VLOG(6) << "After Unroll on block: [" << block_id << "], loop nest:\n" + << sch->GetLoops(block_id)[0]; SetReduceType(sch, block_id); } From 796431590006b38359cfdee37399f0805b03f12c Mon Sep 17 00:00:00 2001 From: Sonder <55493212+AndSonder@users.noreply.github.com> Date: Thu, 7 Mar 2024 14:17:51 +0800 Subject: [PATCH 052/114] [AutoParallel] Change switch name to gradient_sync_after_accumulate (#62441) * change switch name to gradient_sync_after_accumulate * skip add none op when open gradient_sync_after_accumulate flag --- python/paddle/distributed/auto_parallel/constants.py | 6 +++--- .../auto_parallel/static/parallelizer_v2.py | 11 +++++++---- .../passes/auto_parallel_gradient_merge.py | 10 +++++----- .../distributed/passes/auto_parallel_sharding.py | 2 ++ 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index e1191015fa305..9f3fc5d1fcc4a 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -105,9 +105,6 @@ def set_field_default_config(category, field, default_value): set_field_default_config(GRADIENT_MERGE, "enable", False) 
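 # NOTE(editor): in this hunk the dp_gradient_sync_after_accumulate default is
 # removed from GRADIENT_MERGE and re-added under DP_OPTIMIZATION as
 # gradient_sync_after_accumulate, matching the switch rename in this patch.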
set_field_default_config(GRADIENT_MERGE, "k_steps", 1) set_field_default_config(GRADIENT_MERGE, "avg", True) -set_field_default_config( - GRADIENT_MERGE, "dp_gradient_sync_after_accumulate", False -) ######################################### # pipeline configuration @@ -174,6 +171,9 @@ def set_field_default_config(category, field, default_value): set_field_default_config(DP_OPTIMIZATION, "fuse_all_reduce_ops", True) set_field_default_config(DP_OPTIMIZATION, "fuse_grad_size_in_MB", 32) set_field_default_config(DP_OPTIMIZATION, "overlap_comm_cacl", True) +set_field_default_config( + DP_OPTIMIZATION, "gradient_sync_after_accumulate", False +) ######################################### # model parallel configuration diff --git a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py index 99a425614ff2a..d4671262bba62 100644 --- a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py @@ -416,10 +416,10 @@ def _apply_post_optimization( ) dp_pass.apply([main_program], [startup_program], self._pass_context) - dp_gradient_sync_after_accumulate = ( - self._strategy.gradient_merge.dp_gradient_sync_after_accumulate + gradient_sync_after_accumulate = ( + self._strategy.dp_optimization.gradient_sync_after_accumulate ) - if dp_gradient_sync_after_accumulate: + if gradient_sync_after_accumulate: global_params_grads = params_grads if self._strategy.sharding.enable: @@ -427,6 +427,9 @@ def _apply_post_optimization( config["dist_context"] = self._dist_context config["params_grads"] = params_grads config["global_rank"] = rank + config[ + "gradient_sync_after_accumulate" + ] = gradient_sync_after_accumulate if self._strategy.amp.enable: amp_config = copy.deepcopy(self._strategy.amp.to_dict()) config["amp_dtype"] = amp_config['dtype'] @@ -491,7 +494,7 @@ def _apply_post_optimization( if self.is_train and self._strategy.gradient_merge.enable: config = copy.deepcopy(self._strategy.gradient_merge.to_dict()) config["dist_context"] = self._dist_context - if dp_gradient_sync_after_accumulate: + if gradient_sync_after_accumulate: config["params_grads"] = global_params_grads else: config["params_grads"] = params_grads diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index f5298782fc3ce..928e24da45615 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -445,7 +445,7 @@ def parse_program( k_steps, avg, dist_context, - dp_gradient_sync_after_accumulate, + gradient_sync_after_accumulate, ): # 1 remove optimizer_op from main_program optimize_ops_block = _remove_and_get_optimizer_op( @@ -460,7 +460,7 @@ def parse_program( main_program, startup_program, params_grads, dist_context ) - if dp_gradient_sync_after_accumulate: + if gradient_sync_after_accumulate: # 3 move reduce op to optimizer_ops_block optimize_ops_block = _move_reduce_to_optimizer_ops_block( main_program, optimize_ops_block, params_grads @@ -505,8 +505,8 @@ def _apply_single_impl(self, main_program, startup_program, context): avg = self.get_attr("avg", False) dist_context = self.get_attr("dist_context") params_grads = self.get_attr("params_grads") - dp_gradient_sync_after_accumulate = self.get_attr( - "dp_gradient_sync_after_accumulate", False + gradient_sync_after_accumulate = self.get_attr( + 
"gradient_sync_after_accumulate", False ) with paddle.static.program_guard(main_program, startup_program): parse_program( @@ -516,7 +516,7 @@ def _apply_single_impl(self, main_program, startup_program, context): k_steps, avg, dist_context, - dp_gradient_sync_after_accumulate, + gradient_sync_after_accumulate, ) main_program._sync_with_cpp() diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py index 8d1cf45eadaf9..bcf9326f37bd3 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -105,6 +105,7 @@ def __init__(self): self.set_attr("params_grads", []) self.set_attr("global_rank", -1) self.set_attr("amp_dtype", "float16") + self.set_attr("gradient_sync_after_accumulate", False) self.dp_groups = set() self.sharding_infos = [] self.varname_to_sharding_info = {} @@ -1334,6 +1335,7 @@ def _overlap_grad_comm( if ( op.type == "c_reduce_avg" and not grad_group.is_in_local_shard + and not self.get_attr("gradient_sync_after_accumulate") ): if idx not in dep_map: dep_map[idx] = [] From 93f29aa9320b8e144e2f9ec9364e893067481617 Mon Sep 17 00:00:00 2001 From: ming1753 <61511741+ming1753@users.noreply.github.com> Date: Thu, 7 Mar 2024 14:27:44 +0800 Subject: [PATCH 053/114] fix bug (#62501) --- paddle/fluid/inference/api/analysis_predictor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index ef576b3527c3b..961c0e350be38 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1045,7 +1045,7 @@ bool AnalysisPredictor::PrepareExecutor() { } } - if (config_.enable_memory_optim_) { + if (config_.enable_memory_optim_ && !config_.use_optimized_model_) { auto *pass_res_info = inference::analysis::PassResultInfoForRuntime::Instance(); auto reuse_table = From 6a9d40bef5f325651110320346b67b4a3cada92b Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Thu, 7 Mar 2024 14:55:40 +0800 Subject: [PATCH 054/114] =?UTF-8?q?=E3=80=90PIR=20Dist=20Op=20Reg=20No.16?= =?UTF-8?q?=E3=80=91=20reg=20=20c=5Fsplit=20(#62416)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix * fix * fix * fix --- .../pir/dialect/op_generator/ops_api_gen.py | 1 + paddle/fluid/pir/dialect/operator/ir/ops.yaml | 9 ++++ .../fluid/pir/dialect/operator/utils/utils.cc | 1 + paddle/phi/api/yaml/op_compat.yaml | 6 +++ paddle/phi/infermeta/unary.cc | 9 ++++ paddle/phi/infermeta/unary.h | 2 + test/ir/pir/translator/CMakeLists.txt | 1 + .../pir/translator/test_c_split_translator.py | 48 +++++++++++++++++++ 8 files changed, 77 insertions(+) create mode 100644 test/ir/pir/translator/test_c_split_translator.py diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 638f13fd729a8..a9d29bb97da08 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -134,6 +134,7 @@ 'c_reduce_sum', 'c_reducescatter', 'c_softmax_with_cross_entropy', + 'c_split', 'decayed_adagrad', 'distributed_lookup_table', 'dpsgd', diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml index 9cc328dbe24fb..9d2ee247d72c7 100644 --- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml +++ 
b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -292,6 +292,15 @@ func : reduce_scatter param: [x, nranks] +- op : c_split + args : (Tensor x, int rank = 0, int nranks = 1, int ring_id = 0, bool use_calc_stream = false, bool use_model_parallel = true) + output : Tensor(out) + infer_meta : + func : CSplitInferMeta + param : [x, nranks] + kernel : + func : c_split + - op : c_sync_calc_stream args : (Tensor x) output : Tensor(out) diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc index 9a9df1fed3cdd..f7bdfabcbf75b 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.cc +++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc @@ -59,6 +59,7 @@ const std::unordered_set LegacyOpList = { CAllgatherOp::name(), CSoftmaxWithCrossEntropyOp::name(), CSoftmaxWithCrossEntropyGradOp::name(), + CSplitOp::name(), SeedOp::name(), ShareDataOp::name(), SparseMomentumOp::name(), diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 2c6129c30fb81..eb154cbfa1a92 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -471,6 +471,12 @@ outputs : {softmax : Softmax, loss : Loss} +- op : c_split + inputs : + x : X + outputs : + out : Out + - op : cast inputs : x : X diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 5596b9bb798e9..11cd3f4e45d26 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -738,6 +738,15 @@ void CropInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void CSplitInferMeta(const MetaTensor& x, int nranks, MetaTensor* out) { + phi::DDim dim = x.dims(); + dim[dim.size() - 1] = dim[dim.size() - 1] / nranks; + if (dim[0] < 0) dim[0] = -1; + out->set_dims(dim); + out->set_layout(x.layout()); + out->set_dtype(x.dtype()); +} + void DecodeJpegInferMeta(const MetaTensor& x, const std::string& mode, MetaTensor* out) { diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index d62789bd5183c..63e7c1fd3cf31 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -137,6 +137,8 @@ void CropInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void CSplitInferMeta(const MetaTensor& x, int nranks, MetaTensor* out); + void CumInferMeta(const MetaTensor& x, int axis, bool flatten, diff --git a/test/ir/pir/translator/CMakeLists.txt b/test/ir/pir/translator/CMakeLists.txt index b7fd892ea35a5..01282d80f1723 100644 --- a/test/ir/pir/translator/CMakeLists.txt +++ b/test/ir/pir/translator/CMakeLists.txt @@ -7,6 +7,7 @@ string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}") set(DISTRIBUTED_OP_TRANSLATOR_TEST test_c_reduce_min_translator) list(APPEND DISTRIBUTED_OP_TRANSLATOR_TEST test_c_allreduce_min_translator) list(APPEND DISTRIBUTED_OP_TRANSLATOR_TEST test_c_allreduce_prod_translator) +list(APPEND DISTRIBUTED_OP_TRANSLATOR_TEST test_c_split_translator) list(APPEND DISTRIBUTED_OP_TRANSLATOR_TEST test_distributed_lookup_table_translate) list(APPEND DISTRIBUTED_OP_TRANSLATOR_TEST test_distributed_fused_lamb_init) diff --git a/test/ir/pir/translator/test_c_split_translator.py b/test/ir/pir/translator/test_c_split_translator.py new file mode 100644 index 0000000000000..e09194e9ca019 --- /dev/null +++ b/test/ir/pir/translator/test_c_split_translator.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import test_op_translator + +import paddle +from paddle.base.layer_helper import LayerHelper + + +class TestCSplitOpTranslator(test_op_translator.TestOpTranslator): + def append_op(self): + self.op_type = "c_split" + x = paddle.ones(shape=(100, 2, 2), dtype='float32') + y = paddle.ones(shape=(100, 2, 2), dtype='float32') + attrs = { + 'rank': 0, + 'nranks': 2, + 'ring_id': 0, + 'use_calc_stream': False, + 'use_model_parallel': True, + } + helper = LayerHelper(self.op_type) + helper.append_op( + type=self.op_type, + inputs={"X": x}, + outputs={"Out": y}, + attrs=attrs, + ) + + def test_translator(self): + self.check() + + +if __name__ == "__main__": + unittest.main() From a726f8253ac042fcf0ebe8519e73d5c8d13d8b14 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Thu, 7 Mar 2024 14:58:36 +0800 Subject: [PATCH 055/114] [PIR] move pir::DenseTensorType registration from OperatorDialect to BuiltinDialect (#62491) --- .../pir/dialect/operator/ir/op_dialect.cc | 42 +- .../pir/dialect/operator/ir/op_dialect.h | 1 - .../dialect/operator/ir/op_onednn_dialect.cc | 39 +- .../dialect/operator/ir/op_onednn_dialect.h | 1 - paddle/fluid/pybind/pir.cc | 18 +- paddle/pir/include/core/builtin_dialect.h | 7 +- paddle/pir/src/core/builtin_dialect.cc | 52 +- test/cpp/pir/core/TestParserText.txt | 8 +- test/cpp/pir/core/add_dialect_parser_test.cc | 2 +- test/ir/pir/cinn/symbolic/simple_llama.config | 500 +++++++++--------- .../symbolic/test_llama_group_log_softmax.py | 2 +- test/ir/pir/test_ir_pybind.py | 5 +- .../test_fused_rotary_position_embedding.py | 4 +- 13 files changed, 328 insertions(+), 353 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc index 6816d64a05467..7262589c7ad3a 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc @@ -205,15 +205,7 @@ OperatorDialect::OperatorDialect(pir::IrContext* ctx) void PrintTypeImpl(pir::Type type, std::ostream& os) { os << type.dialect().name(); os << '.'; - if (auto tensor_type = type.dyn_cast()) { - os << "tensor<"; - for (auto d : common::vectorize(tensor_type.dims())) { - os << d; - os << "x"; - } - tensor_type.dtype().Print(os); - os << ">"; - } else if (auto selected_rows_type = type.dyn_cast()) { + if (auto selected_rows_type = type.dyn_cast()) { os << "selectedrows<"; for (auto d : common::vectorize(selected_rows_type.dims())) { os << d; @@ -266,8 +258,7 @@ void PrintOperationImpl(pir::Operation* op, } void OperatorDialect::initialize() { - RegisterTypes(); RegisterAttributes dim{}; - Token dim_token = parser.PeekToken(); - while (dim_token.token_type_ == DIGIT) { - dim_token = parser.ConsumeToken(); - dim.push_back(atoi(dim_token.val_.c_str())); - std::string peek_token_val = parser.PeekToken().val_; - if (peek_token_val[0] != 'x') { - break; - } - parser.ConsumeToken(); - 
parser.lexer->Unget(static_cast(peek_token_val.size() - 1)); - if (parser.PeekToken().token_type_ != DIGIT) { - break; - } - } - phi::DDim ddim = common::make_ddim(dim); - pir::Type dtype = parser.ParseType(); - std::vector> lod; - std::vector lodv; - lodv.push_back(0); - lod.push_back(lodv); - parser.ConsumeAToken(">"); - return DenseTensorType::get( - parser.ctx, dtype, ddim, phi::DataLayout::UNDEFINED, lod, 0); -} - pir::Attribute OperatorDialect::ParseAttribute( pir::IrParser& parser) { // NOLINT std::string type_name = parser.ConsumeToken().val_; diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.h b/paddle/fluid/pir/dialect/operator/ir/op_dialect.h index ae7dc883f8911..deda7b3ddcdd0 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.h +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.h @@ -29,7 +29,6 @@ class TEST_API OperatorDialect : public pir::Dialect { static const char* name() { return "pd_op"; } - pir::Type ParseType(pir::IrParser& parser) override; // NOLINT pir::Attribute ParseAttribute(pir::IrParser& parser) override; // NOLINT void PrintType(pir::Type type, std::ostream& os) const override; diff --git a/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.cc index 5b7323264c626..8ea9f0a7ce02f 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.cc @@ -68,15 +68,7 @@ void OneDNNOperatorDialect::initialize() { void OneDNNOperatorDialect::PrintType(pir::Type type, std::ostream &os) const { os << type.dialect().name(); os << '.'; - if (auto tensor_type = type.dyn_cast()) { - os << "tensor<"; - for (auto d : common::vectorize(tensor_type.dims())) { - os << d; - os << "x"; - } - tensor_type.dtype().Print(os); - os << ">"; - } else if (auto selected_rows_type = type.dyn_cast()) { + if (auto selected_rows_type = type.dyn_cast()) { os << "selectedrows<"; for (auto d : common::vectorize(selected_rows_type.dims())) { os << d; @@ -117,35 +109,6 @@ void OneDNNOperatorDialect::PrintAttribute(pir::Attribute attr, } } -pir::Type OneDNNOperatorDialect::ParseType(pir::IrParser &parser) { // NOLINT - parser.ConsumeAToken("pd_op.tensor"); - parser.ConsumeAToken("<"); - std::vector dim{}; - Token dim_token = parser.PeekToken(); - while (dim_token.token_type_ == DIGIT) { - dim_token = parser.ConsumeToken(); - dim.push_back(atoi(dim_token.val_.c_str())); - std::string peek_token_val = parser.PeekToken().val_; - if (peek_token_val[0] != 'x') { - break; - } - parser.ConsumeToken(); - parser.lexer->Unget(static_cast(peek_token_val.size() - 1)); - if (parser.PeekToken().token_type_ != DIGIT) { - break; - } - } - phi::DDim ddim = common::make_ddim(dim); - pir::Type dtype = parser.ParseType(); - std::vector> lod; - std::vector lodv; - lodv.push_back(0); - lod.push_back(lodv); - parser.ConsumeAToken(">"); - return DenseTensorType::get( - parser.ctx, dtype, ddim, phi::DataLayout::UNDEFINED, lod, 0); -} - pir::Attribute OneDNNOperatorDialect::ParseAttribute( pir::IrParser &parser) { // NOLINT std::string type_name = parser.ConsumeToken().val_; diff --git a/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.h b/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.h index 405c9346e2fa8..6ef33672c9c96 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.h +++ b/paddle/fluid/pir/dialect/operator/ir/op_onednn_dialect.h @@ -25,7 +25,6 @@ class OneDNNOperatorDialect : public pir::Dialect { static const char* name() { return 
"onednn_op"; } - pir::Type ParseType(pir::IrParser& parser) override; // NOLINT pir::Attribute ParseAttribute(pir::IrParser& parser) override; // NOLINT void PrintType(pir::Type type, std::ostream& os) const override; diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index b76e23fe53eef..6301c1f99a434 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -1543,10 +1543,10 @@ void BindUtils(pybind11::module *m) { >>> print(pir_program) { - (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float32,is_persistable:[false],name:"x",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4,4],stop_gradient:[false]} : () -> pd_op.tensor<4x4xf32> - (%1) = "pd_op.matmul" (%0, %0) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<4x4xf32>, pd_op.tensor<4x4xf32>) -> pd_op.tensor<4x4xf32> - (%2) = "pd_op.add" (%1, %1) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4x4xf32>, pd_op.tensor<4x4xf32>) -> pd_op.tensor<4x4xf32> - (%3) = "pd_op.tanh" (%2) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4x4xf32>) -> pd_op.tensor<4x4xf32> + (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float32,is_persistable:[false],name:"x",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4,4],stop_gradient:[false]} : () -> builtin.tensor<4x4xf32> + (%1) = "pd_op.matmul" (%0, %0) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<4x4xf32>, builtin.tensor<4x4xf32>) -> builtin.tensor<4x4xf32> + (%2) = "pd_op.add" (%1, %1) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<4x4xf32>, builtin.tensor<4x4xf32>) -> builtin.tensor<4x4xf32> + (%3) = "pd_op.tanh" (%2) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<4x4xf32>) -> builtin.tensor<4x4xf32> } @@ -1618,14 +1618,14 @@ void BindUtils(pybind11::module *m) { >>> print(pir_program) { - (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float32,is_persistable:[false],name:"x",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4,4],stop_gradient:[false]} : () -> pd_op.tensor<4x4xf32> - (%1) = "pd_op.matmul" (%0, %0) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<4x4xf32>, pd_op.tensor<4x4xf32>) -> pd_op.tensor<4x4xf32> - (%2) = "pd_op.add" (%1, %1) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4x4xf32>, pd_op.tensor<4x4xf32>) -> pd_op.tensor<4x4xf32> - (%3) = "pd_op.tanh" (%2) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4x4xf32>) -> pd_op.tensor<4x4xf32> + (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float32,is_persistable:[false],name:"x",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4,4],stop_gradient:[false]} : () -> builtin.tensor<4x4xf32> + (%1) = "pd_op.matmul" (%0, %0) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<4x4xf32>, builtin.tensor<4x4xf32>) -> builtin.tensor<4x4xf32> + (%2) = "pd_op.add" (%1, %1) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<4x4xf32>, builtin.tensor<4x4xf32>) -> builtin.tensor<4x4xf32> + (%3) = "pd_op.tanh" (%2) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<4x4xf32>) -> builtin.tensor<4x4xf32> } >>> print(mappings) - {'matmul_v2_0.tmp_0': [Value(define_op_name=pd_op.matmul, index=0, dtype=pd_op.tensor<4x4xf32>)], 'x': [Value(define_op_name=pd_op.data, index=0, dtype=pd_op.tensor<4x4xf32>)], 'tanh_0.tmp_0': [Value(define_op_name=pd_op.tanh, 
index=0, dtype=pd_op.tensor<4x4xf32>)], 'elementwise_add_0': [Value(define_op_name=pd_op.add, index=0, dtype=pd_op.tensor<4x4xf32>)]} + {'matmul_v2_0.tmp_0': [Value(define_op_name=pd_op.matmul, index=0, dtype=builtin.tensor<4x4xf32>)], 'x': [Value(define_op_name=pd_op.data, index=0, dtype=builtin.tensor<4x4xf32>)], 'tanh_0.tmp_0': [Value(define_op_name=pd_op.tanh, index=0, dtype=builtin.tensor<4x4xf32>)], 'elementwise_add_0': [Value(define_op_name=pd_op.add, index=0, dtype=builtin.tensor<4x4xf32>)]} )DOC"); m->def("clear_pir_compiler_manager", []() { diff --git a/paddle/pir/include/core/builtin_dialect.h b/paddle/pir/include/core/builtin_dialect.h index 1203cdec9d283..193141750283c 100644 --- a/paddle/pir/include/core/builtin_dialect.h +++ b/paddle/pir/include/core/builtin_dialect.h @@ -24,14 +24,17 @@ namespace pir { /// class IR_API BuiltinDialect : public pir::Dialect { public: - explicit BuiltinDialect(pir::IrContext *context); + explicit BuiltinDialect(pir::IrContext* context); /// /// \brief Each Dialect needs to provide a name function to return the name of /// the Dialect. /// /// \return The name of this Dialect. /// - static const char *name() { return "builtin"; } + static const char* name() { return "builtin"; } + + pir::Type ParseType(pir::IrParser& parser) override; // NOLINT + void PrintType(pir::Type type, std::ostream& os) const override; private: void initialize(); diff --git a/paddle/pir/src/core/builtin_dialect.cc b/paddle/pir/src/core/builtin_dialect.cc index 8b450ffbc1d09..db4fc1808c300 100644 --- a/paddle/pir/src/core/builtin_dialect.cc +++ b/paddle/pir/src/core/builtin_dialect.cc @@ -13,12 +13,16 @@ // limitations under the License. #include "paddle/pir/include/core/builtin_dialect.h" + +#include "paddle/common/ddim.h" +#include "paddle/common/layout.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/builtin_op.h" #include "paddle/pir/include/core/builtin_type.h" +#include "paddle/pir/include/core/parser/ir_parser.h" namespace pir { -BuiltinDialect::BuiltinDialect(IrContext *context) +BuiltinDialect::BuiltinDialect(IrContext* context) : Dialect(name(), context, TypeId::get()) { initialize(); } @@ -38,7 +42,8 @@ void BuiltinDialect::initialize() { BoolType, Complex64Type, Complex128Type, - VectorType>(); + VectorType, + DenseTensorType>(); RegisterAttributes(); } +pir::Type BuiltinDialect::ParseType(pir::IrParser& parser) { // NOLINT + parser.ConsumeAToken("builtin.tensor"); + parser.ConsumeAToken("<"); + std::vector dim{}; + Token dim_token = parser.PeekToken(); + while (dim_token.token_type_ == DIGIT) { + dim_token = parser.ConsumeToken(); + dim.push_back(atoi(dim_token.val_.c_str())); + std::string peek_token_val = parser.PeekToken().val_; + if (peek_token_val[0] != 'x') { + break; + } + parser.ConsumeToken(); + parser.lexer->Unget(static_cast(peek_token_val.size() - 1)); + if (parser.PeekToken().token_type_ != DIGIT) { + break; + } + } + pir::DDim ddim = common::make_ddim(dim); + pir::Type dtype = parser.ParseType(); + std::vector> lod; + std::vector lodv; + lodv.push_back(0); + lod.push_back(lodv); + parser.ConsumeAToken(">"); + return DenseTensorType::get( + parser.ctx, dtype, ddim, pir::DataLayout::UNDEFINED, lod, 0); +} + +void BuiltinDialect::PrintType(pir::Type type, std::ostream& os) const { + os << type.dialect().name(); + os << '.'; + if (auto tensor_type = type.dyn_cast()) { + os << "tensor<"; + for (auto d : common::vectorize(tensor_type.dims())) { + os << d; + os << "x"; + } + tensor_type.dtype().Print(os); + 
os << ">"; + } +} + } // namespace pir IR_DEFINE_EXPLICIT_TYPE_ID(pir::BuiltinDialect) diff --git a/test/cpp/pir/core/TestParserText.txt b/test/cpp/pir/core/TestParserText.txt index 10737e3108eb0..275520daeb964 100644 --- a/test/cpp/pir/core/TestParserText.txt +++ b/test/cpp/pir/core/TestParserText.txt @@ -27,14 +27,14 @@ f32 //END //CHECK type -pd_op.tensor<256xf32> +builtin.tensor<256xf32> //END //CHECK program { - (%0) = "builtin.parameter" () {parameter_name:"conv2d_0.w_0"} : () -> pd_op.tensor<64x3x7x7xf32> - (%1) = "pd_op.feed" () {col:(Int32)0,is_persistable:[false],name:"data",stop_gradient:[true]} : () -> pd_op.tensor<-1x3x224x224xf32> - (%2) = "pd_op.conv2d" (%1, %0) {data_format:"NCHW",dilations:[(Int32)1,(Int32)1],groups:(Int32)1,is_persistable:[false],padding_algorithm:"EXPLICIT",paddings:[(Int32)3,(Int32)3],stop_gradient:[false],strides:[(Int32)2,(Int32)2]} : (pd_op.tensor<-1x3x224x224xf32>, pd_op.tensor<64x3x7x7xf32>) -> pd_op.tensor<-1x64x112x112xf32> + (%0) = "builtin.parameter" () {parameter_name:"conv2d_0.w_0"} : () -> builtin.tensor<64x3x7x7xf32> + (%1) = "pd_op.feed" () {col:(Int32)0,is_persistable:[false],name:"data",stop_gradient:[true]} : () -> builtin.tensor<-1x3x224x224xf32> + (%2) = "pd_op.conv2d" (%1, %0) {data_format:"NCHW",dilations:[(Int32)1,(Int32)1],groups:(Int32)1,is_persistable:[false],padding_algorithm:"EXPLICIT",paddings:[(Int32)3,(Int32)3],stop_gradient:[false],strides:[(Int32)2,(Int32)2]} : (builtin.tensor<-1x3x224x224xf32>, builtin.tensor<64x3x7x7xf32>) -> builtin.tensor<-1x64x112x112xf32> } //END diff --git a/test/cpp/pir/core/add_dialect_parser_test.cc b/test/cpp/pir/core/add_dialect_parser_test.cc index 1b6ae533ffa16..7a84ac142c750 100644 --- a/test/cpp/pir/core/add_dialect_parser_test.cc +++ b/test/cpp/pir/core/add_dialect_parser_test.cc @@ -102,7 +102,7 @@ TEST(IrParserTest, AddAttribute) { std::string op_str = "(%0) = \"builtin.parameter\" () " "{parameter_name:\"conv2d_0.w_0\",test:(tp.char)a} : () -> " - "pd_op.tensor<64x3x7x7xf32>"; + "builtin.tensor<64x3x7x7xf32>"; std::stringstream ss; ss << op_str; pir::IrParser* parser = new pir::IrParser(ctx, ss); diff --git a/test/ir/pir/cinn/symbolic/simple_llama.config b/test/ir/pir/cinn/symbolic/simple_llama.config index ef3193a8cc735..1e80f206a970d 100644 --- a/test/ir/pir/cinn/symbolic/simple_llama.config +++ b/test/ir/pir/cinn/symbolic/simple_llama.config @@ -1,252 +1,252 @@ { - (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"embedding_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[32000,4096],stop_gradient:[true]} : () -> pd_op.tensor<32000x4096xf16> - (%1) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"create_parameter_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096],stop_gradient:[true]} : () -> pd_op.tensor<4096xf16> - (%2) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> pd_op.tensor<4096x4096xf16> - (%3) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_1.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> pd_op.tensor<4096x4096xf16> - (%4) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_2.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> pd_op.tensor<4096x4096xf16> - (%5) = "pd_op.data" () 
{dtype:(pd_op.DataType)float32,name:"eager_tmp_1",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1,2048,1,128],stop_gradient:[true]} : () -> pd_op.tensor<1x2048x1x128xf32> - (%6) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"eager_tmp_2",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1,2048,1,128],stop_gradient:[true]} : () -> pd_op.tensor<1x2048x1x128xf32> - (%7) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_3.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> pd_op.tensor<4096x4096xf16> - (%8) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"create_parameter_1.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096],stop_gradient:[true]} : () -> pd_op.tensor<4096xf16> - (%9) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_4.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,11008],stop_gradient:[true]} : () -> pd_op.tensor<4096x11008xf16> - (%10) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_5.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,11008],stop_gradient:[true]} : () -> pd_op.tensor<4096x11008xf16> - (%11) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_6.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[11008,4096],stop_gradient:[true]} : () -> pd_op.tensor<11008x4096xf16> - (%12) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"create_parameter_2.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096],stop_gradient:[true]} : () -> pd_op.tensor<4096xf16> - (%13) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"llama_lm_head_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,32000],stop_gradient:[true]} : () -> pd_op.tensor<4096x32000xf16> - (%14) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"top_p",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1],stop_gradient:[true]} : () -> pd_op.tensor<1xf32> - (%15) = "pd_op.data" () {dtype:(pd_op.DataType)int64,name:"position_ids",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[-1,-1],stop_gradient:[true]} : () -> pd_op.tensor<-1x-1xi64> - (%16) = "pd_op.data" () {dtype:(pd_op.DataType)int64,name:"attention_mask",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[-1,-1],stop_gradient:[true]} : () -> pd_op.tensor<-1x-1xi64> - (%17) = "pd_op.data" () {dtype:(pd_op.DataType)int64,name:"input_ids",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[-1,-1],stop_gradient:[true]} : () -> pd_op.tensor<-1x-1xi64> - (%18) = "pd_op.shape" (%17) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<2xi32> - (%19) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> - (%20) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%21) = "pd_op.slice" (%18, %19, %20) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%22) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%23) = "pd_op.full_int_array" () 
{dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> - (%24) = "pd_op.slice" (%18, %22, %23) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%25) = "pd_op.cast" (%24) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor) -> pd_op.tensor - (%26) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%27) = "pd_op.full_with_tensor" (%26, %25) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>, pd_op.tensor) -> pd_op.tensor<1xi64> - (%28) = "pd_op.shape" (%17) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<2xi32> - (%29) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%30) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> - (%31) = "pd_op.slice" (%28, %29, %30) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%32) = "pd_op.cast" (%31) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor) -> pd_op.tensor - (%33) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%34) = "pd_op.full_with_tensor" (%33, %32) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>, pd_op.tensor) -> pd_op.tensor<1xi64> - (%35) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[],stop_gradient:[false],value:(Float)1} : () -> pd_op.tensor - (%36) = "builtin.combine" (%21, %35) {} : (pd_op.tensor, pd_op.tensor) -> vec[pd_op.tensor,pd_op.tensor] - (%37) = "pd_op.stack" (%36) {axis:(Int32)0,stop_gradient:[true]} : (vec[pd_op.tensor,pd_op.tensor]) -> pd_op.tensor<2xi32> - (%38) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> - (%39) = "pd_op.full_with_tensor" (%37, %38) {dtype:(pd_op.DataType)bool,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x1xb> - (%40) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[],stop_gradient:[false],value:(Float)1} : () -> pd_op.tensor - (%41) = "builtin.combine" (%21, %40) {} : (pd_op.tensor, pd_op.tensor) -> vec[pd_op.tensor,pd_op.tensor] - (%42) = "pd_op.stack" (%41) {axis:(Int32)0,stop_gradient:[true]} : (vec[pd_op.tensor,pd_op.tensor]) -> pd_op.tensor<2xi32> - (%43) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> pd_op.tensor<1xf32> - (%44) = "pd_op.full_with_tensor" (%42, %43) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, 
pd_op.tensor<1xf32>) -> pd_op.tensor<-1x1xf16> - (%45) = "pd_op.shape" (%17) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<2xi32> - (%46) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%47) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> - (%48) = "pd_op.slice" (%45, %46, %47) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%49) = "pd_op.embedding" (%17, %0) {is_persistable:[false],padding_idx:(Int64)-1,sparse:false,stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<32000x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%50) = "pd_op.shape" (%16) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<2xi32> - (%51) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> - (%52) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%53) = "pd_op.slice" (%50, %51, %52) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%54) = "pd_op.shape" (%16) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<2xi32> - (%55) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%56) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> - (%57) = "pd_op.slice" (%54, %55, %56) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%58) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1,(Int64)2]} : () -> pd_op.tensor<2xi64> - (%59, %60) = "pd_op.unsqueeze" (%16, %58) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<2xi64>) -> pd_op.tensor<-1x1x1x-1xi64>, <> - (%61) = "pd_op.cast" (%59) {dtype:(pd_op.DataType)bool,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x1x-1xi64>) -> pd_op.tensor<-1x1x1x-1xb> - (%62) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)1} : () -> pd_op.tensor<1xi32> - (%63) = "builtin.combine" (%53, %62, %48, %57) {} : (pd_op.tensor, pd_op.tensor<1xi32>, pd_op.tensor, pd_op.tensor) -> vec[pd_op.tensor,pd_op.tensor<1xi32>,pd_op.tensor,pd_op.tensor] - (%64) = "pd_op.expand" (%61, %63) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x1x-1xb>, vec[pd_op.tensor,pd_op.tensor<1xi32>,pd_op.tensor,pd_op.tensor]) -> pd_op.tensor<-1x1x-1x-1xb> - (%65) = "pd_op.full" () 
{dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)0} : () -> pd_op.tensor<1xf64> - (%66) = "pd_op.full" () {dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)-65504} : () -> pd_op.tensor<1xf64> - (%67) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> pd_op.tensor<1xf32> - (%68) = "pd_op.full_like" (%65, %67) {dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<1xf32>) -> pd_op.tensor<1xf64> - (%69) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> pd_op.tensor<1xf32> - (%70) = "pd_op.full_like" (%66, %69) {dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<1xf32>) -> pd_op.tensor<1xf64> - (%71) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> pd_op.tensor<1xf32> - (%72) = "pd_op.full_like" (%64, %71) {dtype:(pd_op.DataType)bool,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xb>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x1x-1x-1xb> - (%73) = "pd_op.cast" (%72) {dtype:(pd_op.DataType)float64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xb>) -> pd_op.tensor<-1x1x-1x-1xf64> - (%74) = "pd_op.cast" (%64) {dtype:(pd_op.DataType)float64,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xb>) -> pd_op.tensor<-1x1x-1x-1xf64> - (%75) = "pd_op.add" (%68, %70) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<1xf64>) -> pd_op.tensor<1xf64> - (%76) = "pd_op.add" (%75, %73) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf64> - (%77) = "pd_op.add" (%65, %76) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf64> - (%78) = "pd_op.add" (%66, %76) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xf64>, pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf64> - (%79) = "pd_op.add" (%74, %76) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xf64>, pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf64> - (%80) = "pd_op.cast" (%79) {dtype:(pd_op.DataType)bool,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xb> - (%81) = "pd_op.where" (%80, %77, %78) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xb>, pd_op.tensor<-1x1x-1x-1xf64>, pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf64> - (%82) = "pd_op.cast" (%81) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1x-1x-1xf64>) -> pd_op.tensor<-1x1x-1x-1xf16> - (%83) = "pd_op.cast" (%49) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>) -> pd_op.tensor<-1x-1x4096xf32> - (%84) = "pd_op.pow" (%83) 
{is_persistable:[false],stop_gradient:[false],y:(Float)2} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> - (%85) = "pd_op.mean" (%84) {axis:(pd_op.IntArray)[-1],is_persistable:[false],keepdim:true,stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x1xf32> - (%86) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> - (%87) = "pd_op.scale" (%85, %86) {bias:(Float)1e-06,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1x1xf32> - (%88) = "pd_op.rsqrt" (%87) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>) -> pd_op.tensor<-1x-1x1xf32> - (%89) = "pd_op.multiply" (%88, %83) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> - (%90) = "pd_op.cast" (%89) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf16> - (%91) = "pd_op.multiply" (%90, %1) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%92) = "pd_op.matmul" (%91, %2) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%93) = "pd_op.matmul" (%91, %3) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%94) = "pd_op.matmul" (%91, %4) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%95) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)0,(Int64)32,(Int64)128]} : () -> pd_op.tensor<4xi64> - (%96, %97) = "pd_op.reshape" (%92, %95) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4xi64>) -> pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<0x-1x-1x4096xf16> - (%98) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)0,(Int64)32,(Int64)128]} : () -> pd_op.tensor<4xi64> - (%99, %100) = "pd_op.reshape" (%93, %98) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4xi64>) -> pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<0x-1x-1x4096xf16> - (%101) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)0,(Int64)32,(Int64)128]} : () -> pd_op.tensor<4xi64> - (%102, %103) = "pd_op.reshape" (%94, %101) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4xi64>) -> pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<0x-1x-1x4096xf16> - (%104) = "pd_op.shape" (%99) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<4xi32> - (%105) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%106) = "pd_op.full_int_array" () 
{dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> - (%107) = "pd_op.slice" (%104, %105, %106) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%108) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> - (%109) = "builtin.combine" (%107) {} : (pd_op.tensor) -> vec[pd_op.tensor] - (%110) = "pd_op.slice" (%5, %108, %109) {axes:[(Int64)1],decrease_axis:[],infer_flags:[(Int64)-1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1x2048x1x128xf32>, pd_op.tensor<1xi64>, vec[pd_op.tensor]) -> pd_op.tensor<1x-1x1x128xf32> - (%111) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> - (%112) = "builtin.combine" (%107) {} : (pd_op.tensor) -> vec[pd_op.tensor] - (%113) = "pd_op.slice" (%6, %111, %112) {axes:[(Int64)1],decrease_axis:[],infer_flags:[(Int64)-1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1x2048x1x128xf32>, pd_op.tensor<1xi64>, vec[pd_op.tensor]) -> pd_op.tensor<1x-1x1x128xf32> - (%114) = "pd_op.cast" (%110) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1x-1x1x128xf32>) -> pd_op.tensor<1x-1x1x128xf16> - (%115) = "pd_op.cast" (%113) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1x-1x1x128xf32>) -> pd_op.tensor<1x-1x1x128xf16> - (%116) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)2]} : () -> pd_op.tensor<2xi64> - (%117, %118) = "pd_op.squeeze" (%114, %116) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<1x-1x1x128xf16>, pd_op.tensor<2xi64>) -> pd_op.tensor<-1x128xf16>, <> - (%119) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)2]} : () -> pd_op.tensor<2xi64> - (%120, %121) = "pd_op.squeeze" (%115, %119) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<1x-1x1x128xf16>, pd_op.tensor<2xi64>) -> pd_op.tensor<-1x128xf16>, <> - (%122) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)-1]} : () -> pd_op.tensor<1xi64> - (%123, %124) = "pd_op.unsqueeze" (%15, %122) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x1xi64>, <> - (%125) = "pd_op.gather_nd" (%117, %123) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x128xf16>, pd_op.tensor<-1x-1x1xi64>) -> pd_op.tensor<-1x-1x128xf16> - (%126) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> - (%127, %128) = "pd_op.unsqueeze" (%125, %126) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x128xf16>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x1x128xf16>, <> - (%129) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)-1]} : () -> pd_op.tensor<1xi64> - (%130, %131) = "pd_op.unsqueeze" (%15, %129) 
{is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x1xi64>, <> - (%132) = "pd_op.gather_nd" (%120, %130) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x128xf16>, pd_op.tensor<-1x-1x1xi64>) -> pd_op.tensor<-1x-1x128xf16> - (%133) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> - (%134, %135) = "pd_op.unsqueeze" (%132, %133) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x128xf16>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x1x128xf16>, <> - (%136) = "pd_op.multiply" (%96, %127) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x1x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> - (%137) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> - (%138) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> pd_op.tensor<1xi64> - (%139) = "pd_op.slice" (%96, %137, %138) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x32x64xf16> - (%140) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> pd_op.tensor<1xi64> - (%141) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> pd_op.tensor<1xi64> - (%142) = "pd_op.slice" (%96, %140, %141) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x32x64xf16> - (%143) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> pd_op.tensor<1xf32> - (%144) = "pd_op.scale" (%142, %143) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x64xf16>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1x32x64xf16> - (%145) = "builtin.combine" (%144, %139) {} : (pd_op.tensor<-1x-1x32x64xf16>, pd_op.tensor<-1x-1x32x64xf16>) -> vec[pd_op.tensor<-1x-1x32x64xf16>,pd_op.tensor<-1x-1x32x64xf16>] - (%146) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> pd_op.tensor<1xi32> - (%147) = "pd_op.concat" (%145, %146) {is_persistable:[false],stop_gradient:[false]} : (vec[pd_op.tensor<-1x-1x32x64xf16>,pd_op.tensor<-1x-1x32x64xf16>], pd_op.tensor<1xi32>) -> pd_op.tensor<-1x-1x32x128xf16> - (%148) = "pd_op.multiply" (%147, %134) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x1x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> - (%149) = "pd_op.add" (%136, %148) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> - (%150) = "pd_op.multiply" (%99, %127) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x1x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> - 
(%151) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> - (%152) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> pd_op.tensor<1xi64> - (%153) = "pd_op.slice" (%99, %151, %152) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x32x64xf16> - (%154) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> pd_op.tensor<1xi64> - (%155) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> pd_op.tensor<1xi64> - (%156) = "pd_op.slice" (%99, %154, %155) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1x32x64xf16> - (%157) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> pd_op.tensor<1xf32> - (%158) = "pd_op.scale" (%156, %157) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x64xf16>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1x32x64xf16> - (%159) = "builtin.combine" (%158, %153) {} : (pd_op.tensor<-1x-1x32x64xf16>, pd_op.tensor<-1x-1x32x64xf16>) -> vec[pd_op.tensor<-1x-1x32x64xf16>,pd_op.tensor<-1x-1x32x64xf16>] - (%160) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> pd_op.tensor<1xi32> - (%161) = "pd_op.concat" (%159, %160) {is_persistable:[false],stop_gradient:[false]} : (vec[pd_op.tensor<-1x-1x32x64xf16>,pd_op.tensor<-1x-1x32x64xf16>], pd_op.tensor<1xi32>) -> pd_op.tensor<-1x-1x32x128xf16> - (%162) = "pd_op.multiply" (%161, %134) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x1x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> - (%163) = "pd_op.add" (%150, %162) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>, pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> - (%164) = "pd_op.shape" (%149) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<4xi32> - (%165) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> - (%166) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%167) = "pd_op.slice" (%164, %165, %166) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%168) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%169) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> - (%170) = "pd_op.slice" 
(%164, %168, %169) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%171) = "pd_op.shape" (%102) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<4xi32> - (%172) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%173) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> pd_op.tensor<1xi64> - (%174) = "pd_op.slice" (%171, %172, %173) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<4xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%175) = "pd_op.transpose" (%149) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<-1x32x-1x128xf16> - (%176) = "pd_op.transpose" (%163) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<-1x32x-1x128xf16> - (%177) = "pd_op.transpose" (%102) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32x128xf16>) -> pd_op.tensor<-1x32x-1x128xf16> - (%178) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0.0883883} : () -> pd_op.tensor<1xf32> - (%179) = "pd_op.scale" (%175, %178) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x128xf16>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x32x-1x128xf16> - (%180) = "pd_op.transpose" (%176) {is_persistable:[false],perm:[(Int32)0,(Int32)1,(Int32)3,(Int32)2],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x128xf16>) -> pd_op.tensor<-1x32x128x-1xf16> - (%181) = "pd_op.matmul" (%179, %180) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x32x-1x128xf16>, pd_op.tensor<-1x32x128x-1xf16>) -> pd_op.tensor<-1x32x-1x-1xf16> - (%182) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)1} : () -> pd_op.tensor<1xi32> - (%183) = "builtin.combine" (%167, %182, %170, %174) {} : (pd_op.tensor, pd_op.tensor<1xi32>, pd_op.tensor, pd_op.tensor) -> vec[pd_op.tensor,pd_op.tensor<1xi32>,pd_op.tensor,pd_op.tensor] - (%184, %185) = "pd_op.reshape" (%82, %183) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x1x-1x-1xf16>, vec[pd_op.tensor,pd_op.tensor<1xi32>,pd_op.tensor,pd_op.tensor]) -> pd_op.tensor<-1x1x-1x-1xf16>, pd_op.tensor<0x-1x1x-1x-1xf16> - (%186) = "pd_op.add" (%181, %184) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x-1xf16>, pd_op.tensor<-1x1x-1x-1xf16>) -> pd_op.tensor<-1x32x-1x-1xf16> - (%187) = "pd_op.cast" (%186) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x-1xf16>) -> pd_op.tensor<-1x32x-1x-1xf32> - (%188) = "pd_op.softmax" (%187) {axis:(Int32)-1,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x-1xf32>) -> pd_op.tensor<-1x32x-1x-1xf32> - (%189) = "pd_op.cast" (%188) 
{dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x-1xf32>) -> pd_op.tensor<-1x32x-1x-1xf16> - (%190) = "pd_op.matmul" (%189, %177) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x32x-1x-1xf16>, pd_op.tensor<-1x32x-1x128xf16>) -> pd_op.tensor<-1x32x-1x128xf16> - (%191) = "pd_op.transpose" (%190) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (pd_op.tensor<-1x32x-1x128xf16>) -> pd_op.tensor<-1x-1x32x128xf16> - (%192) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)4096} : () -> pd_op.tensor<1xi32> - (%193) = "builtin.combine" (%167, %170, %192) {} : (pd_op.tensor, pd_op.tensor, pd_op.tensor<1xi32>) -> vec[pd_op.tensor,pd_op.tensor,pd_op.tensor<1xi32>] - (%194, %195) = "pd_op.reshape" (%191, %193) {is_persistable:[false,false],stop_gradient:[false,false]} : (pd_op.tensor<-1x-1x32x128xf16>, vec[pd_op.tensor,pd_op.tensor,pd_op.tensor<1xi32>]) -> pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<0x-1x-1x32x128xf16> - (%196) = "pd_op.matmul" (%194, %7) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%197) = "pd_op.add" (%49, %196) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<-1x-1x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%198) = "pd_op.cast" (%197) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>) -> pd_op.tensor<-1x-1x4096xf32> - (%199) = "pd_op.pow" (%198) {is_persistable:[false],stop_gradient:[false],y:(Float)2} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> - (%200) = "pd_op.mean" (%199) {axis:(pd_op.IntArray)[-1],is_persistable:[false],keepdim:true,stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x1xf32> - (%201) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> - (%202) = "pd_op.scale" (%200, %201) {bias:(Float)1e-06,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1x1xf32> - (%203) = "pd_op.rsqrt" (%202) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>) -> pd_op.tensor<-1x-1x1xf32> - (%204) = "pd_op.multiply" (%203, %198) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> - (%205) = "pd_op.cast" (%204) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf16> - (%206) = "pd_op.multiply" (%205, %8) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%207) = "pd_op.matmul" (%206, %9) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x11008xf16>) -> pd_op.tensor<-1x-1x11008xf16> - (%208) = "pd_op.matmul" (%206, %10) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x11008xf16>) -> 
pd_op.tensor<-1x-1x11008xf16> - (%209) = "pd_op.swiglu" (%207, %208) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x11008xf16>, pd_op.tensor<-1x-1x11008xf16>) -> pd_op.tensor<-1x-1x11008xf16> - (%210) = "pd_op.matmul" (%209, %11) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x11008xf16>, pd_op.tensor<11008x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%211) = "pd_op.add" (%197, %210) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<-1x-1x4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%212) = "pd_op.cast" (%211) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>) -> pd_op.tensor<-1x-1x4096xf32> - (%213) = "pd_op.pow" (%212) {is_persistable:[false],stop_gradient:[false],y:(Float)2} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> - (%214) = "pd_op.mean" (%213) {axis:(pd_op.IntArray)[-1],is_persistable:[false],keepdim:true,stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x1xf32> - (%215) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> - (%216) = "pd_op.scale" (%214, %215) {bias:(Float)1e-06,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1x1xf32> - (%217) = "pd_op.rsqrt" (%216) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>) -> pd_op.tensor<-1x-1x1xf32> - (%218) = "pd_op.multiply" (%217, %212) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x1xf32>, pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf32> - (%219) = "pd_op.cast" (%218) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf32>) -> pd_op.tensor<-1x-1x4096xf16> - (%220) = "pd_op.multiply" (%219, %12) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096xf16>) -> pd_op.tensor<-1x-1x4096xf16> - (%221) = "pd_op.matmul" (%220, %13) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (pd_op.tensor<-1x-1x4096xf16>, pd_op.tensor<4096x32000xf16>) -> pd_op.tensor<-1x-1x32000xf16> - (%222) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)-1]} : () -> pd_op.tensor<1xi64> - (%223) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> pd_op.tensor<1xi64> - (%224) = "pd_op.slice" (%221, %222, %223) {axes:[(Int64)1],decrease_axis:[(Int64)1],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1x32000xf16>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<-1x32000xf16> - (%225) = "pd_op.softmax" (%224) {axis:(Int32)-1,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32000xf16>) -> pd_op.tensor<-1x32000xf16> - (%226) = "pd_op.log_softmax" (%224) {axis:(Int32)-1,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32000xf16>) -> pd_op.tensor<-1x32000xf16> - (%227) = "pd_op.shape" (%225) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32000xf16>) -> pd_op.tensor<2xi32> - (%228) = "pd_op.full_int_array" () 
{dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> pd_op.tensor<1xi64> - (%229) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> pd_op.tensor<1xi64> - (%230) = "pd_op.slice" (%227, %228, %229) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor - (%231) = "pd_op.cast" (%14) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xf32>) -> pd_op.tensor<1xf16> - (%232) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[],stop_gradient:[false],value:(Float)1} : () -> pd_op.tensor - (%233) = "builtin.combine" (%230, %232) {} : (pd_op.tensor, pd_op.tensor) -> vec[pd_op.tensor,pd_op.tensor] - (%234) = "pd_op.stack" (%233) {axis:(Int32)0,stop_gradient:[true]} : (vec[pd_op.tensor,pd_op.tensor]) -> pd_op.tensor<2xi32> - (%235) = "pd_op.full_with_tensor" (%234, %231) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<2xi32>, pd_op.tensor<1xf16>) -> pd_op.tensor<-1x1xf16> - (%236, %237) = "pd_op.top_p_sampling" (%225, %235, <>) {is_persistable:[false,false],seed:(Int32)-1,stop_gradient:[false,false]} : (pd_op.tensor<-1x32000xf16>, pd_op.tensor<-1x1xf16>, <>) -> pd_op.tensor<-1x1xf16>, pd_op.tensor<-1x1xi64> - (%238) = "pd_op.index_sample" (%226, %237) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x32000xf16>, pd_op.tensor<-1x1xi64>) -> pd_op.tensor<-1x1xf16> - (%239) = "pd_op.subtract" (%27, %34) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>, pd_op.tensor<1xi64>) -> pd_op.tensor<1xi64> - (%240) = "pd_op.cast" (%239) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>) -> pd_op.tensor<1xf16> - (%241) = "pd_op.multiply" (%44, %240) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xf16>, pd_op.tensor<1xf16>) -> pd_op.tensor<-1x1xf16> - (%242) = "pd_op.add" (%241, %238) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xf16>, pd_op.tensor<-1x1xf16>) -> pd_op.tensor<-1x1xf16> - (%243) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> - (%244) = "pd_op.scale" (%239, %243) {bias:(Float)1,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>, pd_op.tensor<1xf32>) -> pd_op.tensor<1xi64> - (%245) = "pd_op.cast" (%244) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<1xi64>) -> pd_op.tensor<1xf16> - (%246) = "pd_op.divide" (%242, %245) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xf16>, pd_op.tensor<1xf16>) -> pd_op.tensor<-1x1xf16> - (%247) = "pd_op.where" (%39, %246, %44) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xb>, pd_op.tensor<-1x1xf16>, pd_op.tensor<-1x1xf16>) -> pd_op.tensor<-1x1xf16> - (%248) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> pd_op.tensor<1xf32> - (%249) = "pd_op.full_like" (%237, %248) {dtype:(pd_op.DataType)int64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} 
: (pd_op.tensor<-1x1xi64>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x1xi64> - (%250) = "pd_op.where" (%39, %237, %249) {is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xb>, pd_op.tensor<-1x1xi64>, pd_op.tensor<-1x1xi64>) -> pd_op.tensor<-1x1xi64> - (%251) = "builtin.combine" (%17, %250) {} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<-1x1xi64>) -> vec[pd_op.tensor<-1x-1xi64>,pd_op.tensor<-1x1xi64>] - (%252) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xi32> - (%253) = "pd_op.concat" (%251, %252) {is_persistable:[false],stop_gradient:[false]} : (vec[pd_op.tensor<-1x-1xi64>,pd_op.tensor<-1x1xi64>], pd_op.tensor<1xi32>) -> pd_op.tensor<-1x-1xi64> - (%254) = "builtin.combine" (%31) {} : (pd_op.tensor) -> vec[pd_op.tensor] - (%255) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> pd_op.tensor<1xi64> - (%256) = "pd_op.slice" (%253, %254, %255) {axes:[(Int64)1],decrease_axis:[],infer_flags:[(Int64)-1],is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>, vec[pd_op.tensor], pd_op.tensor<1xi64>) -> pd_op.tensor<-1x-1xi64> - (%257) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> - (%258) = "pd_op.scale" (%256, %257) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x-1xi64>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x-1xi64> - (%259) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> pd_op.tensor<1xf32> - (%260) = "pd_op.scale" (%247, %259) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (pd_op.tensor<-1x1xf16>, pd_op.tensor<1xf32>) -> pd_op.tensor<-1x1xf16> - (%261) = "pd_op.fetch" (%258) {col:(Int32)0,name:"save_infer_model/scale_0.tmp_0"} : (pd_op.tensor<-1x-1xi64>) -> pd_op.tensor<-1x-1xi64> - (%262) = "pd_op.fetch" (%260) {col:(Int32)1,name:"save_infer_model/scale_1.tmp_0"} : (pd_op.tensor<-1x1xf16>) -> pd_op.tensor<-1x1xf16> + (%0) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"embedding_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[32000,4096],stop_gradient:[true]} : () -> builtin.tensor<32000x4096xf16> + (%1) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"create_parameter_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096],stop_gradient:[true]} : () -> builtin.tensor<4096xf16> + (%2) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> builtin.tensor<4096x4096xf16> + (%3) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_1.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> builtin.tensor<4096x4096xf16> + (%4) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_2.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> builtin.tensor<4096x4096xf16> + (%5) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"eager_tmp_1",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1,2048,1,128],stop_gradient:[true]} : () -> 
builtin.tensor<1x2048x1x128xf32> + (%6) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"eager_tmp_2",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1,2048,1,128],stop_gradient:[true]} : () -> builtin.tensor<1x2048x1x128xf32> + (%7) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_3.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,4096],stop_gradient:[true]} : () -> builtin.tensor<4096x4096xf16> + (%8) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"create_parameter_1.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096],stop_gradient:[true]} : () -> builtin.tensor<4096xf16> + (%9) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_4.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,11008],stop_gradient:[true]} : () -> builtin.tensor<4096x11008xf16> + (%10) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_5.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,11008],stop_gradient:[true]} : () -> builtin.tensor<4096x11008xf16> + (%11) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"linear_6.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[11008,4096],stop_gradient:[true]} : () -> builtin.tensor<11008x4096xf16> + (%12) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"create_parameter_2.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096],stop_gradient:[true]} : () -> builtin.tensor<4096xf16> + (%13) = "pd_op.data" () {dtype:(pd_op.DataType)float16,name:"llama_lm_head_0.w_0",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[4096,32000],stop_gradient:[true]} : () -> builtin.tensor<4096x32000xf16> + (%14) = "pd_op.data" () {dtype:(pd_op.DataType)float32,name:"top_p",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1],stop_gradient:[true]} : () -> builtin.tensor<1xf32> + (%15) = "pd_op.data" () {dtype:(pd_op.DataType)int64,name:"position_ids",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[-1,-1],stop_gradient:[true]} : () -> builtin.tensor<-1x-1xi64> + (%16) = "pd_op.data" () {dtype:(pd_op.DataType)int64,name:"attention_mask",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[-1,-1],stop_gradient:[true]} : () -> builtin.tensor<-1x-1xi64> + (%17) = "pd_op.data" () {dtype:(pd_op.DataType)int64,name:"input_ids",place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[-1,-1],stop_gradient:[true]} : () -> builtin.tensor<-1x-1xi64> + (%18) = "pd_op.shape" (%17) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1xi64>) -> builtin.tensor<2xi32> + (%19) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> builtin.tensor<1xi64> + (%20) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%21) = "pd_op.slice" (%18, %19, %20) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<2xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor + (%22) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%23) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> builtin.tensor<1xi64> + (%24) = 
"pd_op.slice" (%18, %22, %23) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<2xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor + (%25) = "pd_op.cast" (%24) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor) -> builtin.tensor + (%26) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%27) = "pd_op.full_with_tensor" (%26, %25) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xi64>, builtin.tensor) -> builtin.tensor<1xi64> + (%28) = "pd_op.shape" (%17) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1xi64>) -> builtin.tensor<2xi32> + (%29) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%30) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> builtin.tensor<1xi64> + (%31) = "pd_op.slice" (%28, %29, %30) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<2xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor + (%32) = "pd_op.cast" (%31) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor) -> builtin.tensor + (%33) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%34) = "pd_op.full_with_tensor" (%33, %32) {dtype:(pd_op.DataType)int64,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xi64>, builtin.tensor) -> builtin.tensor<1xi64> + (%35) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[],stop_gradient:[false],value:(Float)1} : () -> builtin.tensor + (%36) = "builtin.combine" (%21, %35) {} : (builtin.tensor, builtin.tensor) -> vec[builtin.tensor,builtin.tensor] + (%37) = "pd_op.stack" (%36) {axis:(Int32)0,stop_gradient:[true]} : (vec[builtin.tensor,builtin.tensor]) -> builtin.tensor<2xi32> + (%38) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> builtin.tensor<1xf32> + (%39) = "pd_op.full_with_tensor" (%37, %38) {dtype:(pd_op.DataType)bool,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<2xi32>, builtin.tensor<1xf32>) -> builtin.tensor<-1x1xb> + (%40) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[],stop_gradient:[false],value:(Float)1} : () -> builtin.tensor + (%41) = "builtin.combine" (%21, %40) {} : (builtin.tensor, builtin.tensor) -> vec[builtin.tensor,builtin.tensor] + (%42) = "pd_op.stack" (%41) {axis:(Int32)0,stop_gradient:[true]} : (vec[builtin.tensor,builtin.tensor]) -> builtin.tensor<2xi32> + (%43) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> builtin.tensor<1xf32> + (%44) = "pd_op.full_with_tensor" (%42, %43) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<2xi32>, builtin.tensor<1xf32>) -> builtin.tensor<-1x1xf16> + 
(%45) = "pd_op.shape" (%17) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1xi64>) -> builtin.tensor<2xi32> + (%46) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%47) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> builtin.tensor<1xi64> + (%48) = "pd_op.slice" (%45, %46, %47) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<2xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor + (%49) = "pd_op.embedding" (%17, %0) {is_persistable:[false],padding_idx:(Int64)-1,sparse:false,stop_gradient:[false]} : (builtin.tensor<-1x-1xi64>, builtin.tensor<32000x4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%50) = "pd_op.shape" (%16) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1xi64>) -> builtin.tensor<2xi32> + (%51) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> builtin.tensor<1xi64> + (%52) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%53) = "pd_op.slice" (%50, %51, %52) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<2xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor + (%54) = "pd_op.shape" (%16) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1xi64>) -> builtin.tensor<2xi32> + (%55) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%56) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> builtin.tensor<1xi64> + (%57) = "pd_op.slice" (%54, %55, %56) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<2xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor + (%58) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1,(Int64)2]} : () -> builtin.tensor<2xi64> + (%59, %60) = "pd_op.unsqueeze" (%16, %58) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<-1x-1xi64>, builtin.tensor<2xi64>) -> builtin.tensor<-1x1x1x-1xi64>, <> + (%61) = "pd_op.cast" (%59) {dtype:(pd_op.DataType)bool,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1x1x-1xi64>) -> builtin.tensor<-1x1x1x-1xb> + (%62) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)1} : () -> builtin.tensor<1xi32> + (%63) = "builtin.combine" (%53, %62, %48, %57) {} : (builtin.tensor, builtin.tensor<1xi32>, builtin.tensor, builtin.tensor) -> vec[builtin.tensor,builtin.tensor<1xi32>,builtin.tensor,builtin.tensor] + (%64) = "pd_op.expand" (%61, %63) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1x1x-1xb>, vec[builtin.tensor,builtin.tensor<1xi32>,builtin.tensor,builtin.tensor]) -> builtin.tensor<-1x1x-1x-1xb> + (%65) = "pd_op.full" () 
{dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)0} : () -> builtin.tensor<1xf64> + (%66) = "pd_op.full" () {dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)-65504} : () -> builtin.tensor<1xf64> + (%67) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> builtin.tensor<1xf32> + (%68) = "pd_op.full_like" (%65, %67) {dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (builtin.tensor<1xf64>, builtin.tensor<1xf32>) -> builtin.tensor<1xf64> + (%69) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> builtin.tensor<1xf32> + (%70) = "pd_op.full_like" (%66, %69) {dtype:(pd_op.DataType)float64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (builtin.tensor<1xf64>, builtin.tensor<1xf32>) -> builtin.tensor<1xf64> + (%71) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> builtin.tensor<1xf32> + (%72) = "pd_op.full_like" (%64, %71) {dtype:(pd_op.DataType)bool,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (builtin.tensor<-1x1x-1x-1xb>, builtin.tensor<1xf32>) -> builtin.tensor<-1x1x-1x-1xb> + (%73) = "pd_op.cast" (%72) {dtype:(pd_op.DataType)float64,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1x-1x-1xb>) -> builtin.tensor<-1x1x-1x-1xf64> + (%74) = "pd_op.cast" (%64) {dtype:(pd_op.DataType)float64,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1x-1x-1xb>) -> builtin.tensor<-1x1x-1x-1xf64> + (%75) = "pd_op.add" (%68, %70) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xf64>, builtin.tensor<1xf64>) -> builtin.tensor<1xf64> + (%76) = "pd_op.add" (%75, %73) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xf64>, builtin.tensor<-1x1x-1x-1xf64>) -> builtin.tensor<-1x1x-1x-1xf64> + (%77) = "pd_op.add" (%65, %76) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xf64>, builtin.tensor<-1x1x-1x-1xf64>) -> builtin.tensor<-1x1x-1x-1xf64> + (%78) = "pd_op.add" (%66, %76) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xf64>, builtin.tensor<-1x1x-1x-1xf64>) -> builtin.tensor<-1x1x-1x-1xf64> + (%79) = "pd_op.add" (%74, %76) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1x-1x-1xf64>, builtin.tensor<-1x1x-1x-1xf64>) -> builtin.tensor<-1x1x-1x-1xf64> + (%80) = "pd_op.cast" (%79) {dtype:(pd_op.DataType)bool,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1x-1x-1xf64>) -> builtin.tensor<-1x1x-1x-1xb> + (%81) = "pd_op.where" (%80, %77, %78) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1x-1x-1xb>, builtin.tensor<-1x1x-1x-1xf64>, builtin.tensor<-1x1x-1x-1xf64>) -> builtin.tensor<-1x1x-1x-1xf64> + (%82) = "pd_op.cast" (%81) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1x-1x-1xf64>) -> builtin.tensor<-1x1x-1x-1xf16> + (%83) = "pd_op.cast" (%49) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf16>) -> 
builtin.tensor<-1x-1x4096xf32> + (%84) = "pd_op.pow" (%83) {is_persistable:[false],stop_gradient:[false],y:(Float)2} : (builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x4096xf32> + (%85) = "pd_op.mean" (%84) {axis:(pd_op.IntArray)[-1],is_persistable:[false],keepdim:true,stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x1xf32> + (%86) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> builtin.tensor<1xf32> + (%87) = "pd_op.scale" (%85, %86) {bias:(Float)1e-06,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x1xf32>, builtin.tensor<1xf32>) -> builtin.tensor<-1x-1x1xf32> + (%88) = "pd_op.rsqrt" (%87) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x1xf32>) -> builtin.tensor<-1x-1x1xf32> + (%89) = "pd_op.multiply" (%88, %83) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x1xf32>, builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x4096xf32> + (%90) = "pd_op.cast" (%89) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x4096xf16> + (%91) = "pd_op.multiply" (%90, %1) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%92) = "pd_op.matmul" (%91, %2) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4096x4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%93) = "pd_op.matmul" (%91, %3) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4096x4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%94) = "pd_op.matmul" (%91, %4) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4096x4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%95) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)0,(Int64)32,(Int64)128]} : () -> builtin.tensor<4xi64> + (%96, %97) = "pd_op.reshape" (%92, %95) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4xi64>) -> builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<0x-1x-1x4096xf16> + (%98) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)0,(Int64)32,(Int64)128]} : () -> builtin.tensor<4xi64> + (%99, %100) = "pd_op.reshape" (%93, %98) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4xi64>) -> builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<0x-1x-1x4096xf16> + (%101) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)0,(Int64)32,(Int64)128]} : () -> builtin.tensor<4xi64> + (%102, %103) = "pd_op.reshape" (%94, %101) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4xi64>) -> builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<0x-1x-1x4096xf16> + (%104) = "pd_op.shape" (%99) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>) -> builtin.tensor<4xi32> + (%105) = "pd_op.full_int_array" () 
{dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%106) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> builtin.tensor<1xi64> + (%107) = "pd_op.slice" (%104, %105, %106) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<4xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor + (%108) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> builtin.tensor<1xi64> + (%109) = "builtin.combine" (%107) {} : (builtin.tensor) -> vec[builtin.tensor] + (%110) = "pd_op.slice" (%5, %108, %109) {axes:[(Int64)1],decrease_axis:[],infer_flags:[(Int64)-1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1x2048x1x128xf32>, builtin.tensor<1xi64>, vec[builtin.tensor]) -> builtin.tensor<1x-1x1x128xf32> + (%111) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> builtin.tensor<1xi64> + (%112) = "builtin.combine" (%107) {} : (builtin.tensor) -> vec[builtin.tensor] + (%113) = "pd_op.slice" (%6, %111, %112) {axes:[(Int64)1],decrease_axis:[],infer_flags:[(Int64)-1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1x2048x1x128xf32>, builtin.tensor<1xi64>, vec[builtin.tensor]) -> builtin.tensor<1x-1x1x128xf32> + (%114) = "pd_op.cast" (%110) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1x-1x1x128xf32>) -> builtin.tensor<1x-1x1x128xf16> + (%115) = "pd_op.cast" (%113) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1x-1x1x128xf32>) -> builtin.tensor<1x-1x1x128xf16> + (%116) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)2]} : () -> builtin.tensor<2xi64> + (%117, %118) = "pd_op.squeeze" (%114, %116) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<1x-1x1x128xf16>, builtin.tensor<2xi64>) -> builtin.tensor<-1x128xf16>, <> + (%119) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0,(Int64)2]} : () -> builtin.tensor<2xi64> + (%120, %121) = "pd_op.squeeze" (%115, %119) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<1x-1x1x128xf16>, builtin.tensor<2xi64>) -> builtin.tensor<-1x128xf16>, <> + (%122) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)-1]} : () -> builtin.tensor<1xi64> + (%123, %124) = "pd_op.unsqueeze" (%15, %122) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<-1x-1xi64>, builtin.tensor<1xi64>) -> builtin.tensor<-1x-1x1xi64>, <> + (%125) = "pd_op.gather_nd" (%117, %123) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x128xf16>, builtin.tensor<-1x-1x1xi64>) -> builtin.tensor<-1x-1x128xf16> + (%126) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> builtin.tensor<1xi64> + (%127, %128) = "pd_op.unsqueeze" (%125, %126) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<-1x-1x128xf16>, builtin.tensor<1xi64>) -> builtin.tensor<-1x-1x1x128xf16>, 
<> + (%129) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)-1]} : () -> builtin.tensor<1xi64> + (%130, %131) = "pd_op.unsqueeze" (%15, %129) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<-1x-1xi64>, builtin.tensor<1xi64>) -> builtin.tensor<-1x-1x1xi64>, <> + (%132) = "pd_op.gather_nd" (%120, %130) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x128xf16>, builtin.tensor<-1x-1x1xi64>) -> builtin.tensor<-1x-1x128xf16> + (%133) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> builtin.tensor<1xi64> + (%134, %135) = "pd_op.unsqueeze" (%132, %133) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<-1x-1x128xf16>, builtin.tensor<1xi64>) -> builtin.tensor<-1x-1x1x128xf16>, <> + (%136) = "pd_op.multiply" (%96, %127) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<-1x-1x1x128xf16>) -> builtin.tensor<-1x-1x32x128xf16> + (%137) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> builtin.tensor<1xi64> + (%138) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> builtin.tensor<1xi64> + (%139) = "pd_op.slice" (%96, %137, %138) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor<-1x-1x32x64xf16> + (%140) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> builtin.tensor<1xi64> + (%141) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> builtin.tensor<1xi64> + (%142) = "pd_op.slice" (%96, %140, %141) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor<-1x-1x32x64xf16> + (%143) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> builtin.tensor<1xf32> + (%144) = "pd_op.scale" (%142, %143) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x64xf16>, builtin.tensor<1xf32>) -> builtin.tensor<-1x-1x32x64xf16> + (%145) = "builtin.combine" (%144, %139) {} : (builtin.tensor<-1x-1x32x64xf16>, builtin.tensor<-1x-1x32x64xf16>) -> vec[builtin.tensor<-1x-1x32x64xf16>,builtin.tensor<-1x-1x32x64xf16>] + (%146) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> builtin.tensor<1xi32> + (%147) = "pd_op.concat" (%145, %146) {is_persistable:[false],stop_gradient:[false]} : (vec[builtin.tensor<-1x-1x32x64xf16>,builtin.tensor<-1x-1x32x64xf16>], builtin.tensor<1xi32>) -> builtin.tensor<-1x-1x32x128xf16> + (%148) = "pd_op.multiply" (%147, %134) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<-1x-1x1x128xf16>) -> builtin.tensor<-1x-1x32x128xf16> + (%149) = "pd_op.add" (%136, %148) 
{is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<-1x-1x32x128xf16>) -> builtin.tensor<-1x-1x32x128xf16> + (%150) = "pd_op.multiply" (%99, %127) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<-1x-1x1x128xf16>) -> builtin.tensor<-1x-1x32x128xf16> + (%151) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> builtin.tensor<1xi64> + (%152) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> builtin.tensor<1xi64> + (%153) = "pd_op.slice" (%99, %151, %152) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor<-1x-1x32x64xf16> + (%154) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)64]} : () -> builtin.tensor<1xi64> + (%155) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> builtin.tensor<1xi64> + (%156) = "pd_op.slice" (%99, %154, %155) {axes:[(Int64)3],decrease_axis:[],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor<-1x-1x32x64xf16> + (%157) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> builtin.tensor<1xf32> + (%158) = "pd_op.scale" (%156, %157) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x64xf16>, builtin.tensor<1xf32>) -> builtin.tensor<-1x-1x32x64xf16> + (%159) = "builtin.combine" (%158, %153) {} : (builtin.tensor<-1x-1x32x64xf16>, builtin.tensor<-1x-1x32x64xf16>) -> vec[builtin.tensor<-1x-1x32x64xf16>,builtin.tensor<-1x-1x32x64xf16>] + (%160) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)-1} : () -> builtin.tensor<1xi32> + (%161) = "pd_op.concat" (%159, %160) {is_persistable:[false],stop_gradient:[false]} : (vec[builtin.tensor<-1x-1x32x64xf16>,builtin.tensor<-1x-1x32x64xf16>], builtin.tensor<1xi32>) -> builtin.tensor<-1x-1x32x128xf16> + (%162) = "pd_op.multiply" (%161, %134) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<-1x-1x1x128xf16>) -> builtin.tensor<-1x-1x32x128xf16> + (%163) = "pd_op.add" (%150, %162) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>, builtin.tensor<-1x-1x32x128xf16>) -> builtin.tensor<-1x-1x32x128xf16> + (%164) = "pd_op.shape" (%149) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>) -> builtin.tensor<4xi32> + (%165) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> builtin.tensor<1xi64> + (%166) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%167) = "pd_op.slice" (%164, %165, %166) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : 
(builtin.tensor<4xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor + (%168) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%169) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> builtin.tensor<1xi64> + (%170) = "pd_op.slice" (%164, %168, %169) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<4xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor + (%171) = "pd_op.shape" (%102) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>) -> builtin.tensor<4xi32> + (%172) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64> + (%173) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2]} : () -> builtin.tensor<1xi64> + (%174) = "pd_op.slice" (%171, %172, %173) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<4xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor + (%175) = "pd_op.transpose" (%149) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>) -> builtin.tensor<-1x32x-1x128xf16> + (%176) = "pd_op.transpose" (%163) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>) -> builtin.tensor<-1x32x-1x128xf16> + (%177) = "pd_op.transpose" (%102) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (builtin.tensor<-1x-1x32x128xf16>) -> builtin.tensor<-1x32x-1x128xf16> + (%178) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0.0883883} : () -> builtin.tensor<1xf32> + (%179) = "pd_op.scale" (%175, %178) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x32x-1x128xf16>, builtin.tensor<1xf32>) -> builtin.tensor<-1x32x-1x128xf16> + (%180) = "pd_op.transpose" (%176) {is_persistable:[false],perm:[(Int32)0,(Int32)1,(Int32)3,(Int32)2],stop_gradient:[false]} : (builtin.tensor<-1x32x-1x128xf16>) -> builtin.tensor<-1x32x128x-1xf16> + (%181) = "pd_op.matmul" (%179, %180) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<-1x32x-1x128xf16>, builtin.tensor<-1x32x128x-1xf16>) -> builtin.tensor<-1x32x-1x-1xf16> + (%182) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)1} : () -> builtin.tensor<1xi32> + (%183) = "builtin.combine" (%167, %182, %170, %174) {} : (builtin.tensor, builtin.tensor<1xi32>, builtin.tensor, builtin.tensor) -> vec[builtin.tensor,builtin.tensor<1xi32>,builtin.tensor,builtin.tensor] + (%184, %185) = "pd_op.reshape" (%82, %183) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<-1x1x-1x-1xf16>, vec[builtin.tensor,builtin.tensor<1xi32>,builtin.tensor,builtin.tensor]) -> builtin.tensor<-1x1x-1x-1xf16>, builtin.tensor<0x-1x1x-1x-1xf16> + (%186) = "pd_op.add" (%181, %184) 
{is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x32x-1x-1xf16>, builtin.tensor<-1x1x-1x-1xf16>) -> builtin.tensor<-1x32x-1x-1xf16> + (%187) = "pd_op.cast" (%186) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x32x-1x-1xf16>) -> builtin.tensor<-1x32x-1x-1xf32> + (%188) = "pd_op.softmax" (%187) {axis:(Int32)-1,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x32x-1x-1xf32>) -> builtin.tensor<-1x32x-1x-1xf32> + (%189) = "pd_op.cast" (%188) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x32x-1x-1xf32>) -> builtin.tensor<-1x32x-1x-1xf16> + (%190) = "pd_op.matmul" (%189, %177) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<-1x32x-1x-1xf16>, builtin.tensor<-1x32x-1x128xf16>) -> builtin.tensor<-1x32x-1x128xf16> + (%191) = "pd_op.transpose" (%190) {is_persistable:[false],perm:[(Int32)0,(Int32)2,(Int32)1,(Int32)3],stop_gradient:[false]} : (builtin.tensor<-1x32x-1x128xf16>) -> builtin.tensor<-1x-1x32x128xf16> + (%192) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[false],value:(Float)4096} : () -> builtin.tensor<1xi32> + (%193) = "builtin.combine" (%167, %170, %192) {} : (builtin.tensor, builtin.tensor, builtin.tensor<1xi32>) -> vec[builtin.tensor,builtin.tensor,builtin.tensor<1xi32>] + (%194, %195) = "pd_op.reshape" (%191, %193) {is_persistable:[false,false],stop_gradient:[false,false]} : (builtin.tensor<-1x-1x32x128xf16>, vec[builtin.tensor,builtin.tensor,builtin.tensor<1xi32>]) -> builtin.tensor<-1x-1x4096xf16>, builtin.tensor<0x-1x-1x32x128xf16> + (%196) = "pd_op.matmul" (%194, %7) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4096x4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%197) = "pd_op.add" (%49, %196) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<-1x-1x4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%198) = "pd_op.cast" (%197) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf16>) -> builtin.tensor<-1x-1x4096xf32> + (%199) = "pd_op.pow" (%198) {is_persistable:[false],stop_gradient:[false],y:(Float)2} : (builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x4096xf32> + (%200) = "pd_op.mean" (%199) {axis:(pd_op.IntArray)[-1],is_persistable:[false],keepdim:true,stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x1xf32> + (%201) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> builtin.tensor<1xf32> + (%202) = "pd_op.scale" (%200, %201) {bias:(Float)1e-06,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x1xf32>, builtin.tensor<1xf32>) -> builtin.tensor<-1x-1x1xf32> + (%203) = "pd_op.rsqrt" (%202) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x1xf32>) -> builtin.tensor<-1x-1x1xf32> + (%204) = "pd_op.multiply" (%203, %198) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x1xf32>, builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x4096xf32> + (%205) = "pd_op.cast" (%204) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf32>) -> 
builtin.tensor<-1x-1x4096xf16> + (%206) = "pd_op.multiply" (%205, %8) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%207) = "pd_op.matmul" (%206, %9) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4096x11008xf16>) -> builtin.tensor<-1x-1x11008xf16> + (%208) = "pd_op.matmul" (%206, %10) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4096x11008xf16>) -> builtin.tensor<-1x-1x11008xf16> + (%209) = "pd_op.swiglu" (%207, %208) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x11008xf16>, builtin.tensor<-1x-1x11008xf16>) -> builtin.tensor<-1x-1x11008xf16> + (%210) = "pd_op.matmul" (%209, %11) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<-1x-1x11008xf16>, builtin.tensor<11008x4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%211) = "pd_op.add" (%197, %210) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<-1x-1x4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%212) = "pd_op.cast" (%211) {dtype:(pd_op.DataType)float32,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf16>) -> builtin.tensor<-1x-1x4096xf32> + (%213) = "pd_op.pow" (%212) {is_persistable:[false],stop_gradient:[false],y:(Float)2} : (builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x4096xf32> + (%214) = "pd_op.mean" (%213) {axis:(pd_op.IntArray)[-1],is_persistable:[false],keepdim:true,stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x1xf32> + (%215) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> builtin.tensor<1xf32> + (%216) = "pd_op.scale" (%214, %215) {bias:(Float)1e-06,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x1xf32>, builtin.tensor<1xf32>) -> builtin.tensor<-1x-1x1xf32> + (%217) = "pd_op.rsqrt" (%216) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x1xf32>) -> builtin.tensor<-1x-1x1xf32> + (%218) = "pd_op.multiply" (%217, %212) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x1xf32>, builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x4096xf32> + (%219) = "pd_op.cast" (%218) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf32>) -> builtin.tensor<-1x-1x4096xf16> + (%220) = "pd_op.multiply" (%219, %12) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4096xf16>) -> builtin.tensor<-1x-1x4096xf16> + (%221) = "pd_op.matmul" (%220, %13) {is_persistable:[false],stop_gradient:[false],transpose_x:false,transpose_y:false} : (builtin.tensor<-1x-1x4096xf16>, builtin.tensor<4096x32000xf16>) -> builtin.tensor<-1x-1x32000xf16> + (%222) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)-1]} : () -> builtin.tensor<1xi64> + (%223) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> builtin.tensor<1xi64> + (%224) = "pd_op.slice" (%221, %222, %223) 
{axes:[(Int64)1],decrease_axis:[(Int64)1],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1x32000xf16>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor<-1x32000xf16>
+ (%225) = "pd_op.softmax" (%224) {axis:(Int32)-1,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x32000xf16>) -> builtin.tensor<-1x32000xf16>
+ (%226) = "pd_op.log_softmax" (%224) {axis:(Int32)-1,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x32000xf16>) -> builtin.tensor<-1x32000xf16>
+ (%227) = "pd_op.shape" (%225) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x32000xf16>) -> builtin.tensor<2xi32>
+ (%228) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)0]} : () -> builtin.tensor<1xi64>
+ (%229) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)1]} : () -> builtin.tensor<1xi64>
+ (%230) = "pd_op.slice" (%227, %228, %229) {axes:[(Int64)0],decrease_axis:[(Int64)0],infer_flags:[(Int64)1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<2xi32>, builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor
+ (%231) = "pd_op.cast" (%14) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xf32>) -> builtin.tensor<1xf16>
+ (%232) = "pd_op.full" () {dtype:(pd_op.DataType)int32,is_persistable:[false],place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[],stop_gradient:[false],value:(Float)1} : () -> builtin.tensor
+ (%233) = "builtin.combine" (%230, %232) {} : (builtin.tensor, builtin.tensor) -> vec[builtin.tensor,builtin.tensor]
+ (%234) = "pd_op.stack" (%233) {axis:(Int32)0,stop_gradient:[true]} : (vec[builtin.tensor,builtin.tensor]) -> builtin.tensor<2xi32>
+ (%235) = "pd_op.full_with_tensor" (%234, %231) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<2xi32>, builtin.tensor<1xf16>) -> builtin.tensor<-1x1xf16>
+ (%236, %237) = "pd_op.top_p_sampling" (%225, %235, <>) {is_persistable:[false,false],seed:(Int32)-1,stop_gradient:[false,false]} : (builtin.tensor<-1x32000xf16>, builtin.tensor<-1x1xf16>, <>) -> builtin.tensor<-1x1xf16>, builtin.tensor<-1x1xi64>
+ (%238) = "pd_op.index_sample" (%226, %237) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x32000xf16>, builtin.tensor<-1x1xi64>) -> builtin.tensor<-1x1xf16>
+ (%239) = "pd_op.subtract" (%27, %34) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xi64>, builtin.tensor<1xi64>) -> builtin.tensor<1xi64>
+ (%240) = "pd_op.cast" (%239) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xi64>) -> builtin.tensor<1xf16>
+ (%241) = "pd_op.multiply" (%44, %240) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1xf16>, builtin.tensor<1xf16>) -> builtin.tensor<-1x1xf16>
+ (%242) = "pd_op.add" (%241, %238) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1xf16>, builtin.tensor<-1x1xf16>) -> builtin.tensor<-1x1xf16>
+ (%243) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> builtin.tensor<1xf32>
+ (%244) = "pd_op.scale" (%239, %243) {bias:(Float)1,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xi64>, builtin.tensor<1xf32>) -> builtin.tensor<1xi64>
+ (%245) = "pd_op.cast" (%244) {dtype:(pd_op.DataType)float16,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<1xi64>) -> builtin.tensor<1xf16>
+ (%246) = "pd_op.divide" (%242, %245) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1xf16>, builtin.tensor<1xf16>) -> builtin.tensor<-1x1xf16>
+ (%247) = "pd_op.where" (%39, %246, %44) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1xb>, builtin.tensor<-1x1xf16>, builtin.tensor<-1x1xf16>) -> builtin.tensor<-1x1xf16>
+ (%248) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)0} : () -> builtin.tensor<1xf32>
+ (%249) = "pd_op.full_like" (%237, %248) {dtype:(pd_op.DataType)int64,is_persistable:[false],place:(pd_op.Place)Place(undefined:0),stop_gradient:[false]} : (builtin.tensor<-1x1xi64>, builtin.tensor<1xf32>) -> builtin.tensor<-1x1xi64>
+ (%250) = "pd_op.where" (%39, %237, %249) {is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1xb>, builtin.tensor<-1x1xi64>, builtin.tensor<-1x1xi64>) -> builtin.tensor<-1x1xi64>
+ (%251) = "builtin.combine" (%17, %250) {} : (builtin.tensor<-1x-1xi64>, builtin.tensor<-1x1xi64>) -> vec[builtin.tensor<-1x-1xi64>,builtin.tensor<-1x1xi64>]
+ (%252) = "pd_op.full" () {dtype:(pd_op.DataType)int32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> builtin.tensor<1xi32>
+ (%253) = "pd_op.concat" (%251, %252) {is_persistable:[false],stop_gradient:[false]} : (vec[builtin.tensor<-1x-1xi64>,builtin.tensor<-1x1xi64>], builtin.tensor<1xi32>) -> builtin.tensor<-1x-1xi64>
+ (%254) = "builtin.combine" (%31) {} : (builtin.tensor) -> vec[builtin.tensor]
+ (%255) = "pd_op.full_int_array" () {dtype:(pd_op.DataType)int64,place:(pd_op.Place)Place(cpu),stop_gradient:[true],value:[(Int64)2147483647]} : () -> builtin.tensor<1xi64>
+ (%256) = "pd_op.slice" (%253, %254, %255) {axes:[(Int64)1],decrease_axis:[],infer_flags:[(Int64)-1],is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1xi64>, vec[builtin.tensor], builtin.tensor<1xi64>) -> builtin.tensor<-1x-1xi64>
+ (%257) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> builtin.tensor<1xf32>
+ (%258) = "pd_op.scale" (%256, %257) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x-1xi64>, builtin.tensor<1xf32>) -> builtin.tensor<-1x-1xi64>
+ (%259) = "pd_op.full" () {dtype:(pd_op.DataType)float32,place:(pd_op.Place)Place(cpu),shape:(pd_op.IntArray)[1],stop_gradient:[true],value:(Float)1} : () -> builtin.tensor<1xf32>
+ (%260) = "pd_op.scale" (%247, %259) {bias:(Float)0,bias_after_scale:true,is_persistable:[false],stop_gradient:[false]} : (builtin.tensor<-1x1xf16>, builtin.tensor<1xf32>) -> builtin.tensor<-1x1xf16>
+ (%261) = "pd_op.fetch" (%258) {col:(Int32)0,name:"save_infer_model/scale_0.tmp_0"} : (builtin.tensor<-1x-1xi64>) -> builtin.tensor<-1x-1xi64>
+ (%262) = "pd_op.fetch" (%260) {col:(Int32)1,name:"save_infer_model/scale_1.tmp_0"} : (builtin.tensor<-1x1xf16>) -> builtin.tensor<-1x1xf16>
}

diff --git a/test/ir/pir/cinn/symbolic/test_llama_group_log_softmax.py b/test/ir/pir/cinn/symbolic/test_llama_group_log_softmax.py
index a99808951389e..602367573cf3b 100644
--- a/test/ir/pir/cinn/symbolic/test_llama_group_log_softmax.py
+++ b/test/ir/pir/cinn/symbolic/test_llama_group_log_softmax.py
@@ -48,7 +48,7 @@ def tmp(logits, scores, next_tokens, length):
         next_scores = paddle.index_sample(
             origin_probs, next_tokens
-        )  # (pd_op.tensor<-1x32000xf16>, pd_op.tensor<-1x1xi64>) -> pd_op.tensor<-1x1xf16>
+        )  # (builtin.tensor<-1x32000xf16>, builtin.tensor<-1x1xi64>) -> builtin.tensor<-1x1xf16>
         scores = update_scores_for_generation(scores, next_scores, length)
         return scores

diff --git a/test/ir/pir/test_ir_pybind.py b/test/ir/pir/test_ir_pybind.py
index 460e5e489eb35..fd0aee950cc31 100644
--- a/test/ir/pir/test_ir_pybind.py
+++ b/test/ir/pir/test_ir_pybind.py
@@ -115,7 +115,7 @@ def test_value(self):
         )  # test opresult print
         self.assertTrue(
-            'dtype=pd_op.tensor<4x4xf32>'
+            'dtype=builtin.tensor<4x4xf32>'
             in add_op.operands_source()[0].__str__()
         )  # test opresult == value
@@ -132,7 +132,8 @@ def test_value(self):
             tanh_op.operands()[0].source().get_defining_op().name(), "pd_op.add"
         )
         self.assertTrue(
-            'pd_op.tensor<4x4xf32>' in tanh_op.operands()[0].source().__str__()
+            'builtin.tensor<4x4xf32>'
+            in tanh_op.operands()[0].source().__str__()
         )
         add_op.replace_all_uses_with(matmul_op.results())
         self.assertEqual(

diff --git a/test/legacy_test/test_fused_rotary_position_embedding.py b/test/legacy_test/test_fused_rotary_position_embedding.py
index cc0afe5202fd1..33e6aef4a68c9 100644
--- a/test/legacy_test/test_fused_rotary_position_embedding.py
+++ b/test/legacy_test/test_fused_rotary_position_embedding.py
@@ -461,7 +461,7 @@ def test_static(self):
         for x, out in zip([q, k, v], [out_q, out_k, out_v]):
             # The reason why fetch `out` based on `x` is that
             # if input is None, the output of static function might be not NoneType
-            # but pir.Value with type pd_op.tensor<0xf32> in pir mode.
+            # but pir.Value with type builtin.tensor<0xf32> in pir mode.
             if x is not None:
                 fetch_list.append(out)
@@ -575,7 +575,7 @@ def test_static_time_major(self):
         for x, out in zip([q, k, v], [out_q, out_k, out_v]):
             # The reason why fetch `out` based on `x` is that
             # if input is None, the output of static function might be not NoneType
-            # but pir.Value with type pd_op.tensor<0xf32> in pir mode.
+            # but pir.Value with type builtin.tensor<0xf32> in pir mode.
             if x is not None:
                 fetch_list.append(out)

From b8c49369a96b489da8d51c1bd223d402548d73ba Mon Sep 17 00:00:00 2001
From: Tian <121000916+SylarTiaNII@users.noreply.github.com>
Date: Thu, 7 Mar 2024 15:06:29 +0800
Subject: [PATCH 056/114] [CustomDevice] fix anomalous memory usage on custom
 devices (#62377)

---
 .../eager_manual/forwards/multiply_fwd_func.cc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc
index 9d1451c74e65f..aa18f8cd4acb8 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc
@@ -27,6 +27,15 @@

 COMMON_DECLARE_bool(check_nan_inf);

+bool check_if_support_elementwise_mul_mem_opt(const std::string& device_type) {
+  // TODO(@gexiao): replace this function with api implemented at custom repo
+  if (device_type == "npu") {
+    return true;
+  } else {
+    return false;
+  }
+}
+
 paddle::Tensor multiply_ad_func(const paddle::Tensor& x,
                                 const paddle::Tensor& y) {
   FLAGS_tensor_operants_mode = "eager";
@@ -160,7 +169,11 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x,
   }
   // SetAttributes if needed
   grad_node->SetAttribute_axis(-1);
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+  if (check_if_support_elementwise_mul_mem_opt(x.place().GetDeviceType())) {
+#else
   if (paddle::platform::is_gpu_place(x.place())) {
+#endif
     if (x_autograd_meta != nullptr && x_autograd_meta->StopGradient() &&
         y_autograd_meta != nullptr && !y_autograd_meta->StopGradient()) {
       grad_node->SetTensorWrapper_x(x);
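The hunk above swaps a hard-coded GPU check for a per-device-type query, so custom devices can opt in to the same tensor-wrapper saving. A minimal sketch of that gating pattern, assuming a static allow-list (the function and set names below are illustrative, not Paddle APIs; the patch's own TODO says the real query should eventually come from the custom-device plugin):

    #include <string>
    #include <unordered_set>

    // Hedged sketch: route a memory optimization by device type instead of
    // hard-coding is_gpu_place(); "npu" mirrors the allow-list in the patch.
    bool SupportsElementwiseMulMemOpt(const std::string& device_type) {
      static const std::unordered_set<std::string> kSupported = {"npu"};
      return kSupported.count(device_type) > 0;
    }

Keeping the allow-list in one helper means new device types can be enabled without touching the forward function again.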
From 660276aa08136f91e1b1660a7bfdbf3041ca4691 Mon Sep 17 00:00:00 2001
From: Zhang Ting
Date: Thu, 7 Mar 2024 15:55:17 +0800
Subject: [PATCH 057/114] fix reduce avg bug (#62502)

---
 python/paddle/distributed/fleet/utils/tensor_fusion_helper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
index 82bf2ce38b2e4..14141c64e1278 100644
--- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
+++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py
@@ -622,7 +622,7 @@ def scale_grads(self):
             self._task.wait()

         # scale will be skipped when using the reduce_avg comm operation
-        if self._scale_after_comm and not self.use_reduce_avg:
+        if self._scale_after_comm and not self._use_reduce_avg:
             scale_factor = 1.0 / self._comm_group.nranks
             self.grad_storage.scale_(scale_factor)
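The fix is one attribute name with real effect: the guard previously read `self.use_reduce_avg`, a name that does not exist on the helper, instead of the stored `self._use_reduce_avg`, so gradients reduced with the `avg` op were divided by `nranks` a second time. A toy model of the invariant the guard protects (plain C++, not Paddle code):

    #include <cassert>
    #include <numeric>
    #include <vector>

    int main() {
      // One gradient shard per rank; with a sum-reduce the caller must scale
      // by 1/nranks afterwards, with an avg-reduce that scaling is built in.
      std::vector<double> grads = {1.0, 2.0, 3.0, 4.0};
      double nranks = static_cast<double>(grads.size());
      double sum = std::accumulate(grads.begin(), grads.end(), 0.0);
      assert(sum * (1.0 / nranks) == sum / nranks);  // scaling twice would shrink grads
      return 0;
    }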
From 7129945f12c03a776734592c65ffb4235e773f25 Mon Sep 17 00:00:00 2001
From: Jia Wenxuan <64853160+JiaWenxuan@users.noreply.github.com>
Date: Thu, 7 Mar 2024 16:07:35 +0800
Subject: [PATCH 058/114] Fix ShapeOrDataDimExpr simplify unwork (#62376)

* update test case

* fix

* fix concat op infer symbolic

* fix some bugs

* fix some bugs

* fix some bugs

* fix some bugs

* fix some bugs
---
 .../operator/transforms/add_cinn_pass.cc      |  6 +--
 .../group_merge/simplify_dim_expr_pass.cc     | 42 ++++++++++-------
 ...tute_dim_expr_based_on_constraints_pass.cc | 45 +++++++++++++------
 3 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
index 91bfad2d5710d..07732ac0c8952 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/add_cinn_pass.cc
@@ -94,9 +94,6 @@ void ApplyCinnPreprocessPass(
   if (has_dynamic_shape) {
     pass_manager->AddPass(cinn::dialect::ir::CreateConvert0DTo1DPass());
     pass_manager->AddPass(pir::CreateShapeOptimizationPass());
-    pass_manager->AddPass(cinn::dialect::ir::CreateSimplifyDimExprPass());
-    pass_manager->AddPass(
-        cinn::dialect::ir::CreateSubstituteDimExprBasedOnConstraintsPass());
     pass_manager->AddPass(cinn::dialect::ir::CreateConvert0DTo1DPass());
     pass_manager->AddPass(
         cinn::dialect::ir::CreateFuseShapeOpsIntoGenerateShapeOpPass());
@@ -130,6 +127,9 @@ void ApplyGroupOpPass(::pir::Program* program,
       cinn::dialect::ir::CreateFuseShapeOpsIntoGenerateShapeOpPass());
   pass_manager->AddPass(
       cinn::dialect::ir::CreateMoveGenerateShapeOpsToProloguePass());
+  pass_manager->AddPass(
+      cinn::dialect::ir::CreateSubstituteDimExprBasedOnConstraintsPass());
+  pass_manager->AddPass(cinn::dialect::ir::CreateSimplifyDimExprPass());
 }

 pass_manager->AddPass(cinn::dialect::ir::CreateDynamicReshapeOpPass());

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/simplify_dim_expr_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/simplify_dim_expr_pass.cc
index e8d8355872cd2..dcd92c7f4810d 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/simplify_dim_expr_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/simplify_dim_expr_pass.cc
@@ -28,11 +28,14 @@ namespace ir {
 namespace {

 template <typename DoEachT>
-void VisitEachOp(pir::ModuleOp module_op, const DoEachT& DoEach) {
-  for (uint32_t i = 0; i < module_op->num_regions(); i++) {
-    for (pir::Block& block : module_op->region(i)) {
-      for (pir::Operation& op : block) {
-        DoEach(op);
+void VisitEachOp(pir::Operation* op, const DoEachT& DoEach) {
+  for (uint32_t i = 0; i < op->num_regions(); i++) {
+    for (pir::Block& block : op->region(i)) {
+      for (pir::Operation& sub_op : block) {
+        DoEach(sub_op);
+        if (sub_op.num_regions() > 0) {
+          VisitEachOp(&sub_op, DoEach);
+        }
       }
     }
   }
@@ -90,24 +93,36 @@ symbol::ShapeOrDataDimExprs SimplifyShapeOrData(
   return std::visit(lambdas, shape_or_data.variant());
 }

-void SimplifyDimExpr(pir::ModuleOp module_op) {
+void SimplifyDimExpr(pir::Operation* module_op) {
   VLOG(4) << "SimplifyDimExpr start";
-  pir::ShapeConstraintIRAnalysis shape_analysis =
-      pir::ShapeAnalysisManager::Instance().Get(module_op.program());
+  pir::ShapeConstraintIRAnalysis* shape_analysis =
+      &pir::ShapeAnalysisManager::Instance().Get(
+          module_op->dyn_cast<pir::ModuleOp>().program());
+
   VisitEachOp(module_op, [&](pir::Operation& op) {
     VisitEachValue(op, [&](pir::Value value) {
-      if (!shape_analysis.HasShapeOrDataForValue(value)) {
+      if (!shape_analysis->HasShapeOrDataForValue(value)) {
         VLOG(4) << "SimplifyDimExpr: shape_analysis can't find ShapeOrData for "
                    "value of the op:"
                 << op.name();
       } else {
         const symbol::ShapeOrDataDimExprs& shape_or_data =
-            shape_analysis.GetShapeOrDataForValue(value);
+            shape_analysis->GetShapeOrDataForValue(value);
+        VLOG(8) << op.name() << " origin_shape_or_data: " << shape_or_data;
         symbol::ShapeOrDataDimExprs simplified_shape_or_data =
             SimplifyShapeOrData(shape_or_data);
-        shape_analysis.SetShapeOrDataForValue(value, simplified_shape_or_data);
+        VLOG(8) << op.name()
+                << " simplified_shape_or_data: " << simplified_shape_or_data;
+        shape_analysis->SetShapeOrDataForValue(value, simplified_shape_or_data);
       }
     });
+    if (op.num_results() > 0) {
+      pir::shape::SetShapeAttrForOp(
+          &op, shape_analysis->GetShapeOrDataForValue(op.result(0)));
+    } else {
+      pir::shape::SetShapeAttrForOp(
+          &op, shape_analysis->GetShapeOrDataForValue(op.operand_source(0)));
+    }
     // TODO(JiaWenxuan): simplify the attribute "sym_shape_str" of the op
   });
   VLOG(4) << "SimplifyDimExpr end";
@@ -117,10 +132,7 @@ class SimplifyDimExprPass : public pir::Pass {
  public:
   SimplifyDimExprPass() : pir::Pass("simplify_dim_expr_pass", 1) {}

-  void Run(pir::Operation* op) override {
-    pir::ModuleOp module_op = op->dyn_cast<pir::ModuleOp>();
-    SimplifyDimExpr(module_op);
-  }
+  void Run(pir::Operation* op) override { SimplifyDimExpr(op); }

   bool CanApplyOn(pir::Operation* op) const override {
     return op->isa<pir::ModuleOp>() && op->num_regions() > 0;

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc
index 68372afa3e9ca..bb6a3bbf23bbf 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc
@@ -18,6 +18,7 @@

 #include "paddle/cinn/common/dim_expr_util.h"
 #include "paddle/cinn/common/union_find.h"
+#include "paddle/pir/include/dialect/shape/ir/shape_attribute.h"

 namespace cinn {
 namespace dialect {
@@ -26,11 +27,14 @@ namespace {

 template <typename DoEachT>
-void VisitEachOp(pir::ModuleOp module_op, const DoEachT& DoEach) {
-  for (uint32_t i = 0; i < module_op->num_regions(); i++) {
-    for (pir::Block& block : module_op->region(i)) {
-      for (pir::Operation& op : block) {
-        DoEach(op);
+void VisitEachOp(pir::Operation* op, const DoEachT& DoEach) {
+  for (uint32_t i = 0; i < op->num_regions(); i++) {
+    for (pir::Block& block : op->region(i)) {
+      for (pir::Operation& sub_op : block) {
+        DoEach(sub_op);
+        if (sub_op.num_regions() > 0) {
+          VisitEachOp(&sub_op, DoEach);
+        }
       }
     }
   }
@@ -133,25 +137,39 @@ std::unordered_map<symbol::DimExpr, symbol::DimExpr> GetDimExprSubstitution(
   return substitution_pattern;
 }

-void SubstituteDimExprBasedOnConstraints(pir::ModuleOp module_op) {
+void SubstituteDimExprBasedOnConstraints(pir::Operation* module_op) {
   VLOG(4) << "SubstituteDimExprBasedOnConstraints start";
-  pir::ShapeConstraintIRAnalysis shape_analysis =
-      pir::ShapeAnalysisManager::Instance().Get(module_op.program());
+  pir::ShapeConstraintIRAnalysis* shape_analysis =
+      &pir::ShapeAnalysisManager::Instance().Get(
+          module_op->dyn_cast<pir::ModuleOp>().program());
   const std::unordered_map<symbol::DimExpr, symbol::DimExpr>&
-      substitution_pattern = GetDimExprSubstitution(&shape_analysis);
+      substitution_pattern = GetDimExprSubstitution(shape_analysis);
+
   VisitEachOp(module_op, [&](pir::Operation& op) {
     VisitEachValue(op, [&](pir::Value value) {
-      if (!shape_analysis.HasShapeOrDataForValue(value)) {
+      if (!shape_analysis->HasShapeOrDataForValue(value)) {
         VLOG(4) << "Can not find ShapeOrData for value of op(" << op.name()
                 << ") in shape_analysis";
       } else {
         const symbol::ShapeOrDataDimExprs& origin_shape_or_data =
-            shape_analysis.GetShapeOrDataForValue(value);
+            shape_analysis->GetShapeOrDataForValue(value);
+        VLOG(8) << op.name()
+                << " origin_shape_or_data: " << origin_shape_or_data;
         const symbol::ShapeOrDataDimExprs& substituted_shape_or_data =
             SubstituteShapeOrData(origin_shape_or_data, substitution_pattern);
-        shape_analysis.SetShapeOrDataForValue(value, substituted_shape_or_data);
+        VLOG(8) << op.name()
+                << " substituted_shape_or_data: " << substituted_shape_or_data;
+        shape_analysis->SetShapeOrDataForValue(value,
+                                               substituted_shape_or_data);
       }
     });
+    if (op.num_results() > 0) {
+      pir::shape::SetShapeAttrForOp(
+          &op, shape_analysis->GetShapeOrDataForValue(op.result(0)));
+    } else {
+      pir::shape::SetShapeAttrForOp(
+          &op, shape_analysis->GetShapeOrDataForValue(op.operand_source(0)));
+    }
     // TODO(JiaWenxuan): substitute the attribute "sym_shape_str" of the op
   });
   VLOG(4) << "SubstituteDimExprBasedOnConstraints end";
@@ -163,8 +181,7 @@ class SubstituteDimExprBasedOnConstraintsPass : public pir::Pass {
       : pir::Pass("substitute_dim_expr_based_on_constraints_pass", 1) {}

   void Run(pir::Operation* op) override {
-    pir::ModuleOp module_op = op->dyn_cast<pir::ModuleOp>();
-    SubstituteDimExprBasedOnConstraints(module_op);
+    SubstituteDimExprBasedOnConstraints(op);
   }

   bool CanApplyOn(pir::Operation* op) const override {
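Both passes share the same traversal change: `VisitEachOp` now takes a generic `pir::Operation*` and recurses into any regions a visited op owns, so values nested inside control-flow bodies also get their symbolic shapes substituted and simplified. The shape of that walk, reduced to a sketch (accessor names follow the diff; this is not the full pir API):

    // Region-recursive walk: visit each op in each block, then descend into
    // ops that themselves own regions (if/while bodies, nested groups).
    template <typename OpT, typename Visitor>
    void Walk(OpT* op, const Visitor& visit) {
      for (uint32_t i = 0; i < op->num_regions(); ++i) {
        for (auto& block : op->region(i)) {
          for (auto& sub_op : block) {
            visit(sub_op);
            if (sub_op.num_regions() > 0) {
              Walk(&sub_op, visit);
            }
          }
        }
      }
    }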
From b726a9060f69f53a5dcf7a676338f899b05a060c Mon Sep 17 00:00:00 2001
From: Yuang Liu
Date: Thu, 7 Mar 2024 17:23:34 +0800
Subject: [PATCH 059/114] fix adamw loop out int32 bound (#62461)

---
 paddle/phi/kernels/gpu/adam_kernel.cu  | 8 ++++----
 paddle/phi/kernels/gpu/adamw_kernel.cu | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu
index 5292d7d29c07b..56be43fecb0d1 100644
--- a/paddle/phi/kernels/gpu/adam_kernel.cu
+++ b/paddle/phi/kernels/gpu/adam_kernel.cu
@@ -46,12 +46,12 @@ __global__ void AdamKernelREG(MT beta1,
                               T* param_out,
                               const MT* master_param,
                               MT* master_param_out,
-                              int ndim) {
+                              int64_t ndim) {
   MT lr = *lr_;
   MT beta1_pow = beta1_pow_;
   MT beta2_pow = beta2_pow_;

-  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t id = blockIdx.x * blockDim.x + threadIdx.x;

   for (; id < ndim; id += gridDim.x * blockDim.x) {
     MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
@@ -89,12 +89,12 @@ __global__ void AdamKernelMEM(MT beta1,
                               T* param_out,
                               const MT* master_param,
                               MT* master_param_out,
-                              int ndim) {
+                              int64_t ndim) {
   MT lr = *lr_;
   MT beta1_pow = *beta1_pow_;
   MT beta2_pow = *beta2_pow_;

-  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t id = blockIdx.x * blockDim.x + threadIdx.x;

   for (; id < ndim; id += gridDim.x * blockDim.x) {
     MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);

diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu
index d40fdf392b1a2..97d0563d51ff8 100644
--- a/paddle/phi/kernels/gpu/adamw_kernel.cu
+++ b/paddle/phi/kernels/gpu/adamw_kernel.cu
@@ -49,12 +49,12 @@ __global__ void AdamWKernelREG(MT beta1,
                                T* param_out,
                                const MT* master_param,
                                MT* master_param_out,
-                               int ndim) {
+                               int64_t ndim) {
   MT lr = *lr_ * lr_ratio;
   MT beta1_pow = beta1_pow_;
   MT beta2_pow = beta2_pow_;

-  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t id = blockIdx.x * blockDim.x + threadIdx.x;

   for (; id < ndim; id += gridDim.x * blockDim.x) {
     MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
@@ -98,12 +98,12 @@ __global__ void AdamWKernelMEM(MT beta1,
                                T* param_out,
                                const MT* master_param,
                                MT* master_param_out,
-                               int ndim) {
+                               int64_t ndim) {
   MT lr = *lr_ * lr_ratio;
   MT beta1_pow = *beta1_pow_;
   MT beta2_pow = *beta2_pow_;

-  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t id = blockIdx.x * blockDim.x + threadIdx.x;

   for (; id < ndim; id += gridDim.x * blockDim.x) {
     MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
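All eight hunks are the same one-line fix: with a 32-bit induction variable, a grid-stride loop over a tensor holding more than 2^31 - 1 elements overflows `int`, which is undefined behavior for signed integers, so elements past that index were never updated correctly. The corrected pattern, as a hedged host-side sketch:

    #include <cstdint>

    // Count, start and stride are all 64-bit, so `id` can address every
    // element of a tensor with more than INT32_MAX entries.
    void GridStrideUpdate(float* param, int64_t ndim, int64_t start,
                          int64_t stride) {
      for (int64_t id = start; id < ndim; id += stride) {
        param[id] *= 0.999f;  // stand-in for the per-element Adam update
      }
    }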
From d95713f858a6e06292d349d23ca1184cafdacdac Mon Sep 17 00:00:00 2001
From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com>
Date: Thu, 7 Mar 2024 17:29:16 +0800
Subject: [PATCH 060/114] [Fix bug](Fix compilation bug in flags.cc) (#62056)

* fix bug

* update
---
 paddle/common/flags.h         | 13 -------------
 paddle/common/flags_native.cc | 12 ++++++++++++
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/paddle/common/flags.h b/paddle/common/flags.h
index b9ca1a52c4c63..006f2fea5355d 100644
--- a/paddle/common/flags.h
+++ b/paddle/common/flags.h
@@ -122,19 +122,6 @@ PADDLE_API void ParseCommandLineFlags(int* argc, char*** argv);
  */
 PADDLE_API void AllowUndefinedFlags();

-/**
- * @brief Set flags from environment variables.
- *
- * It receives a list of flag names, and will find the corresponding environment
- * variables named "FLAGS_name", if found, it will set the environment variable
- * values to the flags. If error_fatal is true, the program will exit when the
- * environment variable is not set or the flag is not defined, that is the same
- * effect as using commandline argument "--fromenv=var_name1,var_name2,...".
- * Otherwise, the errors above will be ignored, that is the same effect as using
- * commandline argument "--tryfromenv=var_name1,var_name2,...".
- */
-void SetFlagsFromEnv(const std::vector<std::string>& flags, bool error_fatal);
-
 /**
  * @brief Set Single flag value, return true if success.
  */

diff --git a/paddle/common/flags_native.cc b/paddle/common/flags_native.cc
index 8229c6b0f0b1d..706419721d96f 100644
--- a/paddle/common/flags_native.cc
+++ b/paddle/common/flags_native.cc
@@ -362,6 +362,18 @@ bool GetValueFromEnv(const std::string& name, std::string* value) {
   return true;
 }

+/**
+ * @brief Set flags from environment variables.
+ *
+ * It receives a list of flag names, and will find the corresponding environment
+ * variables named "FLAGS_name", if found, it will set the environment variable
+ * values to the flags. If error_fatal is true, the program will exit when the
+ * environment variable is not set or the flag is not defined, that is the same
+ * effect as using commandline argument "--fromenv=var_name1,var_name2,...".
+ * Otherwise, the errors above will be ignored, that is the same effect as using
+ * commandline argument "--tryfromenv=var_name1,var_name2,...".
+ */
+
 void SetFlagsFromEnv(const std::vector<std::string>& flags, bool error_fatal) {
   bool success = true;
   for (const std::string& flag_name : flags) {
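Moving the doc comment next to the definition also keeps the behavior it documents in one place: for each listed flag name, look up the environment variable `FLAGS_<name>` and assign its value if present. The per-flag lookup is roughly the following (a simplified sketch mirroring `GetValueFromEnv` from the hunk's context; parsing and assignment into the flag are omitted):

    #include <cstdlib>
    #include <string>

    // Returns false when FLAGS_<name> is unset; that is fatal only in the
    // "--fromenv" (error_fatal == true) mode.
    bool ReadFlagFromEnv(const std::string& name, std::string* value) {
      const char* raw = std::getenv(("FLAGS_" + name).c_str());
      if (raw == nullptr) return false;
      *value = raw;
      return true;
    }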
From 8e8eb404aa231487e26e38062587b041f1ddb991 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Thu, 7 Mar 2024 17:34:46 +0800
Subject: [PATCH 061/114] Fix yiled yield, etc (#62457)

---
 .../transforms/cinn_group_cluster_pass.cc       |  4 ++--
 .../divide_group_op_to_fusion_op_pass.cc        | 10 +++++-----
 .../group_merge/group_with_group_merge_pass.cc  | 16 ++++++++--------
 .../default_horizontal_fuse_pass.cc             |  2 +-
 .../default_input_fuse_pass.cc                  |  2 +-
 .../default_recompute_fuse_pass.cc              |  2 +-
 .../default_vertical_fuse_pass.cc               |  4 ++--
 .../horizontal_fuse_util.h                      |  2 +-
 .../vertical_fuse_util.h                        |  2 +-
 9 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
index 1c4e842b79bd7..62c7eeccc6c9e 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
@@ -728,7 +728,7 @@ std::vector OpMergeWithOp(cinn::dialect::GroupOp group_op) {
     if (yield_output_ops.count(op) ||
         cinn::hlir::framework::pir::CompatibleInfo::OpKind(*op) ==
             cinn::hlir::framework::kReduction) {
-      // TODO(phlrain): yiled output no nedd to push into first stage output,
+      // TODO(phlrain): yield output no need to push into first stage output,
       // Update here
       if (!first_output_ops.count(op)) {
         first_stage_output.push_back(op_path[op]);
@@ -846,7 +846,7 @@ class CinnGroupClusterPattern
       auto find_it = all_output_values.find(output_values[i]);
       if ((find_it != all_output_values.end()) &&
           (find_it->second < group_op->num_results())) {
-        // id < num_results means yiled input
+        // id < num_results means yield input
         rewriter.ReplaceAllUsesWith(group_op.result(find_it->second),
                                     new_group_op->result(i));
       }

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/divide_group_op_to_fusion_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/divide_group_op_to_fusion_op_pass.cc
index 886cc29efa5b1..70b9bd106d077 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/divide_group_op_to_fusion_op_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/divide_group_op_to_fusion_op_pass.cc
@@ -124,13 +124,13 @@ class GroupOpPattern : public pir::OpRewritePattern {
     auto& shape_analysis =
         pir::ShapeAnalysisManager::Instance().Get(group_op->GetParentProgram());
     // Record map info for yield value to each fusion_op's result
-    std::unordered_map<::pir::Value, ::pir::Value> fusion_yiled_values;
+    std::unordered_map<::pir::Value, ::pir::Value> fusion_yield_values;

     const auto& TryReplaceOperandSource = [&](::pir::Operation* op) {
       for (auto& operand : op->operands()) {
         const auto value = operand.source();
-        if (fusion_yiled_values.find(value) != fusion_yiled_values.end()) {
-          operand.set_source(fusion_yiled_values.at(value));
+        if (fusion_yield_values.find(value) != fusion_yield_values.end()) {
+          operand.set_source(fusion_yield_values.at(value));
         }
       }
     };
@@ -158,9 +158,9 @@ class GroupOpPattern : public pir::OpRewritePattern {
       auto fusion_op = CreateFusionOp(vec_outs, group);

       for (size_t i = 0; i < fusion_op.num_results(); ++i) {
-        CHECK(fusion_yiled_values.insert({vec_outs[i], fusion_op.result(i)})
+        CHECK(fusion_yield_values.insert({vec_outs[i], fusion_op.result(i)})
                   .second)
-            << "fusion_yiled_values already has key!";
+            << "fusion_yield_values already has key!";
         const auto& shape_expr =
             shape_analysis.GetShapeOrDataForValue(vec_outs[i]);
         shape_analysis.SetShapeOrDataForValue(fusion_op.result(i), shape_expr);

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc
index 81606a320cdcc..5c3e9a9670ced 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc
@@ -431,7 +431,7 @@ template
 struct HorizontalFuseUtil {
   using KindKeyT = std::pair<OpPatternKind, OpPatternKind>;

-  static bool DetectFusabilityByKind(FusePassCtxT* ctx,
+  static bool DetectFusibilityByKind(FusePassCtxT* ctx,
                                      const OpGroupPtr& src,
                                      const OpGroupPtr& dst) {
     const KindKeyT kind_pair(src.kind(), dst.kind());
@@ -590,7 +590,7 @@ class DefaultInputFusePass final : public InputFusePass {
         bool fusionable = false;
         for (auto& groups : fusionable_consumers) {
           auto& last = groups.back();
-          if (!HorizontalFuseUtil::DetectFusabilityByKind(
+          if (!HorizontalFuseUtil::DetectFusibilityByKind(
                   ctx, candidate, last)) {
             continue;
           }
@@ -681,7 +681,7 @@ class DefaultHorizontalFusePass final : public HorizontalFusePass {
         bool fusionable = false;
         for (auto& groups : fusionable_consumers) {
           auto& last = groups.back();
-          if (!HorizontalFuseUtil::DetectFusabilityByKind(
+          if (!HorizontalFuseUtil::DetectFusibilityByKind(
                   ctx, candidate, last)) {
             continue;
           }
@@ -752,7 +752,7 @@ class DefaultVerticalFusePass final : public VerticalFusePass {
     std::vector candidates;
     for (size_t i = 0; i < consumers.size(); ++i) {
       const auto& consumer = consumers.at(i);
-      if (!DetectFusabilityByKind(ctx, producer, consumer)) {
+      if (!DetectFusibilityByKind(ctx, producer, consumer)) {
         break;
       }
       candidates.push_back(consumer);
@@ -764,7 +764,7 @@ class DefaultVerticalFusePass final : public VerticalFusePass {
     for (size_t i = 0; i < consumers.size(); ++i) {
       const auto& consumer = consumers.at(i);
-      if (!DetectFusabilityByKind(ctx, producer, consumer)) {
+      if (!DetectFusibilityByKind(ctx, producer, consumer)) {
         continue;
       }
       if (ctx->fuse_helper().DetectCycleIfFuse(producer, consumer)) {
@@ -776,7 +776,7 @@ class DefaultVerticalFusePass final : public VerticalFusePass {
   }

   using KindKeyT = std::pair<OpPatternKind, OpPatternKind>;
-  bool DetectFusabilityByKind(LightwareFusePassCtx* ctx,
+  bool DetectFusibilityByKind(LightwareFusePassCtx* ctx,
                               const OpGroupPtr& src,
                               const OpGroupPtr& dst) const {
     const KindKeyT kind_pair(src.kind(), dst.kind());
@@ -941,7 +941,7 @@ class DefaultRecomputeFusePass final : public RecomputeFusePass {
     std::vector candidates;
     for (size_t i = 0; i < consumers.size(); ++i) {
       const auto& consumer = consumers.at(i);
-      if (!DetectFusabilityByKind(ctx, producer, consumer)) {
+      if (!DetectFusibilityByKind(ctx, producer, consumer)) {
         continue;
       }
       unsafe_candidates.push_back(consumer);
@@ -960,7 +960,7 @@ class DefaultRecomputeFusePass final : public RecomputeFusePass {
   }

   using KindKeyT = std::pair<OpPatternKind, OpPatternKind>;
-  bool DetectFusabilityByKind(LightwareFusePassCtx* ctx,
+  bool DetectFusibilityByKind(LightwareFusePassCtx* ctx,
                               const OpGroupPtr& src,
                               const OpGroupPtr& dst) const {
     const KindKeyT kind_pair(src.kind(), dst.kind());

diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_horizontal_fuse_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_horizontal_fuse_pass.cc
index e953caf20ab7a..642ad8acf6aec 100644
--- a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_horizontal_fuse_pass.cc
+++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_horizontal_fuse_pass.cc
@@ -62,7 +62,7 @@ class DefaultHorizontalFusePass final : public HorizontalFusePass {
         bool fusionable = false;
         for (auto& groups : fusionable_consumers) {
           auto& last = groups.back();
-          if (!HorizontalFuseUtil::DetectFusabilityByKind(
+          if (!HorizontalFuseUtil::DetectFusibilityByKind(
                   ctx, candidate, last)) {
             continue;
           }

diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_input_fuse_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_input_fuse_pass.cc
index 7dc68d65599f9..1f251af14e212 100644
--- a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_input_fuse_pass.cc
+++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_input_fuse_pass.cc
@@ -63,7 +63,7 @@ class DefaultInputFusePass final : public InputFusePass {
         bool fusionable = false;
         for (auto& groups : fusionable_consumers) {
           auto& last = groups.back();
-          if (!HorizontalFuseUtil::DetectFusabilityByKind(
+          if (!HorizontalFuseUtil::DetectFusibilityByKind(
                   ctx, candidate, last)) {
             continue;
           }

diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_recompute_fuse_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_recompute_fuse_pass.cc
index 137a470d5993d..c1eab18569a8c 100644
--- a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_recompute_fuse_pass.cc
+++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_recompute_fuse_pass.cc
@@ -44,7 +44,7 @@ class DefaultRecomputeFusePass final : public RecomputeFusePass {
     std::vector candidates;
     for (int i = 0; i < consumers.size(); ++i) {
       const auto& consumer = consumers.at(i);
-      if (!VerticalFuseUtil::DetectFusabilityByKind(ctx, producer, consumer)) {
+      if (!VerticalFuseUtil::DetectFusibilityByKind(ctx, producer, consumer)) {
        continue;
       }
       unsafe_candidates.push_back(consumer);

diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_vertical_fuse_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_vertical_fuse_pass.cc
index fcffcb6be03f8..eb74a622db21d 100644
--- a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_vertical_fuse_pass.cc
+++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_vertical_fuse_pass.cc
@@ -46,7 +46,7 @@ class DefaultVerticalFusePass final : public VerticalFusePass {
     std::vector candidates;
     for (int i = 0; i < consumers.size(); ++i) {
       const auto& consumer = consumers.at(i);
-      if (!VerticalFuseUtil::DetectFusabilityByKind(ctx, producer, consumer)) {
+      if (!VerticalFuseUtil::DetectFusibilityByKind(ctx, producer, consumer)) {
         break;
       }
       candidates.push_back(consumer);
@@ -58,7 +58,7 @@ class DefaultVerticalFusePass final : public VerticalFusePass {
     for (int i = 0; i < consumers.size(); ++i) {
       const auto& consumer = consumers.at(i);
-      if (!VerticalFuseUtil::DetectFusabilityByKind(ctx, producer, consumer)) {
+      if (!VerticalFuseUtil::DetectFusibilityByKind(ctx, producer, consumer)) {
         continue;
       }
       if (ctx->fuse_helper().DetectCycleIfFuse(producer, consumer)) {

diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass/horizontal_fuse_util.h b/paddle/cinn/hlir/pass/general_fusion_merge_pass/horizontal_fuse_util.h
index 81b170637e54d..56612879b6770 100644
--- a/paddle/cinn/hlir/pass/general_fusion_merge_pass/horizontal_fuse_util.h
+++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass/horizontal_fuse_util.h
@@ -29,7 +29,7 @@ template
 struct HorizontalFuseUtil {
   using KindKeyT = std::pair<OpPatternKind, OpPatternKind>;

-  static bool DetectFusabilityByKind(FusePassCtxT* ctx,
+  static bool DetectFusibilityByKind(FusePassCtxT* ctx,
                                      const OpGroupPtr& src,
                                      const OpGroupPtr& dst) {
     const KindKeyT kind_pair(src.kind(), dst.kind());

diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass/vertical_fuse_util.h b/paddle/cinn/hlir/pass/general_fusion_merge_pass/vertical_fuse_util.h
index 4845af9ea94eb..9c754d59bac42 100644
--- a/paddle/cinn/hlir/pass/general_fusion_merge_pass/vertical_fuse_util.h
+++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass/vertical_fuse_util.h
@@ -29,7 +29,7 @@ using framework::OpPatternKind;
 struct VerticalFuseUtil {
   using KindKeyT = std::pair<OpPatternKind, OpPatternKind>;

-  static bool DetectFusabilityByKind(LightwareFusePassCtx* ctx,
+  static bool DetectFusibilityByKind(LightwareFusePassCtx* ctx,
                                      const OpGroupPtr& src,
                                      const OpGroupPtr& dst) {
     const KindKeyT kind_pair(src.kind(), dst.kind());

From 9cc505e1e7f0ac3f0600a06758ffd45beb130b57 Mon Sep 17 00:00:00 2001
From: liuzhenhai93
Date: Thu, 7 Mar 2024 20:04:17 +0800
Subject: [PATCH 062/114] Fix semi static split with section op (#62516)

* polish

* polish
---
 .../distributed/auto_parallel/static/operators/dist_split.py | 4 ++--
 python/paddle/nn/functional/loss.py                          | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_split.py b/python/paddle/distributed/auto_parallel/static/operators/dist_split.py
index fff9294696875..25e3a776fe4d4 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_split.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_split.py
@@ -49,7 +49,7 @@ def update_dims_mapping(dist_op):
     num = op_desc.attr('num')
     sections = op_desc.attr('sections')

-    if num is not None:
+    if num:
         assert (sections is None) or (
             len(sections) == 0
         ), f"Both Attributes of num: {num} and sections: {sections} are specified."
@@ -57,7 +57,7 @@ def update_dims_mapping(dist_op):
         rule_type = "split_with_num"
     else:
         assert (
-            num is None
+            not num
         ), f"Both Attributes of num: {num} and sections: {sections} are specified."
         first_attr = sections
         rule_type = "split"

diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 446eb7d62a2f5..5741f0a643db0 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -2945,7 +2945,7 @@ def cross_entropy(
         check_variable_and_dtype(
             input,
             'input',
-            ['float16', 'float32', 'float64'],
+            ['uint16', 'float16', 'float32', 'float64'],
             'softmax_cross_entropy',
         )
         check_variable_and_dtype(

From 24777d45e3411ec117a8f72aa8a167620996c38b Mon Sep 17 00:00:00 2001
From: wanghuancoder
Date: Fri, 8 Mar 2024 09:47:15 +0800
Subject: [PATCH 063/114] delete IR_ENFORCE (#62515)

---
 .../fluid/pir/drr/src/ir_operation_factory.cc | 208 ++++++++++--------
 1 file changed, 113 insertions(+), 95 deletions(-)

diff --git a/paddle/fluid/pir/drr/src/ir_operation_factory.cc b/paddle/fluid/pir/drr/src/ir_operation_factory.cc
index de796c50e67d3..14c91e20e6f40 100644
--- a/paddle/fluid/pir/drr/src/ir_operation_factory.cc
+++ b/paddle/fluid/pir/drr/src/ir_operation_factory.cc
@@ -66,111 +66,129 @@ void OperationFactory::RegisterManualOpCreator() {
       });

 #ifdef PADDLE_WITH_DNNL
-  op_creator_map["onednn_op.conv2d_transpose_bias"] =
-      [](const std::vector& inputs,
-         const pir::AttributeMap& attrs,
-         pir::PatternRewriter& rewriter) {
-        if (inputs.size() == 4) {
-          IR_ENFORCE(
-              attrs.find("strides") != attrs.end(),
-              "'strides' Attribute is expected for Conv2dTransposeBiasOp. ");
-          std::vector strides;
-          for (size_t i = 0;
-               i < attrs.at("strides").dyn_cast().size();
-               i++) {
-            strides.push_back(attrs.at("strides")
-                                  .dyn_cast()
-                                  .at(i)
-                                  .dyn_cast()
-                                  .data());
-          }
-
-          IR_ENFORCE(
-              attrs.find("paddings") != attrs.end(),
-              "'paddings' Attribute is expected for Conv2dTransposeBiasOp. ");
-          std::vector paddings;
-          for (size_t i = 0;
-               i < attrs.at("paddings").dyn_cast().size();
-               i++) {
-            paddings.push_back(attrs.at("paddings")
-                                   .dyn_cast()
-                                   .at(i)
-                                   .dyn_cast()
-                                   .data());
-          }
-
-          IR_ENFORCE(attrs.find("output_padding") != attrs.end(),
-                     "'output_padding' Attribute is expected for "
-                     "Conv2dTransposeBiasOp. ");
-          std::vector output_padding;
-          for (size_t i = 0; i < attrs.at("output_padding")
-                                     .dyn_cast()
-                                     .size();
-               i++) {
-            output_padding.push_back(attrs.at("output_padding")
-                                         .dyn_cast()
-                                         .at(i)
-                                         .dyn_cast()
-                                         .data());
-          }
-
-          IR_ENFORCE(attrs.find("padding_algorithm") != attrs.end(),
-                     "'padding_algorithm' Attribute is expected for "
-                     "Conv2dTransposeBiasOp. ");
-          std::string padding_algorithm = attrs.at("padding_algorithm")
-                                              .dyn_cast()
-                                              .AsString();
-
-          IR_ENFORCE(
-              attrs.find("groups") != attrs.end(),
-              "'groups' Attribute is expected for Conv2dTransposeBiasOp. ");
-          int groups =
-              attrs.at("groups").dyn_cast().data();
-
-          IR_ENFORCE(
-              attrs.find("dilations") != attrs.end(),
-              "'dilations' Attribute is expected for Conv2dTransposeBiasOp. ");
-          std::vector dilations;
-          for (size_t i = 0;
-               i < attrs.at("dilations").dyn_cast().size();
-               i++) {
-            dilations.push_back(attrs.at("dilations")
-                                    .dyn_cast()
-                                    .at(i)
-                                    .dyn_cast()
-                                    .data());
-          }
-
-          IR_ENFORCE(attrs.find("data_format") != attrs.end(),
-                     "'data_format' Attribute is expected for "
-                     "Conv2dTransposeBiasOp. ");
-          std::string data_format =
-              attrs.at("data_format").dyn_cast().AsString();
-
-          IR_ENFORCE(
-              attrs.find("is_test") != attrs.end(),
-              "'is_test' Attribute is expected for Conv2dTransposeBiasOp. ");
-          bool is_test =
-              attrs.at("is_test").dyn_cast().data();
-
-          return rewriter.Build(
-              inputs[0],
-              inputs[1],
-              inputs[2],
-              inputs[3],
-              strides,
-              paddings,
-              output_padding,
-              padding_algorithm,
-              groups,
-              dilations,
-              data_format,
-              is_test);
-        }
-
-        return rewriter.Build(
-            inputs[0], inputs[1], inputs[2], attrs);
-      };
+  op_creator_map["onednn_op.conv2d_transpose_bias"] = [](const std::vector<
+                                                             pir::Value>&
+                                                             inputs,
+                                                         const pir::
+                                                             AttributeMap&
+                                                                 attrs,
+                                                         pir::PatternRewriter&
+                                                             rewriter) {
+    if (inputs.size() == 4) {
+      PADDLE_ENFORCE_EQ(
+          attrs.find("strides") != attrs.end(),
+          true,
+          phi::errors::InvalidArgument(
+              "'strides' Attribute is expected for Conv2dTransposeBiasOp. "));
+      std::vector strides;
+      for (size_t i = 0;
+           i < attrs.at("strides").dyn_cast().size();
+           i++) {
+        strides.push_back(attrs.at("strides")
+                              .dyn_cast()
+                              .at(i)
+                              .dyn_cast()
+                              .data());
+      }

+      PADDLE_ENFORCE_EQ(
+          attrs.find("paddings") != attrs.end(),
+          true,
+          phi::errors::InvalidArgument(
+              "'paddings' Attribute is expected for Conv2dTransposeBiasOp. "));
+      std::vector paddings;
+      for (size_t i = 0;
+           i < attrs.at("paddings").dyn_cast().size();
+           i++) {
+        paddings.push_back(attrs.at("paddings")
+                               .dyn_cast()
+                               .at(i)
+                               .dyn_cast()
+                               .data());
+      }

+      PADDLE_ENFORCE_EQ(attrs.find("output_padding") != attrs.end(),
+                        true,
+                        phi::errors::InvalidArgument(
+                            "'output_padding' Attribute is expected for "
+                            "Conv2dTransposeBiasOp. "));
+      std::vector output_padding;
+      for (size_t i = 0;
+           i <
+           attrs.at("output_padding").dyn_cast().size();
+           i++) {
+        output_padding.push_back(attrs.at("output_padding")
+                                     .dyn_cast()
+                                     .at(i)
+                                     .dyn_cast()
+                                     .data());
+      }

+      PADDLE_ENFORCE_EQ(attrs.find("padding_algorithm") != attrs.end(),
+                        true,
+                        phi::errors::InvalidArgument(
+                            "'padding_algorithm' Attribute is expected for "
+                            "Conv2dTransposeBiasOp. "));
+      std::string padding_algorithm = attrs.at("padding_algorithm")
+                                          .dyn_cast()
+                                          .AsString();

+      PADDLE_ENFORCE_EQ(
+          attrs.find("groups") != attrs.end(),
+          true,
+          phi::errors::InvalidArgument(
+              "'groups' Attribute is expected for Conv2dTransposeBiasOp. "));
+      int groups = attrs.at("groups").dyn_cast().data();

+      PADDLE_ENFORCE_EQ(
+          attrs.find("dilations") != attrs.end(),
+          true,
+          phi::errors::InvalidArgument(
+              "'dilations' Attribute is expected for Conv2dTransposeBiasOp. "));
+      std::vector dilations;
+      for (size_t i = 0;
+           i < attrs.at("dilations").dyn_cast().size();
+           i++) {
+        dilations.push_back(attrs.at("dilations")
+                                .dyn_cast()
+                                .at(i)
+                                .dyn_cast()
+                                .data());
+      }

+      PADDLE_ENFORCE_EQ(attrs.find("data_format") != attrs.end(),
+                        true,
+                        phi::errors::InvalidArgument(
+                            "'data_format' Attribute is expected for "
+                            "Conv2dTransposeBiasOp. "));
+      std::string data_format =
+          attrs.at("data_format").dyn_cast().AsString();

+      PADDLE_ENFORCE_EQ(
+          attrs.find("is_test") != attrs.end(),
+          true,
+          phi::errors::InvalidArgument(
+              "'is_test' Attribute is expected for Conv2dTransposeBiasOp. "));
+      bool is_test = attrs.at("is_test").dyn_cast().data();

+      return rewriter.Build(
+          inputs[0],
+          inputs[1],
+          inputs[2],
+          inputs[3],
+          strides,
+          paddings,
+          output_padding,
+          padding_algorithm,
+          groups,
+          dilations,
+          data_format,
+          is_test);
+    }

+    return rewriter.Build(
+        inputs[0], inputs[1], inputs[2], attrs);
+  };
 #endif
 }
From 7b1540aa486c4668d78e4a5fb8bb619f5a499647 Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Fri, 8 Mar 2024 09:51:11 +0800
Subject: [PATCH 064/114] group cluster support control flow (#62523)

---
 .../operator/transforms/cinn_group_cluster_pass.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
index 62c7eeccc6c9e..542f73cb0811e 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
@@ -872,7 +872,7 @@ class CinnGroupClusterPass : public pir::PatternRewritePass {
   }

   bool CanApplyOn(pir::Operation* op) const override {
-    return op->isa<pir::ModuleOp>() && op->num_regions() > 0;
+    return op->num_regions() > 0;
   }
 };

From 3646da6020f72da65b3c5cb7c87361a22703825c Mon Sep 17 00:00:00 2001
From: Ghost Screaming
Date: Fri, 8 Mar 2024 10:25:01 +0800
Subject: [PATCH 065/114] [AutoParallel] Fix problem of expand_as. (#62460)

* [AutoParallel] Fix problem of expand_as. It needs to calculate local
  shape in auto parallel dynamic graph mode.

* Remove useless print.

* Polish code according to comments.
---
 paddle/fluid/operators/generator/parse_utils.py |  2 +-
 paddle/phi/api/yaml/generator/dist_api_gen.py   | 93 +++++++++++--------
 paddle/phi/api/yaml/legacy_ops.yaml             |  1 +
 paddle/phi/api/yaml/ops.yaml                    |  1 +
 4 files changed, 55 insertions(+), 42 deletions(-)

diff --git a/paddle/fluid/operators/generator/parse_utils.py b/paddle/fluid/operators/generator/parse_utils.py
index 0370d6cfba4b3..38a87efec0415 100644
--- a/paddle/fluid/operators/generator/parse_utils.py
+++ b/paddle/fluid/operators/generator/parse_utils.py
@@ -369,7 +369,7 @@ def check_op_config(op_entry, op_name):
         'traits',
         'interfaces',
     )
-    infer_meta_key_set = ('func', 'param', 'spmd_rule')
+    infer_meta_key_set = ('func', 'param', 'spmd_rule', 'local_shape')
     kernel_key_set = (
         'func',
         'param',

diff --git a/paddle/phi/api/yaml/generator/dist_api_gen.py b/paddle/phi/api/yaml/generator/dist_api_gen.py
index d0b82f3be9f70..ad153639c4d56 100644
--- a/paddle/phi/api/yaml/generator/dist_api_gen.py
+++ b/paddle/phi/api/yaml/generator/dist_api_gen.py
@@ -483,53 +483,56 @@
     // API `{}` does not need to set DistAttr for output."""

 # TODO(GhostScreaming): Support aliquant condition.
-# Specialized Code, for example, reshape needs to calculate local_shape
-RESHAPE_CALCULATE_LOCAL_SHAPE_TEMPLATE = """
+# Operators like `reshape`, `expand_as` need to calculate local_shape
+# for their local `DenseTensor`, as the given shape in their attribute
+# is global_shape for `DistTensor`.
+CALCULATE_LOCAL_SHAPE_TEMPLATE = """
   // The dist_input_x is a dist tensor, the dims() func return the global dims.
   auto x_shape = dist_input_x->dims();
   auto x_numel = dist_input_x->numel();
   bool visit_negative = false;
-  std::vector local_shape;
-  for (size_t i = 0; i < shape.GetData().size(); i++) {
+  auto global_shape = {shape};
+  std::vector<{dtype}> local_shape;
+  for (size_t i = 0; i < global_shape.size(); i++) {{
    auto& out_dist_attr = PADDLE_GET_CONST(phi::distributed::TensorDistAttr, spmd_info.second[0]);
-    if (out_dist_attr.dims_mapping()[i] >= 0) {
-      int64_t shape_i = shape.GetData()[i];
-      if (shape_i == 0) {
+    if (out_dist_attr.dims_mapping()[i] >= 0) {{
+      {dtype} shape_i = global_shape[i];
+      if (shape_i == 0) {{
         shape_i = x_shape[i];
-      } else if (shape_i == -1) {
+      }} else if (shape_i == -1) {{
        PADDLE_ENFORCE(not visit_negative,
                        phi::errors::InvalidArgument(
-                           "Reshape can only have one -1 in the shape."));
+                           "{op_name} can only have one -1 in the {shape_name}."));
         visit_negative = true;
         int64_t non_negative_product = 1;
-        for (size_t j = 0; j < shape.GetData().size(); j++) {
-          if (i == j) {
+        for (size_t j = 0; j < global_shape.size(); j++) {{
+          if (i == j) {{
             continue;
-          }
-          int64_t tmp_j = shape.GetData()[j];
-          if (tmp_j == 0) {
+          }}
+          int64_t tmp_j = global_shape[j];
+          if (tmp_j == 0) {{
             tmp_j = x_shape[j];
-          }
+          }}
           non_negative_product *= tmp_j;
-        }
+        }}
        PADDLE_ENFORCE(x_numel % non_negative_product == 0,
                        phi::errors::InvalidArgument("Cannot infer real shape for -1."));
         shape_i = x_numel / non_negative_product;
-      }
+      }}
       int64_t dim = out_dist_attr.dims_mapping()[i];
       int64_t mesh_dim = out_dist_attr.process_mesh().shape()[dim];
       // TODO: Support aliquant condition.
      PADDLE_ENFORCE(shape_i % mesh_dim == 0,
            phi::errors::InvalidArgument(
-                "Reshape only support local shape dim is divisible "
+                "{op_name} only support local shape dim is divisible "
                 "by the mesh dim, however local_shape[%lld] is %lld "
                 "and shard mesh dims is %lld.", i, shape_i, mesh_dim));
       local_shape.push_back(shape_i / mesh_dim);
-    } else {
-      local_shape.push_back(shape.GetData()[i]);
-    }
-  }
+    }} else {{
+      local_shape.push_back({shape}[i]);
+    }}
+  }}
 """

 # BaseAPI members:
@@ -590,7 +593,11 @@ def parse_infer_meta(self, infer_meta_config):
             infer_meta['param'] = None
         if 'spmd_rule' not in infer_meta_config:
             infer_meta['spmd_rule'] = None
-
+        # Operators like `reshape`, `expand_as` need to calculate local_shape
+        # for their local `DenseTensor`, as the given shape in their attribute
+        # is global_shape for `DistTensor`.
+        if 'local_shape' not in infer_meta_config:
+            infer_meta['local_shape'] = None
         return infer_meta

     def need_to_generate_code_for_inplace_impl(self, i):
@@ -613,17 +620,6 @@ def need_to_generate_code_for_inplace_or_view_impl(self, i):
             i
         ) or self.need_to_generate_code_for_view_impl(i)

-    # # view output is also inlace, such case still needs
-    # # to create an empty DenseTensor for inplace output in pp
-    # def need_to_set_inplace_output_for_pp_impl(self, i):
-    #     return (not self.need_to_generate_code_for_view_impl(i)) and self.is_inplace_output(i)
-
-    def is_reshape_kernel(self):
-        return (
-            "reshape" in self.kernel['func'][0]
-            and 'grad' not in self.kernel['func'][0]
-        )
-
     def is_inplace_output(self, i):
         return self.outputs['names'][i] in self.inplace_map
@@ -1548,8 +1544,8 @@ def generate_infer_meta_code(self) -> str:
                     f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported."
                 )
             elif param in attr_names:
-                # TODO(GhostScreaming): reshape kernel need specialized process
-                if self.is_reshape_kernel() and param == "shape":
+                # TODO(GhostScreaming): kernel like reshape need calculate local_shape
+                if self.infer_meta['local_shape'] is not None:
                     input_args_code = input_args_code + "local_shape" + ", "
                 else:
                     input_args_code = input_args_code + param + ", "
@@ -1582,9 +1578,24 @@
             output_args_code = output_args_code[:-2]

         infer_meta_code = ""
-        # TODO(GhostScreaming): reshape kernel need specialized process
-        if self.is_reshape_kernel():
-            infer_meta_code = RESHAPE_CALCULATE_LOCAL_SHAPE_TEMPLATE
+        # TODO(GhostScreaming): kernel like reshape need calculate local_shape
+        if self.infer_meta['local_shape'] is not None:
+            shape_name = self.infer_meta['local_shape']
+            assert (
+                shape_name in self.attrs['names']
+            ), f"Auto Parallel will calculate local_shape {shape_name} for"
+            "operator {self.kernel['func'][0]}, but {shape_name} is not"
+            "found in its attributes."
+            shape_type = self.attrs['attr_info'][shape_name][0]
+
+            infer_meta_code = CALCULATE_LOCAL_SHAPE_TEMPLATE.format(
+                shape=f"{shape_name}.GetData()"
+                if shape_type == "IntArray"
+                else f"{shape_name}",
+                dtype="int64_t" if shape_type == "IntArray" else "int",
+                op_name=self.kernel['func'][0],
+                shape_name=shape_name,
+            )
         infer_meta_code = infer_meta_code + INFER_META_TEMPLATE.format(
             infer_meta_func_code, input_args_code, output_args_code
         )
@@ -1637,8 +1648,8 @@ def generate_kernel_call_code(self) -> str:
             elif arg in attr_names:
                 if 'IntArray' in self.attrs['attr_info'][arg][0]:
                     kernel_args_type_list.append('const phi::IntArray&')
-                    # TODO(GhostScreaming): reshape kernel need specialized process
-                    if self.is_reshape_kernel() and arg == "shape":
+                    # TODO(GhostScreaming): kernel like reshape need calculate local_shape
+                    if self.infer_meta['local_shape'] is not None:
                         arg = 'phi::IntArray(local_shape)'
                     else:
                         arg = 'phi::IntArray(' + arg + ')'

diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index a629ab70cd109..e27e5de111bc8 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -1005,6 +1005,7 @@
   infer_meta :
     func : ReshapeWithXShapeInferMeta
     spmd_rule : ReshapeInferSpmdDynamic
+    local_shape: shape
   kernel :
     func : reshape
   inplace : (x -> out)

diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index 35ccab6221eb6..ce7d9e935247d 100755
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -946,6 +946,7 @@
   output : Tensor(out)
   infer_meta :
     func : ExpandAsInferMeta
+    local_shape: target_shape
   kernel :
     func : expand_as
     data_type : x
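The generated template encodes one rule worth stating directly: a dimension whose `dims_mapping` entry names a mesh axis stores `global_extent / mesh_dim` elements per rank, while a `-1` entry means the dimension is replicated and keeps its global extent. A standalone sketch of that rule (it assumes divisibility, as the template's enforce does; names are illustrative):

    #include <cstdint>
    #include <vector>

    std::vector<int64_t> LocalShape(const std::vector<int64_t>& global_shape,
                                    const std::vector<int64_t>& dims_mapping,
                                    const std::vector<int64_t>& mesh_shape) {
      std::vector<int64_t> local(global_shape);
      for (size_t i = 0; i < global_shape.size(); ++i) {
        int64_t axis = dims_mapping[i];
        if (axis >= 0) {
          local[i] = global_shape[i] / mesh_shape[axis];  // sharded dim
        }
      }
      return local;
    }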
From 70cd811c622a4c83b79d2eda7bff8a6c407583f9 Mon Sep 17 00:00:00 2001
From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com>
Date: Fri, 8 Mar 2024 10:39:11 +0800
Subject: [PATCH 066/114] [Auto Parallel] Add spmd rule for scatter_grad and
 gather_grad (#62099)

* add scatter_grad spmd rule

* add gather_grad spmd rule

* bug fix
---
 paddle/phi/infermeta/spmd_rules/gather.cc  |  41 ++++++
 paddle/phi/infermeta/spmd_rules/gather.h   |   5 +
 paddle/phi/infermeta/spmd_rules/scatter.cc |  37 ++++++
 paddle/phi/infermeta/spmd_rules/scatter.h  |   4 +
 test/cpp/auto_parallel/spmd_rule_test.cc   | 142 +++++++++++++++++++++
 5 files changed, 229 insertions(+)

diff --git a/paddle/phi/infermeta/spmd_rules/gather.cc b/paddle/phi/infermeta/spmd_rules/gather.cc
index c8fae74253e8c..014c5f358dd73 100644
--- a/paddle/phi/infermeta/spmd_rules/gather.cc
+++ b/paddle/phi/infermeta/spmd_rules/gather.cc
@@ -174,5 +174,46 @@ SpmdInfo GatherInferSpmdReverseDynamic(const DistMetaTensor& x,
   return GatherInferSpmdReverseBase(x, index, out, axis.to());
 }

+SpmdInfo GatherGradInferSpmd(const DistMetaTensor& x,
+                             const DistMetaTensor& index,
+                             const DistMetaTensor& out_grad,
+                             const Scalar& axis) {
+  EXTRACT_SHAPE_AND_DIST_ATTR(x);
+  EXTRACT_SHAPE_AND_DIST_ATTR(out_grad);
+  auto index_shape = common::vectorize(index.dims());
+  int index_ndim = index_shape.size();
+  TensorDistAttr index_dist_attr_src = index.dist_attr();
+  std::vector index_dims_mapping_src =
+      index_dist_attr_src.dims_mapping();
+  int axis_ = axis.to();
+
+  // TODO(zhangyichen): support shard on index and out_grad[axis]
+  std::vector out_grad_dims_mapping_dst(out_grad_dims_mapping_src);
+  TensorDistAttr out_grad_dist_attr_dst(out_grad_dist_attr_src);
+  if (index_ndim == 0) {
+    out_grad_dims_mapping_dst.insert(out_grad_dims_mapping_dst.begin() + axis_,
+                                     -1);
+  } else {
+    out_grad_dims_mapping_dst[axis_] = -1;
+    out_grad_dist_attr_dst.set_dims_mapping(out_grad_dims_mapping_dst);
+  }
+
+  std::vector index_dims_mapping_dst(index_dims_mapping_src);
+  TensorDistAttr index_dist_attr_dst(index_dims_mapping_src);
+  index_dims_mapping_dst[axis_] = -1;
+  index_dist_attr_dst.set_dims_mapping(index_dims_mapping_dst);
+
+  std::vector x_grad_dims_mapping(x_dims_mapping_src);
+  for (int i = 0; i < x_ndim; ++i) {
+    x_grad_dims_mapping[i] = out_grad_dims_mapping_dst[i];
+  }
+
+  TensorDistAttr x_grad_dist_attr(x_dist_attr_src);
+  x_grad_dist_attr.set_dims_mapping(x_grad_dims_mapping);
+
+  return {{x_dist_attr_src, index_dist_attr_dst, out_grad_dist_attr_dst},
+          {x_grad_dist_attr}};
+}
+
 }  // namespace distributed
 }  // namespace phi

diff --git a/paddle/phi/infermeta/spmd_rules/gather.h b/paddle/phi/infermeta/spmd_rules/gather.h
index c3a12941cdb19..7dd829094ca57 100644
--- a/paddle/phi/infermeta/spmd_rules/gather.h
+++ b/paddle/phi/infermeta/spmd_rules/gather.h
@@ -40,5 +40,10 @@ SpmdInfo GatherInferSpmdReverseDynamic(const DistMetaTensor& x,
                                        const DistMetaTensor& out,
                                        const Scalar& axis);

+SpmdInfo GatherGradInferSpmd(const DistMetaTensor& x,
+                             const DistMetaTensor& index,
+                             const DistMetaTensor& out_grad,
+                             const Scalar& axis);
+
 }  // namespace distributed
 }  // namespace phi

diff --git a/paddle/phi/infermeta/spmd_rules/scatter.cc b/paddle/phi/infermeta/spmd_rules/scatter.cc
index ae29d5f059ba0..6a31318045e16 100644
--- a/paddle/phi/infermeta/spmd_rules/scatter.cc
+++ b/paddle/phi/infermeta/spmd_rules/scatter.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
 #include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
 #include "paddle/phi/core/distributed/auto_parallel/utils.h"
+#include "paddle/phi/infermeta/spmd_rules/gather.h"
 #include "paddle/phi/infermeta/spmd_rules/spmd_rule_macro_define.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
@@ -166,5 +167,41 @@ SpmdInfo ScatterInferSpmdReverse(const DistMetaTensor& x,
           {out_dist_attr_dst}};
 }

+SpmdInfo ScatterGradInferSpmd(const DistMetaTensor& index,
+                              const DistMetaTensor& updates,
+                              const DistMetaTensor& out_grad,
+                              bool overwrite) {
+  EXTRACT_SHAPE_AND_DIST_ATTR(index);
+  EXTRACT_SHAPE_AND_DIST_ATTR(updates);
+  EXTRACT_SHAPE_AND_DIST_ATTR(out_grad);
+
+  // the batch axis of index, updates, out_grad must be replicated
+  std::vector index_dims_mapping(index_dims_mapping_src);
+  index_dims_mapping[0] = -1;
+  std::vector out_grad_dims_mapping(out_grad_dims_mapping_src);
+  out_grad_dims_mapping[0] = -1;
+
+  TensorDistAttr index_dist_attr_dst =
+      CopyTensorDistAttrForOutput(index_dist_attr_src);
+  index_dist_attr_dst.set_dims_mapping(index_dims_mapping);
+  TensorDistAttr out_grad_dist_attr_dst =
+      CopyTensorDistAttrForOutput(out_grad_dist_attr_src);
+  out_grad_dist_attr_dst.set_dims_mapping(out_grad_dims_mapping);
+
+  TensorDistAttr x_grad_dist_attr(out_grad_dist_attr_src);
+  std::vector x_dims_mapping(out_grad_dims_mapping);
+  x_grad_dist_attr.set_dims_mapping(x_dims_mapping);
+
+  DistMetaTensor out_grad_dst(out_grad.dims(), out_grad_dist_attr_dst);
+  DistMetaTensor index_dst(index.dims(), index_dist_attr_dst);
+
+  SpmdInfo spmd_info = GatherInferSpmdBase(out_grad_dst, index_dst, 0);
+  TensorDistAttr updates_grad_dist_attr =
+      PADDLE_GET_CONST(TensorDistAttr, spmd_info.second[0]);
+
+  return {{index_dist_attr_dst, updates_dist_attr_src, out_grad_dist_attr_dst},
+          {x_grad_dist_attr, updates_grad_dist_attr}};
+}
+
 }  // namespace distributed
 }  // namespace phi

diff --git a/paddle/phi/infermeta/spmd_rules/scatter.h b/paddle/phi/infermeta/spmd_rules/scatter.h
index f19bc78261fc7..f074ba998bdac 100644
--- a/paddle/phi/infermeta/spmd_rules/scatter.h
+++ b/paddle/phi/infermeta/spmd_rules/scatter.h
@@ -33,5 +33,9 @@ SpmdInfo ScatterInferSpmdReverse(const DistMetaTensor& x,
                                  const DistMetaTensor& out,
                                  bool overwrite);

+SpmdInfo ScatterGradInferSpmd(const DistMetaTensor& index,
+                              const DistMetaTensor& updates,
+                              const DistMetaTensor& out_grad,
+                              bool overwrite);
 }  // namespace distributed
 }  // namespace phi

diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc
index 49544cb508c7c..fdfe4becb62ad 100644
--- a/test/cpp/auto_parallel/spmd_rule_test.cc
+++ b/test/cpp/auto_parallel/spmd_rule_test.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/phi/common/scalar.h"
 #include "test/cpp/auto_parallel/spmd_rule_test_util.h"

 namespace paddle {
@@ -1653,6 +1654,147 @@ TEST(UnsqueezeGradInferSpmd, Ctor) {
       PADDLE_GET_CONST(TensorDistAttr, spmdinfo.second[0]).is_partial(), false);
 }

+TEST(ScatterGradInferSpmd, Ctor) {
+  std::vector index_shape = {16};
+  std::vector updates_shape = {32, 32, 48};
+  std::vector out_grad_shape = {64, 32, 48};
+
+  std::vector mesh_shape = {2, 3};
+  std::vector process_ids = {0, 1, 2, 3, 4, 5};
+  std::vector dim_names = {"x", "y"};
+  ProcessMesh process_mesh(mesh_shape, process_ids, dim_names);
+
+  TensorDistAttr index_dist_attr = TensorDistAttr();
+  index_dist_attr.set_process_mesh(process_mesh);
+  TensorDistAttr updates_dist_attr = TensorDistAttr();
+  updates_dist_attr.set_process_mesh(process_mesh);
+  TensorDistAttr out_grad_dist_attr = TensorDistAttr();
+  out_grad_dist_attr.set_process_mesh(process_mesh);
+
+  // [0], [-1, -1, 1], [0, -1, 1] -->
+  // inputs: [-1], [-1, -1, 1], [-1, -1, 1]
+  // x_grad: [-1, -1, 1], updates_grad: [-1, -1, 1]
+  index_dist_attr.set_dims_mapping({0});
+  updates_dist_attr.set_dims_mapping({-1, -1, 1});
+  out_grad_dist_attr.set_dims_mapping({0, -1, 1});
+  phi::distributed::DistMetaTensor index(phi::make_ddim(index_shape),
+                                         index_dist_attr);
+  phi::distributed::DistMetaTensor updates(phi::make_ddim(updates_shape),
+                                           updates_dist_attr);
+  phi::distributed::DistMetaTensor out_grad(phi::make_ddim(out_grad_shape),
+                                            out_grad_dist_attr);
+  auto spmdinfo = ScatterGradInferSpmd(index, updates, out_grad, false);
+  EXPECT_EQ(spmdinfo.first.size(), 3UL);
+  EXPECT_EQ(spmdinfo.second.size(), 2UL);
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), std::vector({-1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]),
+            std::vector({-1, -1, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]),
+            std::vector({-1, -1, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]),
+            std::vector({-1, -1, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.second[1]),
+            std::vector({-1, -1, 1}));
+
+  // [0], [0, -1, 1], [-1, 0, 1] -->
+  // inputs: [-1], [0, -1, 1], [-1, 0, 1]
+  // x_grad: [-1, 0, 1], updates_grad: [-1, 0, 1]
+  index_dist_attr.set_dims_mapping({0});
+  updates_dist_attr.set_dims_mapping({0, -1, 1});
+  out_grad_dist_attr.set_dims_mapping({-1, 0, 1});
+  index = phi::distributed::DistMetaTensor(phi::make_ddim(index_shape),
+                                           index_dist_attr);
+  updates = phi::distributed::DistMetaTensor(phi::make_ddim(updates_shape),
+                                             updates_dist_attr);
+  out_grad = phi::distributed::DistMetaTensor(phi::make_ddim(out_grad_shape),
+                                              out_grad_dist_attr);
+  spmdinfo = ScatterGradInferSpmd(index, updates, out_grad, false);
+  EXPECT_EQ(spmdinfo.first.size(), 3UL);
+  EXPECT_EQ(spmdinfo.second.size(), 2UL);
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]), std::vector({-1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]),
+            std::vector({0, -1, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]),
+            std::vector({-1, 0, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]),
+            std::vector({-1, 0, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.second[1]),
+            std::vector({-1, 0, 1}));
+}
+
+TEST(GatherGradInferSpmd, Ctor) {
+  std::vector x_shape = {64, 32, 48};
+  std::vector index_shape = {16};
+  std::vector out_grad_shape = {16, 32, 48};
+  phi::Scalar axis(0);
+
+  std::vector mesh_shape = {2, 3};
+  std::vector process_ids = {0, 1, 2, 3, 4, 5};
+  std::vector dim_names = {"x", "y"};
+  ProcessMesh process_mesh(mesh_shape, process_ids, dim_names);
+
+  TensorDistAttr x_dist_attr = TensorDistAttr();
+  x_dist_attr.set_process_mesh(process_mesh);
+  TensorDistAttr index_dist_attr = TensorDistAttr();
+  index_dist_attr.set_process_mesh(process_mesh);
+  TensorDistAttr out_grad_dist_attr = TensorDistAttr();
+  out_grad_dist_attr.set_process_mesh(process_mesh);
+
+  // axis = 0
+  // [0, -1, 1], [0], [0, -1, 1] -->
+  // inputs: [0, -1, 1], [-1], [-1, -1, 1]
+  // x_grad: [-1, -1, 1]
+  axis = 0;
+  x_dist_attr.set_dims_mapping({0, -1, 1});
+  index_dist_attr.set_dims_mapping({0});
+  out_grad_dist_attr.set_dims_mapping({0, -1, 1});
+  phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr);
+  phi::distributed::DistMetaTensor index(phi::make_ddim(index_shape),
+                                         index_dist_attr);
+  phi::distributed::DistMetaTensor out_grad(phi::make_ddim(out_grad_shape),
+                                            out_grad_dist_attr);
+  auto spmdinfo = GatherGradInferSpmd(x, index, out_grad, axis);
+  EXPECT_EQ(spmdinfo.first.size(), 3UL);
+  EXPECT_EQ(spmdinfo.second.size(), 1UL);
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]),
+            std::vector({0, -1, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), std::vector({-1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]),
+            std::vector({-1, -1, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]),
+            std::vector({-1, -1, 1}));
+
+  // 0-d tensor
+  // axis = 1
+  // [0, -1, 1], [-1], [0, 1] -->
+  // inputs: [0, -1, 1], [-1], [0, 1]
+  // x_grad: [0, -1, 1]
+  axis = 1;
+  index_shape = {};
+  out_grad_shape = {64, 48};
+  x_dist_attr.set_dims_mapping({0, -1, 1});
+  index_dist_attr.set_dims_mapping({-1});
+  out_grad_dist_attr.set_dims_mapping({0, 1});
+  x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr);
+  index = phi::distributed::DistMetaTensor(phi::make_ddim(index_shape),
+                                           index_dist_attr);
+  out_grad = phi::distributed::DistMetaTensor(phi::make_ddim(out_grad_shape),
+                                              out_grad_dist_attr);
+  spmdinfo = GatherGradInferSpmd(x, index, out_grad, axis);
+  EXPECT_EQ(spmdinfo.first.size(), 3UL);
+  EXPECT_EQ(spmdinfo.second.size(), 1UL);
+
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[0]),
+            std::vector({0, -1, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[1]), std::vector({-1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.first[2]), std::vector({0, 1}));
+  EXPECT_EQ(get_dims_mapping(spmdinfo.second[0]),
+            std::vector({0, -1, 1}));
+}
+
 }  // namespace auto_parallel
 }  // namespace distributed
 }  // namespace paddle
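To read the expectations in the tests above: `dims_mapping[i]` names the mesh axis that shards tensor dimension `i`, with `-1` meaning replicated. A worked example for the 2x3 mesh these tests use (illustrative arithmetic only):

    // dim0 sharded over mesh axis 0 (size 2), dim1 replicated, dim2 sharded
    // over mesh axis 1 (size 3):
    //   global = {64, 32, 48}, dims_mapping = {0, -1, 1}, mesh = {2, 3}
    //   local  = {64 / 2, 32, 48 / 3} = {32, 32, 16} per rank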
 test_unary_op_infer_sym_shape (#62530)

* fix ut
---
 test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py b/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py
index a740b47542ccf..e43d6343a94b5 100644
--- a/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py
+++ b/test/ir/pir/cinn/symbolic/test_unary_op_infer_sym_shape.py
@@ -285,8 +285,8 @@ def prepare_data(self):
             [
                 'shape[6, 6], data[NULL]',
                 'shape[7, 7], data[NULL]',
-                'shape[S0, S1, Add(0, S2), Add(0, S2)], data[NULL]',
-                'shape[Add(1, S2), Add(1, S2), S0, S1], data[NULL]',
+                'shape[S0, S1, S2, S2], data[NULL]',
+                'shape[Add(S2, 1), Add(S2, 1), S0, S1], data[NULL]',
             ]
         ]
 
From 7fd1722f21d75905951d15ffc46844fbedd86df7 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 8 Mar 2024 10:41:05 +0800
Subject: [PATCH 068/114] Fix MemEvenRecorder MemEventRecorder (#62537)

---
 paddle/fluid/platform/profiler.cc       | 124 ++++++++++++------------
 paddle/fluid/platform/profiler.h        |  12 +--
 paddle/fluid/platform/profiler_helper.h |   4 +-
 3 files changed, 70 insertions(+), 70 deletions(-)

diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 2630b36d0e8ad..b0f8f329dde4f 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -56,7 +56,7 @@ std::mutex phi::ProfilerHelper::g_all_mem_event_lists_mutex;
 namespace paddle {
 namespace platform {
 
-MemEvenRecorder MemEvenRecorder::recorder;
+MemEventRecorder MemEventRecorder::recorder;
 
 RecordInstantEvent::RecordInstantEvent(const char *name,
                                        TracerEventType type,
@@ -214,14 +214,14 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
             RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
       }
     }
-    platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
-                                                        place,
-                                                        size,
-                                                        type,
-                                                        current_allocated,
-                                                        current_reserved,
-                                                        peak_allocated,
-                                                        peak_reserved);
+    platform::MemEventRecorder::Instance().PushMemRecord(ptr,
+                                                         place,
+                                                         size,
+                                                         type,
+                                                         current_allocated,
+                                                         current_reserved,
+                                                         peak_allocated,
+                                                         peak_reserved);
   } else if (type == TracerMemEventType::ReservedAllocate) {
     uint64_t current_reserved = 0;
     uint64_t peak_reserved = 0;
@@ -297,14 +297,14 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
             RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
       }
     }
-    platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
-                                                        place,
-                                                        size,
-                                                        type,
-                                                        current_allocated,
-                                                        current_reserved,
-                                                        peak_allocated,
-                                                        peak_reserved);
+    platform::MemEventRecorder::Instance().PushMemRecord(ptr,
+                                                         place,
+                                                         size,
+                                                         type,
+                                                         current_allocated,
+                                                         current_reserved,
+                                                         peak_allocated,
+                                                         peak_reserved);
   } else if (type == TracerMemEventType::Free) {
     uint64_t current_allocated = 0;
     uint64_t peak_allocated = 0;
@@ -380,14 +380,14 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
             RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3];
      }
    }
-    platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
-                                                       place,
-                                                       size,
-                                                       type,
-                                                       current_allocated,
-                                                       current_reserved,
-                                                       peak_allocated,
-                                                       peak_reserved);
+    platform::MemEventRecorder::Instance().PopMemRecord(ptr,
+                                                        place,
+                                                        size,
+                                                        type,
+                                                        current_allocated,
+                                                        current_reserved,
+                                                        peak_allocated,
+                                                        peak_reserved);
   } else if (type == TracerMemEventType::ReservedFree) {
     uint64_t current_reserved = 0;
     uint64_t peak_reserved = 0;
@@ -463,20 +463,20 @@ RecordMemEvent::RecordMemEvent(const void *ptr,
             RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2];
       }
     }
-    platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
-                                                       place,
-                                                       size,
-                                                       type,
-                                                       current_allocated,
-                                                       current_reserved,
-                                                       peak_allocated,
-                                                       peak_reserved);
+    platform::MemEventRecorder::Instance().PopMemRecord(ptr,
+                                                        place,
+                                                        size,
+                                                        type,
+                                                        current_allocated,
+                                                        current_reserved,
+                                                        peak_allocated,
+                                                        peak_reserved);
   }
 }
 
-void MemEvenRecorder::PushMemRecord(const void *ptr,
-                                    const Place &place,
-                                    size_t size) {
+void MemEventRecorder::PushMemRecord(const void *ptr,
+                                     const Place &place,
+                                     size_t size) {
   if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) {
     return;
   }
@@ -487,17 +487,17 @@ void MemEvenRecorder::PushMemRecord(const void *ptr,
                     platform::errors::InvalidArgument(
                         "The Place can't exist in the stage of PushMemRecord"));
   events.emplace(
-      ptr, std::make_unique<MemEvenRecorder::RecordMemEvent>(place, size));
+      ptr, std::make_unique<MemEventRecorder::RecordMemEvent>(place, size));
 }
 
-void MemEvenRecorder::PushMemRecord(const void *ptr,
-                                    const Place &place,
-                                    size_t size,
-                                    TracerMemEventType type,
-                                    uint64_t current_allocated,
-                                    uint64_t current_reserved,
-                                    uint64_t peak_allocated,
-                                    uint64_t peak_reserved) {
+void MemEventRecorder::PushMemRecord(const void *ptr,
+                                     const Place &place,
+                                     size_t size,
+                                     TracerMemEventType type,
+                                     uint64_t current_allocated,
+                                     uint64_t current_reserved,
+                                     uint64_t peak_allocated,
+                                     uint64_t peak_reserved) {
   std::lock_guard<std::mutex> guard(mtx_);
   if (FLAGS_enable_host_event_recorder_hook) {  // new MemRecord
     HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
@@ -523,10 +523,10 @@ void MemEvenRecorder::PushMemRecord(const void *ptr,
                     platform::errors::InvalidArgument(
                         "The Place can't exist in the stage of PushMemRecord"));
   events.emplace(
-      ptr, std::make_unique<MemEvenRecorder::RecordMemEvent>(place, size));
+      ptr, std::make_unique<MemEventRecorder::RecordMemEvent>(place, size));
 }
 
-void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
+void MemEventRecorder::PopMemRecord(const void *ptr, const Place &place) {
   if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) {
     return;
   }
@@ -539,14 +539,14 @@ void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
   }
 }
 
-void MemEvenRecorder::PopMemRecord(const void *ptr,
-                                   const Place &place,
-                                   size_t size,
-                                   TracerMemEventType type,
-                                   uint64_t current_allocated,
-                                   uint64_t current_reserved,
-                                   uint64_t peak_allocated,
-                                   uint64_t peak_reserved) {
+void MemEventRecorder::PopMemRecord(const void *ptr,
+                                    const Place &place,
+                                    size_t size,
+                                    TracerMemEventType type,
+                                    uint64_t current_allocated,
+                                    uint64_t current_reserved,
+                                    uint64_t peak_allocated,
+                                    uint64_t peak_reserved) {
   std::lock_guard<std::mutex> guard(mtx_);
   if (FLAGS_enable_host_event_recorder_hook) {  // new MemRecord
     HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
@@ -574,13 +574,13 @@ void MemEvenRecorder::PopMemRecord(const void *ptr,
   }
 }
 
-void MemEvenRecorder::Flush() {
+void MemEventRecorder::Flush() {
   std::lock_guard<std::mutex> guard(mtx_);
   address_memevent_.clear();
 }
 
-MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
-                                                size_t bytes)
+MemEventRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
+                                                 size_t bytes)
     : place_(place),
       bytes_(bytes),
      start_ns_(PosixInNsec()),
@@ -588,7 +588,7 @@ MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
   PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
 }
 
-MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {  // NOLINT
+MemEventRecorder::RecordMemEvent::~RecordMemEvent() {  // NOLINT
   phi::DeviceTracer *tracer = phi::GetDeviceTracer();
   end_ns_ = PosixInNsec();
 
@@ -701,7 +701,7 @@ void EnableProfiler(ProfilerState state) {
 
 void ResetProfiler() {
   SynchronizeAllDevice();
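   // Flush() (defined just above) simply clears address_memevent_, so memory
   // records buffered during the previous profiling session are discarded
   // before the global event lists are reset.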
   phi::GetDeviceTracer()->Reset();
-  MemEvenRecorder::Instance().Flush();
+  MemEventRecorder::Instance().Flush();
   std::lock_guard<std::mutex> guard(
       phi::ProfilerHelper::g_all_event_lists_mutex);
   for (auto &all_event_list : phi::ProfilerHelper::g_all_event_lists) {
@@ -720,7 +720,7 @@ void DisableProfiler(EventSortingKey sorted_key,
                      const std::string &profile_path) {
   SynchronizeAllDevice();
   auto thr_events = DockHostEventRecorderHostPart();
-  MemEvenRecorder::Instance().Flush();
+  MemEventRecorder::Instance().Flush();
 
   std::lock_guard<std::mutex> l(profiler_mu);
   if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) return;
@@ -755,7 +755,7 @@ void CompleteProfilerEvents(phi::proto::Profile *tracer_profile,
                             std::vector<std::vector<MemEvent>> *mem_events) {
   SynchronizeAllDevice();
   auto thr_events = DockHostEventRecorderHostPart();
-  MemEvenRecorder::Instance().Flush();
+  MemEventRecorder::Instance().Flush();
 
   std::lock_guard<std::mutex> l(profiler_mu);
   if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 4d6bc9cc242d4..27c2bc8f77f7d 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -69,7 +69,7 @@ enum class EventSortingKey {
   kGPUTime
 };
 
-struct MemoryProfierReport {
+struct MemoryProfilerReport {
   size_t alloc_times{0};
   size_t alloc_size{0};
   size_t free_times{0};
@@ -101,7 +101,7 @@ struct OverHead {
   std::vector<EventItem> sub_memcpy_items;
 };
 
-struct MemEvenRecorder {
+struct MemEventRecorder {
  public:
   void PushMemRecord(const void* ptr, const Place& place, size_t size);
   void PopMemRecord(const void* ptr, const Place& place);
@@ -122,7 +122,7 @@ struct MemEvenRecorder {
                     uint64_t peak_allocated,
                     uint64_t peak_reserved);
   void Flush();
-  static MemEvenRecorder& Instance() { return recorder; }
+  static MemEventRecorder& Instance() { return recorder; }
 
  private:
   struct RecordMemEvent {
@@ -137,13 +137,13 @@ struct MemEvenRecorder {
     std::string free_in_;
   };
 
-  static MemEvenRecorder recorder;
+  static MemEventRecorder recorder;
   std::map<Place,
            std::unordered_map<const void*, std::unique_ptr<RecordMemEvent>>>
       address_memevent_;
   std::mutex mtx_;
-  MemEvenRecorder() {}
-  DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
+  MemEventRecorder() {}
+  DISABLE_COPY_AND_ASSIGN(MemEventRecorder);
 };
 
 struct RecordBlock {
diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h
index f79b801f1a095..634d670c575bb 100644
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -132,7 +132,7 @@ static double ToMegaBytes(size_t bytes) {
 
 // Print results
 void PrintMemProfiler(
-    const std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
+    const std::map<Place, std::unordered_map<std::string, MemoryProfilerReport>>
         &annotation_report,
     const size_t name_width,
     const size_t data_width) {
@@ -200,7 +200,7 @@ void PrintMemProfiler(
 void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
   if (phi::ProfilerHelper::g_state == ProfilerState::kDisabled) return;
   // place, annotation, alloc times, alloc size
-  std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
+  std::map<Place, std::unordered_map<std::string, MemoryProfilerReport>>
      annotation_report;
 
   for (auto &tmp : events) {
From 536a85ece8ccbacdafe452c0b6ce01c0e5ab7234 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 8 Mar 2024 10:41:49 +0800
Subject: [PATCH 069/114] Fix DECLEAR_ DECLARE_ (#62514)

---
 paddle/phi/kernels/logical_kernel.h           | 10 ++--
 paddle/phi/kernels/xpu/bmm_grad_kernel.cc     | 10 ++--
 paddle/phi/kernels/xpu/bmm_kernel.cc          | 10 ++--
 paddle/phi/kernels/xpu/bmm_xpu_utils.h        |  6 +--
 paddle/phi/kernels/xpu/conv_grad_kernel.cc    | 16 +++---
 paddle/phi/kernels/xpu/conv_kernel.cc         | 16 +++---
 .../kernels/xpu/conv_transpose_grad_kernel.cc |  6 +--
 .../phi/kernels/xpu/conv_transpose_kernel.cc  |  8 +--
 paddle/phi/kernels/xpu/xpu_api_wrapper.h      | 50 +++++++++----------
 9 files changed, 66 insertions(+), 66 deletions(-)

diff --git a/paddle/phi/kernels/logical_kernel.h b/paddle/phi/kernels/logical_kernel.h
index 3ccc03a5b598a..69214ef1d4532 100644
--- a/paddle/phi/kernels/logical_kernel.h
+++ b/paddle/phi/kernels/logical_kernel.h
@@ -18,17 +18,17 @@ limitations under the License. */
 
 namespace phi {
 
-#define DECLEAR_LOGICAL_BINARY_KERNEL(type)           \
+#define DECLARE_LOGICAL_BINARY_KERNEL(type)           \
   template <typename T, typename Context>             \
   void Logical##type##Kernel(const Context& dev_ctx,  \
                              const DenseTensor& x,    \
                              const DenseTensor& y,    \
                              DenseTensor* out);
 
-DECLEAR_LOGICAL_BINARY_KERNEL(And)
-DECLEAR_LOGICAL_BINARY_KERNEL(Or)
-DECLEAR_LOGICAL_BINARY_KERNEL(Xor)
-#undef DECLEAR_LOGICAL_BINARY_KERNEL
+DECLARE_LOGICAL_BINARY_KERNEL(And)
+DECLARE_LOGICAL_BINARY_KERNEL(Or)
+DECLARE_LOGICAL_BINARY_KERNEL(Xor)
+#undef DECLARE_LOGICAL_BINARY_KERNEL
 
 template <typename T, typename Context>
 void LogicalNotKernel(const Context& dev_ctx,
diff --git a/paddle/phi/kernels/xpu/bmm_grad_kernel.cc b/paddle/phi/kernels/xpu/bmm_grad_kernel.cc
index 751608552482c..e2fdbb610d2a2 100644
--- a/paddle/phi/kernels/xpu/bmm_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/bmm_grad_kernel.cc
@@ -28,14 +28,14 @@ void MatMul(const Context& dev_ctx,
   using XPUType = typename XPUTypeTrait<T>::Type;
   dev_ctx.template Alloc<T>(out);
   xpu::Context* xpu_ctx = dev_ctx.x_context();
-  int fccal_type = FCCalcType<XPUType>();
-  if (fccal_type == XPUFCCalcType::FC_INT32) {
+  int fc_calc_type = FCCalcType<XPUType>();
+  if (fc_calc_type == XPUFCCalcType::FC_INT32) {
     MatMulXPUFunction<T, int32_t>(a, b, out, trans_a, trans_b, xpu_ctx);
-  } else if (fccal_type == XPUFCCalcType::FC_FLOAT) {
+  } else if (fc_calc_type == XPUFCCalcType::FC_FLOAT) {
     MatMulXPUFunction<T, float>(a, b, out, trans_a, trans_b, xpu_ctx);
-  } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) {
+  } else if (fc_calc_type == XPUFCCalcType::FC_INT32_WITH_LL) {
     MatMulXPUFunction<T, int_with_ll_t>(a, b, out, trans_a, trans_b, xpu_ctx);
-  } else if (fccal_type == XPUFCCalcType::FC_FLOAT16) {
+  } else if (fc_calc_type == XPUFCCalcType::FC_FLOAT16) {
     MatMulXPUFunction<T, XPUTypeFP16>(a, b, out, trans_a, trans_b, xpu_ctx);
   } else {
     MatMulXPUFunction<T, int16_t>(a, b, out, trans_a, trans_b, xpu_ctx);
diff --git a/paddle/phi/kernels/xpu/bmm_kernel.cc b/paddle/phi/kernels/xpu/bmm_kernel.cc
index 160fabe1ec750..3ce7d6578dfad 100644
--- a/paddle/phi/kernels/xpu/bmm_kernel.cc
+++ b/paddle/phi/kernels/xpu/bmm_kernel.cc
@@ -63,14 +63,14 @@ void BmmKernel(const Context& dev_ctx,
                         y_dims[1]));
 
   xpu::Context* xpu_ctx = dev_ctx.x_context();
-  int fccal_type = FCCalcType<XPUType>();
-  if (fccal_type == XPUFCCalcType::FC_INT32) {
+  int fc_calc_type = FCCalcType<XPUType>();
+  if (fc_calc_type == XPUFCCalcType::FC_INT32) {
     MatMulXPUFunction<T, int32_t>(x, y, out, trans_x, trans_y, xpu_ctx);
-  } else if (fccal_type == XPUFCCalcType::FC_FLOAT) {
+  } else if (fc_calc_type == XPUFCCalcType::FC_FLOAT) {
     MatMulXPUFunction<T, float>(x, y, out, trans_x, trans_y, xpu_ctx);
-  } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) {
+  } else if (fc_calc_type == XPUFCCalcType::FC_INT32_WITH_LL) {
     MatMulXPUFunction<T, int_with_ll_t>(x, y, out, trans_x, trans_y, xpu_ctx);
-  } else if (fccal_type == XPUFCCalcType::FC_FLOAT16) {
+  } else if (fc_calc_type == XPUFCCalcType::FC_FLOAT16) {
     MatMulXPUFunction<T, XPUTypeFP16>(x, y, out, trans_x, trans_y, xpu_ctx);
   } else {
     MatMulXPUFunction<T, int16_t>(x, y, out, trans_x, trans_y, xpu_ctx);
diff --git a/paddle/phi/kernels/xpu/bmm_xpu_utils.h b/paddle/phi/kernels/xpu/bmm_xpu_utils.h
index 90d5b51973957..c7c6bfe2bed64 100644
---
a/paddle/phi/kernels/xpu/bmm_xpu_utils.h +++ b/paddle/phi/kernels/xpu/bmm_xpu_utils.h @@ -40,7 +40,7 @@ static void MatMulXPUFunction(const DenseTensor& x, int k = mat_dim_a.width_; int batch_size = mat_dim_a.batch_size_; // batch matmul - int fccal_type = FCCalcType(); + int fc_calc_type = FCCalcType(); decltype(&xblas_fc_batch_wrapper) xblas_fc_batch_api_list[6] = { &xblas_fc_batch_wrapper, @@ -51,8 +51,8 @@ static void MatMulXPUFunction(const DenseTensor& x, &xblas_fc_batch_wrapper, }; - auto xblas_fc_batch_api = xblas_fc_batch_api_list[fccal_type]; - if (fccal_type == XPUFCCalcType::FC_FLOAT16 && + auto xblas_fc_batch_api = xblas_fc_batch_api_list[fc_calc_type]; + if (fc_calc_type == XPUFCCalcType::FC_FLOAT16 && std::getenv("XPU_PADDLE_FC_FLOAT16") != nullptr) { xblas_fc_batch_api = &xblas_fc_batch_wrapper; diff --git a/paddle/phi/kernels/xpu/conv_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_grad_kernel.cc index 356f77a850b43..cf5162a71e108 100644 --- a/paddle/phi/kernels/xpu/conv_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_grad_kernel.cc @@ -107,8 +107,8 @@ void ConvGradKernel(const Context& dev_ctx, filter_grad_data_ptr = filter_grad_data_tmp; } } - int fccal_type = FCCalcType(); - if (fccal_type == XPUFCCalcType::FC_INT32) { + int fc_calc_type = FCCalcType(); + if (fc_calc_type == XPUFCCalcType::FC_INT32) { int r = xpu::conv2d_grad(dev_ctx.x_context(), input_data, @@ -134,7 +134,7 @@ void ConvGradKernel(const Context& dev_ctx, is_nchw); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_grad"); - } else if (fccal_type == XPUFCCalcType::FC_FLOAT) { + } else if (fc_calc_type == XPUFCCalcType::FC_FLOAT) { int r = xpu::conv2d_grad(dev_ctx.x_context(), input_data, @@ -160,7 +160,7 @@ void ConvGradKernel(const Context& dev_ctx, is_nchw); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_grad"); - } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) { + } else if (fc_calc_type == XPUFCCalcType::FC_INT32_WITH_LL) { int r = xpu::conv2d_grad( dev_ctx.x_context(), input_data, @@ -334,8 +334,8 @@ void Conv3DGradKernel(const Context& dev_ctx, filter_grad_data_ptr = filter_grad_data_tmp; } } - int fccal_type = FCCalcType(); - if (fccal_type == XPUFCCalcType::FC_INT32) { + int fc_calc_type = FCCalcType(); + if (fc_calc_type == XPUFCCalcType::FC_INT32) { int r = xpu::conv3d_grad(dev_ctx.x_context(), input_data, @@ -361,7 +361,7 @@ void Conv3DGradKernel(const Context& dev_ctx, nullptr, is_ncdhw); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv3d_grad"); - } else if (fccal_type == XPUFCCalcType::FC_FLOAT) { + } else if (fc_calc_type == XPUFCCalcType::FC_FLOAT) { int r = xpu::conv3d_grad(dev_ctx.x_context(), input_data, @@ -387,7 +387,7 @@ void Conv3DGradKernel(const Context& dev_ctx, nullptr, is_ncdhw); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv3d_grad"); - } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) { + } else if (fc_calc_type == XPUFCCalcType::FC_INT32_WITH_LL) { int r = xpu::conv3d_grad( dev_ctx.x_context(), input_data, diff --git a/paddle/phi/kernels/xpu/conv_kernel.cc b/paddle/phi/kernels/xpu/conv_kernel.cc index 02e4bbcae1180..c0cfe2db83034 100644 --- a/paddle/phi/kernels/xpu/conv_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_kernel.cc @@ -89,8 +89,8 @@ void ConvKernel(const Context& dev_ctx, filter_data_ptr = reinterpret_cast(filter_data_tmp); } - int fccal_type = FCCalcType(); - if (fccal_type == XPUFCCalcType::FC_INT32) { + int fc_calc_type = FCCalcType(); + if (fc_calc_type == XPUFCCalcType::FC_INT32) { int r = xpu::conv2d(dev_ctx.x_context(), input_data, filter_data_ptr, @@ -110,7 +110,7 @@ void 
ConvKernel(const Context& dev_ctx, nullptr, is_nchw); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d"); - } else if (fccal_type == XPUFCCalcType::FC_FLOAT) { + } else if (fc_calc_type == XPUFCCalcType::FC_FLOAT) { int r = xpu::conv2d(dev_ctx.x_context(), input_data, filter_data_ptr, @@ -130,7 +130,7 @@ void ConvKernel(const Context& dev_ctx, nullptr, is_nchw); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d"); - } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) { + } else if (fc_calc_type == XPUFCCalcType::FC_INT32_WITH_LL) { int r = xpu::conv2d( dev_ctx.x_context(), input_data, @@ -261,8 +261,8 @@ void Conv3DKernel(const Context& dev_ctx, filter_data_ptr = reinterpret_cast(filter_data_tmp); } - int fccal_type = FCCalcType(); - if (fccal_type == XPUFCCalcType::FC_INT32) { + int fc_calc_type = FCCalcType(); + if (fc_calc_type == XPUFCCalcType::FC_INT32) { int r = xpu::conv3d(dev_ctx.x_context(), input_data, filter_data_ptr, @@ -283,7 +283,7 @@ void Conv3DKernel(const Context& dev_ctx, nullptr, is_ncdhw); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv3d"); - } else if (fccal_type == XPUFCCalcType::FC_FLOAT) { + } else if (fc_calc_type == XPUFCCalcType::FC_FLOAT) { int r = xpu::conv3d(dev_ctx.x_context(), input_data, filter_data_ptr, @@ -305,7 +305,7 @@ void Conv3DKernel(const Context& dev_ctx, is_ncdhw); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv3d"); - } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) { + } else if (fc_calc_type == XPUFCCalcType::FC_INT32_WITH_LL) { int r = xpu::conv3d( dev_ctx.x_context(), input_data, diff --git a/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc index 296e02c28016d..5c911475af25f 100644 --- a/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_transpose_grad_kernel.cc @@ -69,9 +69,9 @@ void Conv2dTransposeGradKernel(const Context& ctx, if (dfilter) { ctx.template Alloc(dfilter); } - int fccal_type = FCCalcType(); - if (fccal_type == XPUFCCalcType::FC_INT32 || - fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) { + int fc_calc_type = FCCalcType(); + if (fc_calc_type == XPUFCCalcType::FC_INT32 || + fc_calc_type == XPUFCCalcType::FC_INT32_WITH_LL) { // xpu api do not support int31 quantization now. 
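   // FCCalcType picks the GEMM accumulation mode for the element type; since
   // the XPU library exposes no int31 kernels (per the comment above), the
   // FC_INT32 and FC_INT32_WITH_LL modes deliberately share this single
   // conv2d_transpose_grad path instead of getting separate branches.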
int r = xpu::conv2d_transpose_grad( ctx.x_context(), diff --git a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc index 8dafe67056b50..d6685c998acec 100644 --- a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc +++ b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc @@ -76,8 +76,8 @@ void Conv2dTransposeKernel(const Context& ctx, const int img_xh = static_cast(out->dims()[2]); const int img_xw = static_cast(out->dims()[3]); - int fccal_type = FCCalcType(); - if (fccal_type == XPUFCCalcType::FC_INT32) { + int fc_calc_type = FCCalcType(); + if (fc_calc_type == XPUFCCalcType::FC_INT32) { int r = xpu::conv2d_transpose_v2( ctx.x_context(), x.data(), @@ -98,7 +98,7 @@ void Conv2dTransposeKernel(const Context& ctx, nullptr, true); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose_v2"); - } else if (fccal_type == XPUFCCalcType::FC_FLOAT) { + } else if (fc_calc_type == XPUFCCalcType::FC_FLOAT) { int r = xpu::conv2d_transpose_v2( ctx.x_context(), x.data(), @@ -119,7 +119,7 @@ void Conv2dTransposeKernel(const Context& ctx, nullptr, true); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose_v2"); - } else if (fccal_type == XPUFCCalcType::FC_INT32_WITH_LL) { + } else if (fc_calc_type == XPUFCCalcType::FC_INT32_WITH_LL) { if (output_size.size()) { VLOG(4) << "int_with_ll quantization is not supported when output_size " "is specified, " diff --git a/paddle/phi/kernels/xpu/xpu_api_wrapper.h b/paddle/phi/kernels/xpu/xpu_api_wrapper.h index aa64a15ba8527..c6560622eaaf6 100644 --- a/paddle/phi/kernels/xpu/xpu_api_wrapper.h +++ b/paddle/phi/kernels/xpu/xpu_api_wrapper.h @@ -311,7 +311,7 @@ static void xblas_fc_wrapper(xpu::Context* ctx, } } -#define DECLEAR_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUType, FCT) \ +#define DECLARE_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUType, FCT) \ template <> \ void xblas_fc_wrapper(xpu::Context * ctx, \ const XPUType* x, \ @@ -340,12 +340,12 @@ static void xblas_fc_wrapper(xpu::Context* ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "xblas_fc_wrapper"); \ } -DECLEAR_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeBF16, int_with_ll_t) -DECLEAR_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeBF16, int16_t) -DECLEAR_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeBF16, int32_t) -DECLEAR_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeBF16, XPUTypeFP16) -DECLEAR_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeFP16, int32_t) -DECLEAR_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeFP16, tfloat32) +DECLARE_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeBF16, int_with_ll_t) +DECLARE_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeBF16, int16_t) +DECLARE_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeBF16, int32_t) +DECLARE_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeBF16, XPUTypeFP16) +DECLARE_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeFP16, int32_t) +DECLARE_UNSUPPORTED_XBLAS_FC_WRAPPER(XPUTypeFP16, tfloat32) template static void xblas_fc_batch_wrapper(xpu::Context* xpu_ctx, @@ -386,7 +386,7 @@ static void xblas_fc_batch_wrapper(xpu::Context* xpu_ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "xblas_fc_batch_wrapper"); } -#define DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUType, FCT, TGEMM_OUT) \ +#define DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUType, FCT, TGEMM_OUT) \ template <> \ void xblas_fc_batch_wrapper( \ xpu::Context * xpu_ctx, \ @@ -410,23 +410,23 @@ static void xblas_fc_batch_wrapper(xpu::Context* xpu_ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "xblas_fc_batched"); \ } -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int_with_ll_t, XPUTypeFP16) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeFP16, tfloat32, XPUTypeFP16) 
-DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, float, XPUTypeFP16) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeFP16, tfloat32, XPUTypeFP16) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, float, XPUTypeFP16) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, XPUTypeFP16, XPUTypeFP16) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int32_t, XPUTypeFP16) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int16_t, XPUTypeFP16) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeFP16, int32_t, XPUTypeFP16) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int_with_ll_t, float) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, XPUTypeFP16, float) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeFP16, tfloat32, float) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int32_t, float) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int16_t, float) -DECLEAR_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeFP16, int32_t, float) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int32_t, XPUTypeFP16) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int16_t, XPUTypeFP16) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeFP16, int32_t, XPUTypeFP16) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int_with_ll_t, float) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, XPUTypeFP16, float) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeFP16, tfloat32, float) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int32_t, float) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeBF16, int16_t, float) +DECLARE_UNSUPPORTED_XBLAS_FC_BATCH_WRAPPER(XPUTypeFP16, int32_t, float) template static void MatMulXPUFunction( @@ -439,7 +439,7 @@ static void MatMulXPUFunction( bool is_grad = false, xpu::Activation_t act = xpu::Activation_t::LINEAR) { using XPUType = typename XPUTypeTrait::Type; - int fccal_type = FCCalcType(); + int fc_calc_type = FCCalcType(); decltype(&xblas_fc_wrapper) xblas_fc_api_list[6] = { &xblas_fc_wrapper, @@ -460,16 +460,16 @@ static void MatMulXPUFunction( &xblas_fc_batch_wrapper, }; - auto xblas_fc_api = xblas_fc_api_list[fccal_type]; + auto xblas_fc_api = xblas_fc_api_list[fc_calc_type]; if (std::getenv("XPU_PADDLE_FC_GRAD_LOCAL") != nullptr) { if (is_grad) { xblas_fc_api = xblas_fc_api_list[2]; } } - auto xblas_fc_batch_api = xblas_fc_batch_api_list[fccal_type]; + auto xblas_fc_batch_api = xblas_fc_batch_api_list[fc_calc_type]; - if (fccal_type == XPUFCCalcType::FC_FLOAT16 && + if (fc_calc_type == XPUFCCalcType::FC_FLOAT16 && std::getenv("XPU_PADDLE_FC_FLOAT16") != nullptr) { xblas_fc_batch_api = &xblas_fc_batch_wrapper; From f2d1f4d35e58ff8d1157fdc35c82aa9d0d59e075 Mon Sep 17 00:00:00 2001 From: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Date: Fri, 8 Mar 2024 10:42:36 +0800 Subject: [PATCH 070/114] [PIR][DynamicShape] Fix bug in InferSymbolicShape ElementWiseBinary (#62455) * Fix bug in InferSymbolicShape ElementWiseBinary * fix bug in fuse pass * optimize error message * fix typo * fix more bugs --- ...e_shape_ops_into_generate_shape_op_pass.cc | 9 +- .../infer_sym_element_wise_binary.cc | 16 +++- .../infer_sym_element_wise_binary.h | 2 + .../infer_symbolic_shape/infer_sym_utils.h | 16 ++++ .../paddle_op_infer_sym.cc | 21 ++++- .../same_operands_and_result.cc | 9 +- .../same_operands_and_result.h | 2 - .../infer_symbolic_shape/unary_infer_sym.cc | 7 +- .../pir/transforms/shape_optimization_pass.cc | 83 
++++++++++++++++--- 9 files changed, 134 insertions(+), 31 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc index 064035b8b3b19..0b0d4b4de9ebc 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc @@ -21,6 +21,7 @@ #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" #include "paddle/cinn/hlir/framework/pir/utils.h" #include "paddle/common/ddim.h" +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h" #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" @@ -107,8 +108,12 @@ bool MakeGenerateShapeOpAttribute( std::vector* output_dim_expr_attrs, GenerateShapeOp::SymbolBindings* symbol_bindings) { const auto& shape_or_data_dim_exprs = ShapeOrDataDimExprs4Value(output_shape); - CHECK(shape_or_data_dim_exprs.data().has_value()); - const auto& out_dim_exprs = shape_or_data_dim_exprs.data().value(); + ExprVec data_vec = + paddle::dialect::details::GetExprVecFromData(shape_or_data_dim_exprs); + // CHECK(shape_or_data_dim_exprs.data().has_value()); + CHECK(data_vec.size()); + // const auto& out_dim_exprs = shape_or_data_dim_exprs.data().value(); + const auto& out_dim_exprs = data_vec; return MakeGenerateShapeOpAttribute(ir_context, ShapeOrDataDimExprs4Value, out_dim_exprs, diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc index da8b68aefe206..f154cd8ddb5b4 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc @@ -23,7 +23,9 @@ bool InferSymbolicShapeElementWiseBinary( // For ElementWiseBinary ops, if the input tensor is from full op, the value // of fullop is useless, only the shape need doing broadcast bool x_from_fullop = - op->operand_source(0).defining_op()->isa(); + op->operand_source(0).defining_op() + ? op->operand_source(0).defining_op()->isa() + : false; if (!x_from_fullop && x_shapeordata.data().has_value()) { shape_0 = x_shapeordata.data().value(); } else { @@ -34,7 +36,9 @@ bool InferSymbolicShapeElementWiseBinary( shape_analysis->GetShapeOrDataForValue(op->operand_source(1)); std::vector shape_1; bool y_from_fullop = - op->operand_source(1).defining_op()->isa(); + op->operand_source(1).defining_op() + ? 
op->operand_source(1).defining_op()->isa() + : false; if (!y_from_fullop && y_shapeordata.data().has_value()) { shape_1 = y_shapeordata.data().value(); } else { @@ -224,4 +228,12 @@ bool Remainder_OpInferSymbolicShape( return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } +bool SubtractOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool Subtract_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} } // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h index 65fa20c8e63e7..aaa6ebf1d5836 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h @@ -53,5 +53,7 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(NotEqual) OP_DECLARE_INFER_SYMBOLIC_SHAPE(NotEqual_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Remainder) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Remainder_) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Subtract) +OP_DECLARE_INFER_SYMBOLIC_SHAPE(Subtract_) } // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h index 8c13e38b54de3..2085790abd0cb 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h @@ -75,6 +75,22 @@ std::vector GetVectorAttr(const ::pir::Operation *op, return vec_res; } +inline ExprVec GetExprVecFromData(const ShapeOrData &shapeordata) { + if (shapeordata.isa()) { + ExprVec result; + TensorListExprs list = + shapeordata.dyn_cast(); + for (size_t i = 0; i < list.size(); i++) { + for (auto expr : list[i].data().value()) { + result.emplace_back(expr); + } + } + return result; + } else { + return shapeordata.data().value(); + } +} + std::optional> VecExpr2Int64(const ExprVec &expr_vec); ExprVec VecInt642Expr(const std::vector &int_vec); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index eaa25c5d73dde..4d3f0222de40c 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -289,6 +289,21 @@ bool ConcatOpInferSymbolicShape( axis = axis >= 0 ? 
axis : std::max(int64_t(0), int64_t(axis + rank)); if (shape_data_list[0].data().has_value()) { + if (rank == 1) { + ExprVec data = details::GetExprVecFromData( + shape_analysis->GetShapeOrDataForValue(operand_source)); + const std::vector shape{std::int64_t(data.size())}; + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(shape, data)}; + pir::Value res = op->result(0); + shape_analysis->SetShapeOrDataForValue(res, shape_data); + + return true; + } else { + PADDLE_THROW(phi::errors::Unimplemented( + op->name() + + " 's InferSymbolicShape can NOT deal with rank > 1 now.")); + } std::vector data; data.reserve(shape_data_list.size()); for (auto &data_elem : shape_data_list) { @@ -436,9 +451,9 @@ bool SqueezeOpInferSymbolicShape( if (in_dims_sym[current] == 1) { should_squeeze[current] = true; } else if (!in_dims_sym[current].Has()) { - PADDLE_THROW( - phi::errors::Unimplemented("SqueezeOpInferSymbolicShape CAN NOT " - "deal with symbol in axis now")); + should_squeeze[current] = true; + } else { + should_squeeze[current] = true; } } } diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc index f6d45dad1956a..3bcfa99611568 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc @@ -379,14 +379,7 @@ bool Sinh_OpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return SameOperandsAndResultShape(op, shape_analysis); } -bool SubtractOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return SameOperandsAndResultShape(op, shape_analysis); -} -bool Subtract_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return SameOperandsAndResultShape(op, shape_analysis); -} + bool TanOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return SameOperandsAndResultShape(op, shape_analysis); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h index 6afe08d753a55..9e906f6b17ad2 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h @@ -105,8 +105,6 @@ OP_DECLARE_INFER_SYMBOLIC_SHAPE(Sin) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Sin_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Sinh) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Sinh_) -OP_DECLARE_INFER_SYMBOLIC_SHAPE(Subtract) -OP_DECLARE_INFER_SYMBOLIC_SHAPE(Subtract_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Tan) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Tan_) OP_DECLARE_INFER_SYMBOLIC_SHAPE(Tanh) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc index 42067e28e310a..6d0fd014d62e7 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/unary_infer_sym.cc @@ -165,6 +165,7 @@ bool Cumsum_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return 
CumsumOpInferSymbolicShape(op, shape_analysis); } + bool DiagEmbedOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { pir::Value operand_source = op->operand_source(0); @@ -280,6 +281,7 @@ bool KthvalueOpInferSymbolicShape( shape_analysis->SetShapeOrDataForValue(op->result(1), shape_data); return true; } + bool ReshapeOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { pir::Value operand_source = op->operand_source(0); @@ -329,10 +331,11 @@ bool ReshapeOpInferSymbolicShape( const auto &numel = GetProduct(original_shape, [](const auto &) { return true; }); + ExprVec target_shape = details::GetExprVecFromData(operand_shape_or_data); const auto &product_exclude_minus_one = - GetProduct(operand_shape_or_data.data().value(), IsNotMinusOne); + GetProduct(target_shape, IsNotMinusOne); - const auto &input_dims = operand_shape_or_data.data().value(); + const auto &input_dims = target_shape; std::vector out_dims; out_dims.reserve(input_dims.size()); diff --git a/paddle/fluid/pir/transforms/shape_optimization_pass.cc b/paddle/fluid/pir/transforms/shape_optimization_pass.cc index 374655da35ef4..b7b04ff663133 100644 --- a/paddle/fluid/pir/transforms/shape_optimization_pass.cc +++ b/paddle/fluid/pir/transforms/shape_optimization_pass.cc @@ -16,6 +16,7 @@ #include "paddle/common/flags.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/include/core/dialect.h" +#include "paddle/pir/include/core/ir_printer.h" #include "paddle/pir/include/dialect/shape/ir/shape_attribute.h" #include "paddle/pir/include/dialect/shape/ir/shape_dialect.h" #include "paddle/pir/include/pass/pass_manager.h" @@ -31,22 +32,79 @@ namespace { using PassPipelineRunner = std::function; -void PrintProgram(pir::ModuleOp m, std::string mgs) { +void PrintProgram(pir::ModuleOp m, std::string msg) { ShapeConstraintIRAnalysis& shape_analysis = ShapeAnalysisManager::Instance().Get(m.program()); - VLOG(vlog_level) << "===================== " << mgs - << " =====================\n" - << pir::CustomPrintHelper(*m.program(), - shape_analysis.PrintHook()); + if (VLOG_IS_ON(vlog_level)) { + std::cerr << "===================== [ShapeDialect]" << msg + << " =====================\n" + << pir::CustomPrintHelper(*m.program(), + shape_analysis.PrintHook()) + << std::endl; + } +} + +std::string PrintOperationWithNoRegion(Operation* op) { + std::ostringstream os; + pir::IrPrinter printer(os); + + // print OpResults + os << "("; + auto num_op_result = op->num_results(); + for (size_t idx = 0; idx < num_op_result; idx++) { + os << "%op_" << op->id() << "_" << idx; + if (idx < num_op_result - 1) os << ", "; + } + os << ")"; + + os << " ="; + + // print OpName & OpId + os << " \"" << op->name() << "(op_" << op->id() << ")" + << "\""; + + // print OpOperands + os << " ("; + auto num_op_operands = op->num_operands(); + for (size_t idx = 0; idx < num_op_operands; idx++) { + const pir::Value& input = op->operand_source(idx); + if (input.defining_op()) { + os << "op_" << input.defining_op()->id() << "_" + << input.dyn_cast().index(); + } else { + os << "op_NULL"; + } + if (idx < num_op_operands - 1) os << ", "; + } + os << ")"; + + printer.PrintAttributeMap(op); + os << " :"; + + // PrintOpSignature + printer.PrintOperandsType(op); + os << " -> "; + + printer.PrintOpReturnType(op); + + return os.str(); +} + +void PrintOpInfo(pir::Operation* op) { + if (VLOG_IS_ON(vlog_level)) { + VLOG(vlog_level) << op->name() << "(op_id: op_" << op->id() + << ", num_results=" 
<< op->num_results() << ")" + << " has InferSymbolicShapeInterface.\n\t" + << PrintOperationWithNoRegion(op); + } } void DebugPrintOpInfo( pir::Operation* op, pir::ShapeConstraintIRAnalysis* shape_analysis = nullptr) { + std::ostringstream print_stream; for (auto& res : op->results()) { - std::ostringstream print_stream; - - print_stream << " result(" << res.dyn_cast().index() << ") " + print_stream << "\tresult(" << res.dyn_cast().index() << ") " << "ShapeOrData: {"; if (shape_analysis != nullptr) { @@ -78,8 +136,10 @@ void DebugPrintOpInfo( print_stream << "]"; } - print_stream << " }"; - VLOG(vlog_level) << print_stream.str(); + print_stream << " }\n"; + } + if (VLOG_IS_ON(vlog_level)) { + std::cerr << print_stream.str(); } } @@ -131,8 +191,7 @@ void InferSymExprForBlock(const Block& block, auto infer_symbolic_shape_interface = op.dyn_cast(); if (infer_symbolic_shape_interface) { - VLOG(vlog_level) << op.name() << "(op_id: op_" << op.id() << ")" - << " has InferSymbolicShapeInterface."; + PrintOpInfo(&op); PADDLE_ENFORCE_EQ( infer_symbolic_shape_interface.InferSymbolicShape(shape_analysis), true, From 06f1abf8be0c210ef082a273c41931bdec4aa0e8 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 8 Mar 2024 10:46:28 +0800 Subject: [PATCH 071/114] [CINN] Fix some bug of cinn (#62540) * [PIR] Filter out attribute `op_callstack` when print program * fix some bug of cinn * polish code --------- Co-authored-by: SigureMo --- paddle/cinn/hlir/framework/pir/group.cc | 1 - test/ir/pir/cinn/inference/test_llama_while.py | 7 +++---- .../cinn/symbolic/test_cinn_broadcast_symbolic.py | 13 +++++++++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/paddle/cinn/hlir/framework/pir/group.cc b/paddle/cinn/hlir/framework/pir/group.cc index 7cef409f9cad2..c209f2301bf95 100644 --- a/paddle/cinn/hlir/framework/pir/group.cc +++ b/paddle/cinn/hlir/framework/pir/group.cc @@ -52,7 +52,6 @@ std::shared_ptr Group::Clone(::pir::Block* target_block, new_group->input_names = this->input_names; new_group->output_names = this->output_names; - new_group->output_values = this->output_values; new_group->fn_name = this->fn_name; new_group->int_args_map = this->int_args_map; new_group->alignment_schedule_info = this->alignment_schedule_info; diff --git a/test/ir/pir/cinn/inference/test_llama_while.py b/test/ir/pir/cinn/inference/test_llama_while.py index d0197dd7041b4..0afa041f5baa3 100644 --- a/test/ir/pir/cinn/inference/test_llama_while.py +++ b/test/ir/pir/cinn/inference/test_llama_while.py @@ -34,12 +34,11 @@ def __init__(self): def forward(self, logits, input_ids): batch_size, cur_len = paddle.shape(input_ids) unfinished_flag = paddle.full([batch_size, 1], True, dtype="bool") - max_new_tokens = paddle.full([1], 4, dtype="int64") + max_new_tokens = paddle.full([1], 16, dtype="int64") while cur_len < max_new_tokens and paddle.any(unfinished_flag): last_token = input_ids[:, -1] # [batch_size, vocab_size] - logits = logits[:, -1, :] - probs = F.softmax(logits) + probs = F.softmax(logits[:, -1, :]) # compute next_tokens top_ps_tensor = paddle.full( @@ -61,7 +60,7 @@ def setUp(self): def prepare_data(self): self.logits = paddle.randn([1, 256, 3200], dtype="float32") - self.input_ids = paddle.randint(0, 512, [1, 32], dtype="int64") + self.input_ids = paddle.randint(0, 512, [1, 8], dtype="int64") def check_jit_kernel_info(self, static_fn): utils.check_jit_kernel_number(static_fn, 1) diff --git a/test/ir/pir/cinn/symbolic/test_cinn_broadcast_symbolic.py b/test/ir/pir/cinn/symbolic/test_cinn_broadcast_symbolic.py index 
96f8fbfebd24b..dde162765ea64 100644 --- a/test/ir/pir/cinn/symbolic/test_cinn_broadcast_symbolic.py +++ b/test/ir/pir/cinn/symbolic/test_cinn_broadcast_symbolic.py @@ -57,8 +57,17 @@ def prepare_data(self): self.y.stop_gradient = False def check_jit_kernel_info(self, static_fn): - utils.check_jit_kernel_number(static_fn, 1) - utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + utils.check_jit_kernel_number(static_fn, 3) + utils.check_jit_kernel_structure( + static_fn, + { + 'if_0': {utils.JIT_KERNEL_NAME: 1}, + 'else_0': { + 'if_0_0': {utils.JIT_KERNEL_NAME: 1}, + 'else_0_0': {utils.JIT_KERNEL_NAME: 1}, + }, + }, + ) def eval_symbolic(self, use_cinn): paddle.seed(2022) From 12570594f2e034cdf9d5a85e36dd4849bab87fc6 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Fri, 8 Mar 2024 10:55:49 +0800 Subject: [PATCH 072/114] [AutoTuner] support refined recompute in autotuner (#62430) * support refined recompute in autotuner * fix pp prune bug * update rr autotuner * add rr resume * fix rr prune bug * fix rr prune history bug * fix rr pp prune bug --- python/paddle/distributed/auto_tuner/prune.py | 104 ++++++- .../paddle/distributed/auto_tuner/search.py | 4 +- python/paddle/distributed/auto_tuner/tuner.py | 5 + python/paddle/distributed/auto_tuner/utils.py | 254 +++++++++++++++++- python/paddle/distributed/launch/main.py | 52 ++-- 5 files changed, 372 insertions(+), 47 deletions(-) diff --git a/python/paddle/distributed/auto_tuner/prune.py b/python/paddle/distributed/auto_tuner/prune.py index 51db43f66a05e..e87d3adc6a74f 100644 --- a/python/paddle/distributed/auto_tuner/prune.py +++ b/python/paddle/distributed/auto_tuner/prune.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import logging import os import subprocess @@ -21,8 +22,8 @@ _PRUNE_HISTORY_FUNC = [] -def log_pruned_info(cur_cfg, pruned_reason): - pruned_strategy = "DP{}_MP{}_PP{}_VPP_{}_Sharding{}_Stage{}_MBS{}_Recompute_{}_Granularity_{}".format( +def log_pruned_info(cur_cfg, pruned_reason, tuner_cfg): + pruned_strategy = "DP{}_MP{}_PP{}_VPP{}_Sharding{}_Stage{}_MBS{}_Recompute_{}_Granularity_{}".format( cur_cfg["dp_degree"], cur_cfg["mp_degree"], cur_cfg["pp_degree"], @@ -33,6 +34,11 @@ def log_pruned_info(cur_cfg, pruned_reason): cur_cfg["use_recompute"], cur_cfg["recompute_granularity"], ) + if "refined_recompute" in tuner_cfg: + for key in tuner_cfg["refined_recompute"]: + strategy = "".join(i.capitalize() for i in key.split("_")) + strategy += str(cur_cfg[key]) + pruned_strategy = pruned_strategy + "_" + strategy try: from paddle.distributed.launch.main import ctx @@ -215,7 +221,7 @@ def prune_by_mp_pp_history(tuner_cfg, cur_cfg, history_cfgs, pruned_cfgs): and cfg.get("max_mem_usage") == "OOM" ): pruned_reason = f"mp_degree {mp_degree}, pp_degree {pp_degree} may cause oom because {cfg['mp_degree']}, {cfg['pp_degree']} already oom." - log_pruned_info(cur_cfg, pruned_reason) + log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) cur_cfg["max_mem_usage"] = "OOM" return True @@ -292,7 +298,7 @@ def prune_by_vpp_history(tuner_cfg, cur_cfg, history_cfgs=[], pruned_cfgs=[]): and cfg.get("max_mem_usage") == "OOM" ): pruned_reason = f"vpp_degree {vpp_degree} may cause oom because { cfg['vpp_degree']} already oom." 
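                # History-based prune: mark the candidate OOM and skip
                # launching it, since it is expected to fail the same way as
                # the already-recorded configuration above.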
- log_pruned_info(cur_cfg, pruned_reason) + log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) cur_cfg["max_mem_usage"] = "OOM" return True @@ -336,9 +342,12 @@ def prune_by_mbs(tuner_cfg, cur_cfg, history_cfgs=[]): if local_batch_size % micro_batch_size != 0: return True acc_steps = local_batch_size // micro_batch_size + pp_degree = cur_cfg.get("pp_degree", None) + if pp_degree is not None: + if acc_steps < pp_degree: + return True vpp_degree = cur_cfg.get("vpp_degree", None) if vpp_degree is not None and vpp_degree > 1: - pp_degree = cur_cfg.get("pp_degree", None) if pp_degree is not None: if acc_steps % pp_degree != 0: return True @@ -375,7 +384,7 @@ def prune_by_mbs_history(tuner_cfg, cur_cfg, history_cfgs=[], pruned_cfgs=[]): and cfg.get("time", -1) > 0 ): pruned_reason = f"micro_batch_size {micro_batch_size} may be slower because {cfg['micro_batch_size']} has been already runnable." - log_pruned_info(cur_cfg, pruned_reason) + log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) cur_cfg["time"] = cfg["time"] return True # memory prune @@ -384,7 +393,7 @@ def prune_by_mbs_history(tuner_cfg, cur_cfg, history_cfgs=[], pruned_cfgs=[]): and cfg.get("max_mem_usage") == "OOM" ): pruned_reason = f"micro_batch_size {micro_batch_size} may cause oom because {cfg['micro_batch_size']} already oom." - log_pruned_info(cur_cfg, pruned_reason) + log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) cur_cfg["max_mem_usage"] = "OOM" return True return False @@ -459,7 +468,7 @@ def prune_by_sharding_history( and cfg.get("time", -1) > 0 ): pruned_reason = f"sharding_stage {sharding_stage} may be slower because {cfg['sharding_stage'] } has been already runnable." - log_pruned_info(cur_cfg, pruned_reason) + log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) cur_cfg["time"] = cfg["time"] return True @@ -469,7 +478,7 @@ def prune_by_sharding_history( and cfg.get("max_mem_usage") == "OOM" ): pruned_reason = f"sharding_stage {sharding_stage} may cause oom because {cfg['sharding_stage']} already oom." - log_pruned_info(cur_cfg, pruned_reason) + log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) cur_cfg["max_mem_usage"] = "OOM" return True @@ -567,7 +576,7 @@ def prune_by_recompute_history( and cfg.get("time", -1) > 0 ): pruned_reason = f"use_recompute may be slower because {cfg['use_recompute']} has been already runnable." - log_pruned_info(cur_cfg, pruned_reason) + log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) cur_cfg["time"] = cfg["time"] return True @@ -576,7 +585,7 @@ def prune_by_recompute_history( and cfg.get("max_mem_usage") == "OOM" ): pruned_reason = f"use_recompute may cause oom because {cfg['use_recompute']} already oom." 
- log_pruned_info(cur_cfg, pruned_reason) + log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) cur_cfg["max_mem_usage"] = "OOM" return True @@ -816,3 +825,76 @@ def prune_by_invalid_strategy(tuner_cfg, cur_cfg, history_cfgs=[]): return True return False + + +@register_prune +def prune_by_refined_recompute(tuner_cfg, cur_cfg, history_cfgs=[]): + if tuner_cfg.get("refined_recompute", None): + rr = tuner_cfg.get("refined_recompute") + pp_degree = cur_cfg["pp_degree"] + recompute = cur_cfg["use_recompute"] + recompute_granularity = cur_cfg["recompute_granularity"] + compare = [cur_cfg[item] for item in rr] + if recompute: + if recompute_granularity and recompute_granularity != "full": + if compare.count(0) != len(compare): + return True + if pp_degree == 1 and compare.count(0) != len(compare): + return True + if tuner_cfg["model_cfg"]["num_layers"] % pp_degree != 0: + return True + max_value = tuner_cfg["model_cfg"]["num_layers"] / pp_degree + if cur_cfg[rr[0]] > max_value: + return True + i = 1 + while i < len(rr): + if cur_cfg[rr[i]] > max_value or ( + cur_cfg[rr[i - 1]] != max_value and cur_cfg[rr[i]] != 0 + ): + return True + i += 1 + + return False + + +@register_prune_history +def prune_by_refined_recompute_history( + tuner_cfg, cur_cfg, history_cfgs=[], pruned_cfgs=[] +): + if tuner_cfg.get("refined_recompute", None): + history_cfgs.extend(pruned_cfgs) + rr = tuner_cfg.get("refined_recompute") + compare = copy.deepcopy(rr) + compare.append("use_recompute") + cfgs = same_cfgs_beside(compare, cur_cfg, history_cfgs) + for item in rr: + if cfgs: + for cfg in cfgs: + if not cfg["use_recompute"] and cfg.get("time", -1) > 0: + pruned_reason = f"{item} {cur_cfg[item]} may be slower because not recompute has been already runnable." + log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) + cur_cfg["time"] = cfg["time"] + return True + if ( + cfg[item] > cur_cfg[item] + and cfg.get("time", -1) > 0 + and cfg["use_recompute"] + and cur_cfg["use_recompute"] + ): + pruned_reason = f"{item} {cur_cfg[item]} may be slower because {cfg[item]} has been already runnable." + log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) + cur_cfg["time"] = cfg["time"] + return True + # memory prune + if ( + cfg[item] < cur_cfg[item] + and cfg.get("max_mem_usage") == "OOM" + and cfg["use_recompute"] + and cur_cfg["use_recompute"] + ): + pruned_reason = f"{item} {cur_cfg[item]} may cause oom because {cfg[item]} already oom." 
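+                    # The guard above (cfg[item] < cur_cfg[item] and cfg
+                    # OOMed) encodes a monotonicity assumption: raising this
+                    # refined-recompute value never lowers peak memory, so the
+                    # larger candidate is pruned as OOM without a trial run.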
+ log_pruned_info(cur_cfg, pruned_reason, tuner_cfg) + cur_cfg["max_mem_usage"] = "OOM" + return True + + return False diff --git a/python/paddle/distributed/auto_tuner/search.py b/python/paddle/distributed/auto_tuner/search.py index 0fe26da0886f1..c4eeb7c493100 100644 --- a/python/paddle/distributed/auto_tuner/search.py +++ b/python/paddle/distributed/auto_tuner/search.py @@ -63,7 +63,9 @@ def search_once(self, history_cfgs): stop = False if history_cfgs: if history_cfgs[-1].get("time", -1) > 0: - if self.baseline is None: + if self.baseline is None and self.tuner_cfg.get( + "need_baseline", False + ): from .utils import performance_sort self.baseline = history_cfgs[-1] diff --git a/python/paddle/distributed/auto_tuner/tuner.py b/python/paddle/distributed/auto_tuner/tuner.py index 6a6a0ba4e082f..894ba6217a6f2 100644 --- a/python/paddle/distributed/auto_tuner/tuner.py +++ b/python/paddle/distributed/auto_tuner/tuner.py @@ -133,6 +133,11 @@ def get_cfg_from_resume(self, cur_cfg): 'sharding_overlap', 'acc_steps', ] + + if self.tuner_cfg.get("refined_recompute", None): + for rr in self.tuner_cfg["refined_recompute"]: + keys_to_compare.append(rr) + for cfg in self.resume_cfgs: ret_is_same = True for key in keys_to_compare: diff --git a/python/paddle/distributed/auto_tuner/utils.py b/python/paddle/distributed/auto_tuner/utils.py index 153e4156b03f5..aebc45c3e0817 100644 --- a/python/paddle/distributed/auto_tuner/utils.py +++ b/python/paddle/distributed/auto_tuner/utils.py @@ -296,6 +296,21 @@ def default_candidates(tuner_cfg): f"recompute_granularity only supports auto/{'/'.join(__SUPPORTED_RECOMPUTE_GRANULARITY__)}, but got {recompute_granularity}" ) + # add refine recompute default values + refined_recompute = tuner_cfg.get("refined_recompute", None) + if refined_recompute is not None: + candidates["refined_recompute"] = {} + assert isinstance(refined_recompute, list) + for op_type in refined_recompute: + assert isinstance(op_type, str) + if schedule_mode == "performance": + candidates["refined_recompute"][op_type] = list( + range(tuner_cfg["model_cfg"]["num_layers"] + 1, -1, -1) + ) + else: + candidates["refined_recompute"][op_type] = list( + range(tuner_cfg["model_cfg"]["num_layers"] + 1) + ) return candidates @@ -312,6 +327,7 @@ def search_all(tuner_cfg): sharding_degree_candidates = candidates["sharding_degree"] use_recompute_candidates = candidates["use_recompute"] recompute_granularity_candidates = candidates["recompute_granularity"] + refine_recompute_candidates = candidates.get("refined_recompute", None) num_gpus = ( tuner_cfg["num_gpus"] @@ -360,6 +376,14 @@ def search_all(tuner_cfg): ) ) + rr_dim_cfgs = None + if refine_recompute_candidates is not None: + rr = tuner_cfg["refined_recompute"] + rr_list = [] + for op_type in rr: + rr_list.append(refine_recompute_candidates[op_type]) + rr_dim_cfgs = list(itertools.product(*rr_list)) + all_cfgs = [] for valid_degree in valid_degrees: for other_dim_cfg in other_dim_cfgs: @@ -379,8 +403,49 @@ def search_all(tuner_cfg): continue if tuner_cfg["model_cfg"]["num_layers"] % (pp_degree * vpp) != 0: continue - cfg = list(valid_degree) + list(other_dim_cfg) - all_cfgs.append(cfg) + + if rr_dim_cfgs: + for rr_dim_cfg in rr_dim_cfgs: + skip = False + if ( + (pp_degree == 1) + or (not use_recompute) + or (use_recompute and recompute_granularity != "full") + ): + if list(rr_dim_cfg).count(0) != len(rr_dim_cfg): + skip = True + + max_value = tuner_cfg["model_cfg"]["num_layers"] / pp_degree + if rr_dim_cfg[0] > max_value: + skip = True + i = 1 + while i 
< len(rr_dim_cfg): + if ( + rr_dim_cfg[i - 1] != max_value + and rr_dim_cfg[i] != 0 + ) or rr_dim_cfg[i] > max_value: + skip = True + break + i += 1 + if skip: + cfg = ( + list(valid_degree) + + list(other_dim_cfg) + + [0 for i in range(len(rr_dim_cfg))] + ) + if cfg not in all_cfgs: + all_cfgs.append(cfg) + else: + cfg = ( + list(valid_degree) + + list(other_dim_cfg) + + list(rr_dim_cfg) + ) + if cfg not in all_cfgs: + all_cfgs.append(cfg) + else: + cfg = list(valid_degree) + list(other_dim_cfg) + all_cfgs.append(cfg) mapping = { 0: "mp_degree", @@ -393,13 +458,17 @@ def search_all(tuner_cfg): 7: "use_recompute", 8: "recompute_granularity", } + + if refine_recompute_candidates is not None: + rr = tuner_cfg["refined_recompute"] + for dim in rr: + mapping[len(mapping)] = dim new_all_cfgs = [] for cfg in all_cfgs: new_cfg = {} for idx, val in enumerate(cfg): new_cfg[mapping[idx]] = val new_all_cfgs.append(new_cfg) - search_space_size_before_prune = len(new_all_cfgs) pruned_all_cfgs = [] tuner_cfg["num_gpus"] = num_gpus @@ -712,6 +781,103 @@ def add_overlap_performance(cur_cfg, tuner_cfg, history_cfgs): raw_cfg[mew_key] = round(raw_cfg[key] * (1 + ratio), 5) +def gen_sharding_overlap_args_of_grid_search(res_args, cfg, tuner_cfg): + """Generate args of sharding overlap.""" + if "sharding_overlap" not in tuner_cfg["search_algo"]: + return + cmd = copy.deepcopy(tuner_cfg["search_algo"]["sharding_overlap"]) + valid_hybrid_strategy = [ + "sharding_mp", + "sharding_pp", + "sharding_mp_pp", + "no_overlap", + ] + for key in cmd: + if key not in valid_hybrid_strategy: + raise ValueError( + f"Only support {valid_hybrid_strategy}, but got {key}." + ) + sharding_degree = cfg["sharding_degree"] + mp_degree = cfg["mp_degree"] + pp_degree = cfg["pp_degree"] + arg = None + if mp_degree > 1 and pp_degree == 1 and sharding_degree > 1: + arg = "sharding_mp" + elif mp_degree == 1 and pp_degree > 1 and sharding_degree > 1: + arg = "sharding_pp" + elif mp_degree > 1 and pp_degree > 1 and sharding_degree > 1: + arg = "sharding_mp_pp" + else: + arg = "no_overlap" + assert arg is not None + if arg in cmd: + if "--" in cmd[arg][0]: + arg_map_len = len(cmd[arg]) + assert arg_map_len % 2 == 0 + i = 0 + while i < arg_map_len: + new_arg = [cmd[arg][i], str(cmd[arg][i + 1])] + res_args.extend(new_arg) + i += 2 + elif "-o" in cmd[arg][0]: + res_args.extend(cmd[arg]) + elif ".json" in cmd[arg][0]: + import json + + file_path = cmd[arg][0] + try: + with open(file_path, "r") as f: + cmd_cfg = json.load(f) + except: + raise ValueError( + "Please check your auto tuner json whether valid." + ) + keys = cmd[arg][1].split(".") + value = None + for key in keys[: len(keys) - 1]: + if value: + value = value[key] + else: + value = cmd_cfg[key] + if value: + value[keys[-1]] = cmd[arg][2] + else: + cmd_cfg[keys[-1]] = cmd[arg][2] + json.dump(cmd_cfg, open(cmd[arg][0], "w")) + + elif ".yaml" in cmd[arg][0]: + import yaml + + file_path = cmd[arg][0] + try: + with open(file_path, "r") as f: + cmd_cfg = yaml.safe_load(f) + except: + raise ValueError( + "Please check your auto tuner json whether valid." 
+                )
+            arg_map_len = len(cmd[arg]) - 1
+            assert arg_map_len % 2 == 0
+
+            i = 1
+            while i < arg_map_len:
+                keys = cmd[arg][i].split(".")
+                value = None
+                for key in keys[: len(keys) - 1]:
+                    if value:
+                        value = value[key]
+                    else:
+                        value = cmd_cfg[key]
+                if value:
+                    i += 1
+                    value[keys[-1]] = cmd[arg][i]
+                else:
+                    i += 1
+                    cmd_cfg[keys[-1]] = cmd[arg][i]
+                i += 1
+            yaml.dump(cmd_cfg, open(cmd[arg][0], "w"))
+
+
 def gen_sharding_overlap_args(res_args, cfg, tuner_cfg):
     """Generate args of sharding overlap."""
     if "sharding_overlap" not in tuner_cfg["search_algo"]:
@@ -1225,6 +1391,82 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg):
             )
             yaml.dump(cmd_cfg, open(cmd[arg][0], "w"))

+    elif arg == "refined_recompute" and arg in cmd:
+        if "--" in cmd["refined_recompute"][0]:
+            raise NotImplementedError(
+                "refined recompute is not supported by command in autotuner."
+            )
+        elif "-o" in cmd["refined_recompute"][0]:
+            raise NotImplementedError(
+                "refined recompute is not supported by '-o' in autotuner."
+            )
+        elif ".json" in cmd[arg][0]:
+            import json
+
+            file_path = cmd[arg][0]
+            if len(cmd[arg]) >= 3:
+                raise ValueError(
+                    "The 3rd arg is not supported in refined_recompute"
+                )
+            try:
+                with open(file_path, "r") as f:
+                    cmd_cfg = json.load(f)
+            except:
+                raise ValueError(
+                    "Please check whether your auto tuner json file is valid."
+                )
+            keys = cmd[arg][1].split(".")
+            value = None
+            rr_values = {}
+            rr = tuner_cfg.get("refined_recompute", None)
+            if not rr:
+                return
+            for key in rr:
+                rr_values[key] = cfg[key]
+            for key in keys[: len(keys) - 1]:
+                if not value:
+                    value = cmd_cfg[key]
+                else:
+                    value = value[key]
+            if value:
+                value[keys[-1]] = rr_values
+            else:
+                cmd_cfg[keys[-1]] = rr_values
+            json.dump(cmd_cfg, open(cmd[arg][0], "w"))
+        elif ".yaml" in cmd[arg][0]:
+            import yaml
+
+            file_path = cmd[arg][0]
+            if len(cmd[arg]) >= 3:
+                raise ValueError(
+                    "The 3rd arg is not supported in refined_recompute"
+                )
+            try:
+                with open(file_path, "r") as f:
+                    cmd_cfg = yaml.safe_load(f)
+            except:
+                raise ValueError(
+                    "Please check whether your auto tuner yaml file is valid."
+ ) + keys = cmd[arg][1].split(".") + value = None + rr_values = {} + rr = tuner_cfg.get("refined_recompute", None) + if not rr: + return + for key in rr: + rr_values[key] = cfg[key] + for key in keys[: len(keys) - 1]: + if not value: + value = cmd_cfg[key] + else: + value = value[key] + if value: + value[keys[-1]] = rr_values + else: + cmd_cfg[keys[-1]] = rr_values + yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) + assert "run_cmd" in tuner_cfg cmd = copy.deepcopy(tuner_cfg["run_cmd"]) res_args = copy.deepcopy(raw_args) @@ -1242,6 +1484,7 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): _gen_new_arg("gradient_accumulation_steps", cmd, cfg, res_args, tuner_cfg) _gen_new_arg("global_batch_size", cmd, cfg, res_args, tuner_cfg) _gen_new_arg("sequence_parallel", cmd, cfg, res_args, tuner_cfg) + _gen_new_arg("refined_recompute", cmd, cfg, res_args, tuner_cfg) if tuner_cfg["run_cmd"].get("search_stage", None) and not run_best: cmd = copy.deepcopy(tuner_cfg["run_cmd"]["search_stage"]) @@ -1352,7 +1595,10 @@ def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg): yaml.dump(cmd_cfg, open(cmd[arg][0], "w")) # sharding overlap args - gen_sharding_overlap_args(res_args, cfg, tuner_cfg) + if tuner_cfg["search_algo"]["name"] == "grid": + gen_sharding_overlap_args_of_grid_search(res_args, cfg, tuner_cfg) + else: + gen_sharding_overlap_args(res_args, cfg, tuner_cfg) return res_args diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index ee4987e22888f..2621de6a86c72 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -627,38 +627,28 @@ def launch(): job_id += 1 task_job_id = "auto_tuner_" + str(job_id) ctx.args.job_id = task_job_id - + log_dir = "Job{}_GBS{}_DP{}_MP{}_PP{}_VPP{}_Sharding{}_Stage{}_MBS{}_Recompute_{}_Granularity_{}_AccStep{}".format( + job_id, + global_batch_size, + cur_cfg["dp_degree"], + cur_cfg["mp_degree"], + cur_cfg["pp_degree"], + cur_cfg["vpp_degree"], + cur_cfg["sharding_degree"], + cur_cfg["sharding_stage"], + cur_cfg["micro_batch_size"], + cur_cfg["use_recompute"], + cur_cfg["recompute_granularity"], + cur_cfg["acc_steps"], + ) if "sharding_overlap" in cur_cfg: - log_dir = "Job{}_GBS{}_DP{}_MP{}_PP{}_VPP{}_Sharding{}_Stage{}_MBS{}_Recompute_{}_Granularity_{}_AccStep{}_Overlap_{}".format( - job_id, - global_batch_size, - cur_cfg["dp_degree"], - cur_cfg["mp_degree"], - cur_cfg["pp_degree"], - cur_cfg["vpp_degree"], - cur_cfg["sharding_degree"], - cur_cfg["sharding_stage"], - cur_cfg["micro_batch_size"], - cur_cfg["use_recompute"], - cur_cfg["recompute_granularity"], - cur_cfg["acc_steps"], - cur_cfg["sharding_overlap"], - ) - else: - log_dir = "Job{}_GBS{}_DP{}_MP{}_PP{}_VPP{}_Sharding{}_Stage{}_MBS{}_Recompute_{}_Granularity_{}_AccStep{}".format( - job_id, - global_batch_size, - cur_cfg["dp_degree"], - cur_cfg["mp_degree"], - cur_cfg["pp_degree"], - cur_cfg["vpp_degree"], - cur_cfg["sharding_degree"], - cur_cfg["sharding_stage"], - cur_cfg["micro_batch_size"], - cur_cfg["use_recompute"], - cur_cfg["recompute_granularity"], - cur_cfg["acc_steps"], - ) + log_dir = log_dir + f"_Overlap_{cur_cfg['sharding_overlap']}" + if "refined_recompute" in tuner_cfg: + for key in tuner_cfg["refined_recompute"]: + dir_name = "".join(i.capitalize() for i in key.split("_")) + dir_name += str(cur_cfg[key]) + log_dir = log_dir + "_" + dir_name + ctx.args.log_dir = os.path.join( os.path.dirname(ctx.args.auto_tuner_json), log_dir ) From 03344d8ec5061d0f1e321a596d075e9a62cbd5f1 Mon Sep 17 00:00:00 2001 
From: NeroLoh <745827440@qq.com>
Date: Fri, 8 Mar 2024 11:01:53 +0800
Subject: [PATCH 073/114] [PHI]Support setting need_prepare_phi_data by env
 (#62519)

---
 paddle/fluid/framework/operator.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index afe442c0a7c6f..51780c05150aa 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1704,6 +1704,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   all_kernels_must_compute_runtime_shape_ = true;
   const Scope* cur_scope = &scope;
   CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
+#if defined(PADDLE_WITH_XPU)
+  if (std::getenv("XPU_NEED_PREPARE_PHI_DATA") != nullptr) {
+    need_prepare_phi_data_ = atoi(std::getenv("XPU_NEED_PREPARE_PHI_DATA"));
+  }
+#endif
   if (!enable_cache_runtime_context_) {
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);

From 8a523eef8d8069c8124179c2768c1d3a079649db Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Fri, 8 Mar 2024 11:17:20 +0800
Subject: [PATCH 074/114] skip prepare_op_amp_options in build_program when
 pir is used (#62528)

---
 .../distributed/auto_parallel/static/helper.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/static/helper.py b/python/paddle/distributed/auto_parallel/static/helper.py
index 50b67e0cbb946..99f9343871768 100644
--- a/python/paddle/distributed/auto_parallel/static/helper.py
+++ b/python/paddle/distributed/auto_parallel/static/helper.py
@@ -260,11 +260,18 @@ def build_program(self, mode):
         concrete_program = getattr(
             self.proxy_layer, func_name
         ).concrete_program  # noqa: B018
-        prepare_op_amp_options(
-            concrete_program.main_program,
-            ProgramTranslator.get_instance()._amp_records,
-            DEFAULT_AMP_OPTIONS,
-        )
+
+        # TODO(zhiqiu): prepare_op_amp_options is not supported for PIR
+        # programs. PIR will use the dynamic-static unified amp, so there is
+        # no need to adapt prepare_op_amp_options for it.
+        if not paddle.base.framework.get_flags("FLAGS_enable_pir_api")[
+            "FLAGS_enable_pir_api"
+        ]:
+            prepare_op_amp_options(
+                concrete_program.main_program,
+                ProgramTranslator.get_instance()._amp_records,
+                DEFAULT_AMP_OPTIONS,
+            )
+
         self._build_startup_program()

     def _build_startup_program(self):

From 93d1e8501368883c60a002c1e976f89a25140a48 Mon Sep 17 00:00:00 2001
From: ShenLiang <1422485404@qq.com>
Date: Fri, 8 Mar 2024 14:07:52 +0800
Subject: [PATCH 075/114] [Distributed]Erase p2p cache for every step (#62277)
 (#62400)

* [Distributed]Erase p2p cache for every step (#62277)

* erase cache

* erase cache

* erase cache

* fix conflict

* add utest

---
 paddle/fluid/framework/distributed_strategy.proto |  1 +
 .../fleet/meta_parallel/pipeline_parallel.py      | 14 ++++++++++++++
 .../meta_parallel/pp_utils/p2p_communication.py   |  6 ++++++
 .../fleet/hybrid_parallel_shared_weight.py        |  2 ++
 4 files changed, 23 insertions(+)

diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 6cc52fba01236..be60529cc86d2 100755
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -83,6 +83,7 @@ message PpConfig {
   optional bool profiling = 5 [ default = false ];
   optional bool release_gradients = 6 [ default = false ];
   optional bool overlap_p2p_comm = 7 [default = false];
+  optional bool clear_every_step_cache = 8 [default = false];
 }

 message DygraphShardingConfig {
diff --git 
a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index e5233c87a199b..81f19fda76716 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -220,6 +220,10 @@ def __init__(self, layers, hcg, strategy): "pp_configs" ].overlap_p2p_comm + self._clear_every_step_cache = self._strategy.hybrid_configs[ + "pp_configs" + ].clear_every_step_cache + self._batch_p2p_comm = not self._overlap_p2p_comm logger.info( @@ -602,6 +606,10 @@ def forward_backward_pipeline( train_loss = self._broadcast_final_loss() if self._enable_timer: self.timers("broadcast_final_loss").stop() + + if self._clear_every_step_cache: + self._p2p_helper.clear_meta_cache() + self.timer_printer() return train_loss @@ -1674,6 +1682,9 @@ def _process_bwd_buffer(step_id, tensor): # else just return all intermediate output tensor for all micro steps train_loss = self.output_tensors + if self._clear_every_step_cache: + self._p2p_helper.clear_meta_cache() + self.timer_printer() return train_loss @@ -1917,5 +1928,8 @@ def forward_backward_pipeline( # else just return all intermediate output tensor for all micro steps train_loss = self.output_tensors + if self._clear_every_step_cache: + self._p2p_helper.clear_meta_cache() + self.timer_printer() return train_loss diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index 667040fc94443..e71949517273f 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -53,6 +53,9 @@ class SendRecvMeta: """Mainly used to help p2p communication context information""" def __init__(self): + self.init_or_erase_meta() + + def init_or_erase_meta(self): self.send_shape_message = None self.send_dtype_message = None @@ -661,6 +664,9 @@ def _recv_meta(self): self._send_recv_meta.recv_meta(_hcg.get_pipe_parallel_group()) self._send_recv_meta.has_recv_meta = self._use_cache + def clear_meta_cache(self): + self._send_recv_meta.init_or_erase_meta() + def recv_forward(self, pp_first_stage, sync_recv=True, batch_p2p_comm=True): global _timers if _timers is not None: diff --git a/test/collective/fleet/hybrid_parallel_shared_weight.py b/test/collective/fleet/hybrid_parallel_shared_weight.py index 2202d88e90723..febce22a3e914 100644 --- a/test/collective/fleet/hybrid_parallel_shared_weight.py +++ b/test/collective/fleet/hybrid_parallel_shared_weight.py @@ -167,6 +167,8 @@ def setUp(self): "accumulate_steps": batch_size // micro_batch_size, "micro_batch_size": micro_batch_size, } + strategy.hybrid_configs["pp_configs"].clear_every_step_cache = True + fleet.init(is_collective=True, strategy=strategy) def test_pp_model(self): From 04c96faeda8f1968847e1929093e86114294ee87 Mon Sep 17 00:00:00 2001 From: Tian <121000916+SylarTiaNII@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:18:42 +0800 Subject: [PATCH 076/114] [Distributed] fix sharding on custom devices (#62535) --- python/paddle/distributed/communication/reduce.py | 9 ++++++++- .../dygraph_optimizer/dygraph_sharding_optimizer.py | 10 ++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/python/paddle/distributed/communication/reduce.py b/python/paddle/distributed/communication/reduce.py index 5ddffbda4c73b..881b2339595fe 100644 --- 
a/python/paddle/distributed/communication/reduce.py +++ b/python/paddle/distributed/communication/reduce.py @@ -123,7 +123,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): >>> # [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1) """ # AVG is only supported when nccl >= 2.10 - if op == ReduceOp.AVG and paddle.base.core.nccl_version() < 21000: + if op == ReduceOp.AVG and (not is_avg_reduce_op_supported()): group = ( paddle.distributed.collective._get_global_group() if group is None @@ -201,3 +201,10 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): ) else: raise ValueError(f"Unknown parameter: {op}.") + + +def is_avg_reduce_op_supported(): + if paddle.is_compiled_with_cuda(): + return paddle.base.core.nccl_version() >= 21000 + else: + return False diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index eb09eb66ae353..2b0001ddc5c8a 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -23,7 +23,10 @@ from paddle.base.dygraph import base as imperative_base from paddle.base.framework import EagerParamBase from paddle.distributed import fleet -from paddle.distributed.communication.reduce import ReduceOp +from paddle.distributed.communication.reduce import ( + ReduceOp, + is_avg_reduce_op_supported, +) from ...utils.log_util import logger from ...utils.tensor_fusion_helper import ( @@ -101,11 +104,10 @@ def __init__(self, optimizer, hcg): self.use_reduce_avg = strategy.hybrid_configs[ 'sharding_configs' ].use_reduce_avg - if self.use_reduce_avg and paddle.base.core.nccl_version() < 21000: + if self.use_reduce_avg and (not is_avg_reduce_op_supported()): self.use_reduce_avg = False warnings.warn( - "nccl reduce_avg requires nccl>=2.10.0, but current version is %s" - % paddle.base.core.nccl_version() + "nccl reduce_avg requires paddle compiled with cuda and nccl>=2.10.0, please check compilation setups." 
            )

        pp_overlap = strategy.hybrid_configs['pp_configs'].sharding_comm_overlap

From 12666cefd41f1ef32b54a2a4f4e55694175c2863 Mon Sep 17 00:00:00 2001
From: BiynXu <62832681+BiynXu@users.noreply.github.com>
Date: Fri, 8 Mar 2024 15:39:30 +0800
Subject: [PATCH 077/114] disable isl init in dynamic shape mode (#62521)

* disable isl init in dynamic shape mode

* delete check

---
 paddle/cinn/ir/schedule/impl/base.cc |  2 +-
 paddle/cinn/ir/tensor.cc             | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/paddle/cinn/ir/schedule/impl/base.cc b/paddle/cinn/ir/schedule/impl/base.cc
index 61632dcf2452e..1640ee2b9c849 100644
--- a/paddle/cinn/ir/schedule/impl/base.cc
+++ b/paddle/cinn/ir/schedule/impl/base.cc
@@ -40,7 +40,7 @@ void DyScheduleImpl::MergeExprs() {
   std::string primitive = "MergeExprs";
   std::ostringstream os;
   auto exprs = this->GetModule().GetExprs();
-  if (exprs.size() == 1U) return;
+  if (exprs.size() <= 1U) return;
   if (!exprs[0].As<ir::Block>()) {
     os << "Expr[0] of module_expr should be a Block!\n";
     throw IRScheduleErrorHandler(primitive, os.str(), module_expr_);
 }
diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc
index c2ba20487e2a8..dc19d4661fbe4 100644
--- a/paddle/cinn/ir/tensor.cc
+++ b/paddle/cinn/ir/tensor.cc
@@ -32,6 +32,8 @@
 #include "paddle/cinn/poly/isl_utils.h"
 #include "paddle/cinn/poly/stage.h"

+PD_DECLARE_bool(cinn_bucket_compile);
+
 namespace cinn {
 namespace ir {

@@ -689,7 +691,18 @@ ir::Tensor _Tensor_::ReshapeCopied(const std::vector<Expr> &shape,
 }

 Shared<poly::Stage> CreateStage(Tensor tensor) {
-  auto isl_domain = tensor->GenerateIslDomain();
+  isl::set isl_domain;
+  // We will remove isl, and the subsequent compilation process will no longer
+  // use it. However, it has not been completely removed yet and cannot be
+  // supported here under dynamic shape. Therefore, we temporarily use a fake
+  // domain.
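+  // (Presumably the empty "fake_domain" constructed below only needs to
+  // satisfy the Stage constructor's interface; the bucket-compile path is
+  // not expected to derive any scheduling information from it.)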
+ if (FLAGS_cinn_bucket_compile) { + poly::Domain fake_domain(Context::isl_ctx(), "fake_domain", {}); + isl_domain = fake_domain.to_isl(); + } else { + isl_domain = tensor->GenerateIslDomain(); + } + return poly::Stage::New(isl_domain, tensor->body(), tensor.self()); } From 3ed3761472648ffb1b3afda1fb3e214aad8b20fd Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Fri, 8 Mar 2024 15:39:59 +0800 Subject: [PATCH 078/114] fix replace reshape op (#62552) --- .../hlir/dialect/operator/transforms/dynamic_reshape_pass.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc index 4aef88b8dcd41..834412f83364f 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc @@ -36,10 +36,14 @@ bool ReplaceOpWithReshapeOp(pir::Operation* op, if (shape_analysis->HasShapeOrDataForValue(op->result(0))) { auto shape_info = shape_analysis->GetShapeOrDataForValue(op->result(0)).shape(); + int temp_dim = -1; for (size_t i = 0; i < shape_info.size(); ++i) { if (shape_info[i].isa()) { shape[i] = shape_info[i].Get(); + } else { + shape[i] = temp_dim; + temp_dim = 1; } } } From 2c7d1892f12b4f9220692505329eb519691754f6 Mon Sep 17 00:00:00 2001 From: yulangz <53958801+yulangz@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:12:33 +0800 Subject: [PATCH 079/114] Add sub graph of stable diffusion-4 (#62510) --- .../test_sub_graph_stable_diffusion_18_st.py | 299 ++++++++++++++++++ .../test_sub_graph_stable_diffusion_19_st.py | 110 +++++++ .../test_sub_graph_stable_diffusion_20_st.py | 99 ++++++ .../test_sub_graph_stable_diffusion_21_st.py | 110 +++++++ 4 files changed, 618 insertions(+) create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_18_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_19_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_20_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_21_st.py diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_18_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_18_st.py new file mode 100644 index 0000000000000..5b8f505a4fc84 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_18_st.py @@ -0,0 +1,299 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
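+
+# NOTE: the "repo"/"model"/"api" header below appears to come from the
+# sub-graph capture tool; judging from the op sequence, this LayerCase
+# replays a full transformer block (self-attention, cross-attention, then a
+# GEGLU feed-forward) extracted from stable_diffusion.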
+ +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.conv.conv2d||method:transpose||method:flatten||api:paddle.nn.functional.norm.layer_norm||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||method:reshape||method:transpose||method:reshape||method:transpose||method:reshape||method:transpose||api:paddle.tensor.linalg.matmul||method:__mul__||api:paddle.nn.functional.activation.softmax||api:paddle.tensor.linalg.matmul||method:transpose||method:reshape||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.dropout||method:__truediv__||method:__add__||api:paddle.nn.functional.norm.layer_norm||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||method:reshape||method:transpose||method:reshape||method:transpose||method:reshape||method:transpose||api:paddle.tensor.linalg.matmul||method:__mul__||api:paddle.nn.functional.activation.softmax||api:paddle.tensor.linalg.matmul||method:transpose||method:reshape||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.dropout||method:__truediv__||method:__add__||api:paddle.nn.functional.norm.layer_norm||api:paddle.nn.functional.common.linear||method:chunk||api:paddle.nn.functional.activation.gelu||method:__mul__||api:paddle.nn.functional.common.dropout||api:paddle.nn.functional.common.linear||method:__add__||method:reshape||method:transpose||api:paddle.nn.functional.conv.conv2d||method:__add__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[1280, 1280], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_4 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_5 = self.create_parameter( + shape=[1280, 1280], + dtype=paddle.float32, + ) + self.parameter_6 = self.create_parameter( + shape=[1280, 1280], + dtype=paddle.float32, + ) + self.parameter_7 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_8 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_9 = self.create_parameter( + shape=[10240], + dtype=paddle.float32, + ) + self.parameter_10 = self.create_parameter( + shape=[1280, 1280, 1, 1], + dtype=paddle.float32, + ) + self.parameter_11 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_12 = self.create_parameter( + shape=[1280, 1280, 1, 1], + dtype=paddle.float32, + ) + self.parameter_13 = self.create_parameter( + shape=[1280, 1280], + dtype=paddle.float32, + ) + self.parameter_14 = self.create_parameter( + shape=[5120, 1280], + dtype=paddle.float32, + ) + self.parameter_15 = self.create_parameter( + shape=[768, 1280], + dtype=paddle.float32, + ) + self.parameter_16 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_17 = self.create_parameter( + shape=[1280, 1280], + dtype=paddle.float32, + ) + self.parameter_18 = self.create_parameter( + shape=[1280, 1280], + dtype=paddle.float32, + ) + self.parameter_19 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_20 
= self.create_parameter( + shape=[768, 1280], + dtype=paddle.float32, + ) + self.parameter_21 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_22 = self.create_parameter( + shape=[1280, 10240], + dtype=paddle.float32, + ) + self.parameter_23 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 1280, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [], dtype: paddle.int32, stop_gradient: True) + var_2, # (shape: [], dtype: paddle.int32, stop_gradient: True) + var_3, # (shape: [1, 1280, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_4, # (shape: [1, 4, 768], dtype: paddle.float32, stop_gradient: True) + ): + var_5 = paddle.nn.functional.conv.conv2d( + var_0, self.parameter_10, self.parameter_19, [1, 1], 0, [1, 1], 1 + ) + var_6 = var_5.transpose([0, 2, 3, 1]) + var_7 = var_6.flatten(1, 2) + var_8 = paddle.nn.functional.norm.layer_norm( + var_7, + normalized_shape=[1280], + weight=self.parameter_1, + bias=self.parameter_16, + epsilon=1e-05, + ) + var_9 = paddle.nn.functional.common.linear( + x=var_8, weight=self.parameter_5, bias=None, name=None + ) + var_10 = paddle.nn.functional.common.linear( + x=var_8, weight=self.parameter_6, bias=None, name=None + ) + var_11 = paddle.nn.functional.common.linear( + x=var_8, weight=self.parameter_17, bias=None, name=None + ) + var_12 = var_9.reshape([0, 0, 8, 160]) + var_13 = var_12.transpose([0, 2, 1, 3]) + var_14 = var_10.reshape([0, 0, 8, 160]) + var_15 = var_14.transpose([0, 2, 1, 3]) + var_16 = var_11.reshape([0, 0, 8, 160]) + var_17 = var_16.transpose([0, 2, 1, 3]) + var_18 = paddle.tensor.linalg.matmul(var_13, var_15, transpose_y=True) + var_19 = var_18 * 0.07905694150420949 + var_20 = paddle.nn.functional.activation.softmax(var_19, axis=-1) + var_21 = paddle.tensor.linalg.matmul(var_20, var_17) + var_22 = var_21.transpose([0, 2, 1, 3]) + var_23 = var_22.reshape([0, 0, 1280]) + var_24 = paddle.nn.functional.common.linear( + x=var_23, weight=self.parameter_13, bias=self.parameter_3, name=None + ) + var_25 = paddle.nn.functional.common.dropout( + var_24, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_26 = var_25 / 1.0 + var_27 = var_26 + var_7 + var_28 = paddle.nn.functional.norm.layer_norm( + var_27, + normalized_shape=[1280], + weight=self.parameter_11, + bias=self.parameter_21, + epsilon=1e-05, + ) + var_29 = paddle.nn.functional.common.linear( + x=var_28, weight=self.parameter_18, bias=None, name=None + ) + var_30 = paddle.nn.functional.common.linear( + x=var_4, weight=self.parameter_15, bias=None, name=None + ) + var_31 = paddle.nn.functional.common.linear( + x=var_4, weight=self.parameter_20, bias=None, name=None + ) + var_32 = var_29.reshape([0, 0, 8, 160]) + var_33 = var_32.transpose([0, 2, 1, 3]) + var_34 = var_30.reshape([0, 0, 8, 160]) + var_35 = var_34.transpose([0, 2, 1, 3]) + var_36 = var_31.reshape([0, 0, 8, 160]) + var_37 = var_36.transpose([0, 2, 1, 3]) + var_38 = paddle.tensor.linalg.matmul(var_33, var_35, transpose_y=True) + var_39 = var_38 * 0.07905694150420949 + var_40 = paddle.nn.functional.activation.softmax(var_39, axis=-1) + var_41 = paddle.tensor.linalg.matmul(var_40, var_37) + var_42 = var_41.transpose([0, 2, 1, 3]) + var_43 = var_42.reshape([0, 0, 1280]) + var_44 = paddle.nn.functional.common.linear( + x=var_43, weight=self.parameter_0, bias=self.parameter_23, name=None + ) + var_45 = paddle.nn.functional.common.dropout( + var_44, + p=0.0, + 
axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_46 = var_45 / 1.0 + var_47 = var_46 + var_27 + var_48 = paddle.nn.functional.norm.layer_norm( + var_47, + normalized_shape=[1280], + weight=self.parameter_7, + bias=self.parameter_8, + epsilon=1e-05, + ) + var_49 = paddle.nn.functional.common.linear( + var_48, self.parameter_22, self.parameter_9 + ) + out = var_49.chunk(2, axis=-1) + var_50 = out[0] + var_51 = out[1] + var_52 = paddle.nn.functional.activation.gelu(var_51) + var_53 = var_50 * var_52 + var_54 = paddle.nn.functional.common.dropout( + var_53, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_55 = paddle.nn.functional.common.linear( + var_54, self.parameter_14, self.parameter_2 + ) + var_56 = var_55 + var_47 + var_57 = var_56.reshape([-1, var_1, var_2, 1280]) + var_58 = var_57.transpose([0, 3, 1, 2]) + var_59 = paddle.nn.functional.conv.conv2d( + var_58, self.parameter_12, self.parameter_4, [1, 1], 0, [1, 1], 1 + ) + var_60 = var_59 + var_3 + return var_60 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 1280, 1, 1], dtype=paddle.float32), + paddle.randint(low=1, high=2, shape=[1], dtype=paddle.int32), + paddle.randint(low=1, high=2, shape=[1], dtype=paddle.int32), + paddle.rand(shape=[1, 1280, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 4, 768], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=False, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_19_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_19_st.py new file mode 100644 index 0000000000000..a351ad02840e4 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_19_st.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
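+
+# NOTE: judging from the op sequence, this sub-graph is the time-embedding
+# path of a ResNet-style block: a conv over the feature map plus a
+# silu+linear projection of what is presumably the time embedding,
+# broadcast-added via the [:, :, None, None] indexing.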
+ +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.conv.conv2d||api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.linear||method:__getitem__||method:__add__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[1280, 1280, 3, 3], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[1280, 1280], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 1280, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [1, 1280], dtype: paddle.float32, stop_gradient: False) + ): + var_2 = paddle.nn.functional.activation.silu(var_0, None) + var_3 = paddle.nn.functional.conv.conv2d( + var_2, self.parameter_1, self.parameter_3, [1, 1], 1, [1, 1], 1 + ) + var_4 = paddle.nn.functional.activation.silu(var_1, None) + var_5 = paddle.nn.functional.common.linear( + var_4, self.parameter_2, self.parameter_0 + ) + var_6 = var_5[ + ( + slice(None, None, None), + slice(None, None, None), + None, + None, + ) + ] + var_7 = var_3 + var_6 + return var_7, var_6 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 1280, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 1280], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_20_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_20_st.py new file mode 100644 index 0000000000000..6a38346b16a3b --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_20_st.py @@ -0,0 +1,99 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
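+
+# NOTE: judging from the op sequence, this sub-graph is the residual tail of
+# a ResNet-style block: silu, dropout, conv, skip-connection add, then a
+# division by what is presumably an output scale factor (1.0 here).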
+ +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.dropout||api:paddle.nn.functional.conv.conv2d||method:__add__||method:__truediv__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[1280, 1280, 3, 3], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 1280, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [1, 1280, 1, 1], dtype: paddle.float32, stop_gradient: False) + ): + var_2 = paddle.nn.functional.activation.silu(var_0, None) + var_3 = paddle.nn.functional.common.dropout( + var_2, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_4 = paddle.nn.functional.conv.conv2d( + var_3, self.parameter_0, self.parameter_1, [1, 1], 1, [1, 1], 1 + ) + var_5 = var_1 + var_4 + var_6 = var_5 / 1.0 + return var_6 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 1280, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 1280, 1, 1], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_21_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_21_st.py new file mode 100644 index 0000000000000..4a038baaf1c14 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_21_st.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
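+
+# NOTE: same pattern as the _19 sub-graph above, except the conv reduces
+# 2560 channels to 1280, which suggests (but does not confirm) a decoder
+# block consuming concatenated skip-connection features.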
+ +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.conv.conv2d||api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.linear||method:__getitem__||method:__add__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[1280, 2560, 3, 3], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[1280, 1280], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 2560, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [1, 1280], dtype: paddle.float32, stop_gradient: False) + ): + var_2 = paddle.nn.functional.activation.silu(var_0, None) + var_3 = paddle.nn.functional.conv.conv2d( + var_2, self.parameter_2, self.parameter_0, [1, 1], 1, [1, 1], 1 + ) + var_4 = paddle.nn.functional.activation.silu(var_1, None) + var_5 = paddle.nn.functional.common.linear( + var_4, self.parameter_3, self.parameter_1 + ) + var_6 = var_5[ + ( + slice(None, None, None), + slice(None, None, None), + None, + None, + ) + ] + var_7 = var_3 + var_6 + return var_7, var_6 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 2560, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 1280], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() From 9d2d05d4acd35909a20464726f8a5dc01f129c40 Mon Sep 17 00:00:00 2001 From: yulangz <53958801+yulangz@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:13:16 +0800 Subject: [PATCH 080/114] Add sub graph of stable diffusion-3 (#62511) --- .../test_sub_graph_stable_diffusion_13_st.py | 299 ++++++++++++++++++ .../test_sub_graph_stable_diffusion_14_st.py | 110 +++++++ .../test_sub_graph_stable_diffusion_15_st.py | 99 ++++++ .../test_sub_graph_stable_diffusion_16_st.py | 110 +++++++ .../test_sub_graph_stable_diffusion_17_st.py | 79 +++++ 5 files changed, 697 insertions(+) create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_13_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_14_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_15_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_16_st.py create mode 100644 
test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_17_st.py diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_13_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_13_st.py new file mode 100644 index 0000000000000..192976b0541ad --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_13_st.py @@ -0,0 +1,299 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.conv.conv2d||method:transpose||method:flatten||api:paddle.nn.functional.norm.layer_norm||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||method:reshape||method:transpose||method:reshape||method:transpose||method:reshape||method:transpose||api:paddle.tensor.linalg.matmul||method:__mul__||api:paddle.nn.functional.activation.softmax||api:paddle.tensor.linalg.matmul||method:transpose||method:reshape||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.dropout||method:__truediv__||method:__add__||api:paddle.nn.functional.norm.layer_norm||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||method:reshape||method:transpose||method:reshape||method:transpose||method:reshape||method:transpose||api:paddle.tensor.linalg.matmul||method:__mul__||api:paddle.nn.functional.activation.softmax||api:paddle.tensor.linalg.matmul||method:transpose||method:reshape||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.dropout||method:__truediv__||method:__add__||api:paddle.nn.functional.norm.layer_norm||api:paddle.nn.functional.common.linear||method:chunk||api:paddle.nn.functional.activation.gelu||method:__mul__||api:paddle.nn.functional.common.dropout||api:paddle.nn.functional.common.linear||method:__add__||method:reshape||method:transpose||api:paddle.nn.functional.conv.conv2d||method:__add__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[640, 640], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[640, 640], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_4 = self.create_parameter( + shape=[640, 5120], + dtype=paddle.float32, + ) + self.parameter_5 = self.create_parameter( + shape=[640, 640, 1, 1], + dtype=paddle.float32, + ) + self.parameter_6 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_7 = self.create_parameter( + shape=[640, 640], + dtype=paddle.float32, + ) + self.parameter_8 = self.create_parameter( + shape=[640, 640, 1, 1], + dtype=paddle.float32, + ) + self.parameter_9 = 
self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_10 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_11 = self.create_parameter( + shape=[640, 640], + dtype=paddle.float32, + ) + self.parameter_12 = self.create_parameter( + shape=[640, 640], + dtype=paddle.float32, + ) + self.parameter_13 = self.create_parameter( + shape=[5120], + dtype=paddle.float32, + ) + self.parameter_14 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_15 = self.create_parameter( + shape=[2560, 640], + dtype=paddle.float32, + ) + self.parameter_16 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_17 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_18 = self.create_parameter( + shape=[640, 640], + dtype=paddle.float32, + ) + self.parameter_19 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_20 = self.create_parameter( + shape=[768, 640], + dtype=paddle.float32, + ) + self.parameter_21 = self.create_parameter( + shape=[768, 640], + dtype=paddle.float32, + ) + self.parameter_22 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_23 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 640, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [], dtype: paddle.int32, stop_gradient: True) + var_2, # (shape: [], dtype: paddle.int32, stop_gradient: True) + var_3, # (shape: [1, 640, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_4, # (shape: [1, 4, 768], dtype: paddle.float32, stop_gradient: True) + ): + var_5 = paddle.nn.functional.conv.conv2d( + var_0, self.parameter_8, self.parameter_1, [1, 1], 0, [1, 1], 1 + ) + var_6 = var_5.transpose([0, 2, 3, 1]) + var_7 = var_6.flatten(1, 2) + var_8 = paddle.nn.functional.norm.layer_norm( + var_7, + normalized_shape=[640], + weight=self.parameter_17, + bias=self.parameter_16, + epsilon=1e-05, + ) + var_9 = paddle.nn.functional.common.linear( + x=var_8, weight=self.parameter_12, bias=None, name=None + ) + var_10 = paddle.nn.functional.common.linear( + x=var_8, weight=self.parameter_11, bias=None, name=None + ) + var_11 = paddle.nn.functional.common.linear( + x=var_8, weight=self.parameter_2, bias=None, name=None + ) + var_12 = var_9.reshape([0, 0, 8, 80]) + var_13 = var_12.transpose([0, 2, 1, 3]) + var_14 = var_10.reshape([0, 0, 8, 80]) + var_15 = var_14.transpose([0, 2, 1, 3]) + var_16 = var_11.reshape([0, 0, 8, 80]) + var_17 = var_16.transpose([0, 2, 1, 3]) + var_18 = paddle.tensor.linalg.matmul(var_13, var_15, transpose_y=True) + var_19 = var_18 * 0.11180339887498948 + var_20 = paddle.nn.functional.activation.softmax(var_19, axis=-1) + var_21 = paddle.tensor.linalg.matmul(var_20, var_17) + var_22 = var_21.transpose([0, 2, 1, 3]) + var_23 = var_22.reshape([0, 0, 640]) + var_24 = paddle.nn.functional.common.linear( + x=var_23, weight=self.parameter_7, bias=self.parameter_10, name=None + ) + var_25 = paddle.nn.functional.common.dropout( + var_24, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_26 = var_25 / 1.0 + var_27 = var_26 + var_7 + var_28 = paddle.nn.functional.norm.layer_norm( + var_27, + normalized_shape=[640], + weight=self.parameter_9, + bias=self.parameter_3, + epsilon=1e-05, + ) + var_29 = paddle.nn.functional.common.linear( + x=var_28, weight=self.parameter_0, bias=None, name=None + ) + var_30 = 
paddle.nn.functional.common.linear( + x=var_4, weight=self.parameter_20, bias=None, name=None + ) + var_31 = paddle.nn.functional.common.linear( + x=var_4, weight=self.parameter_21, bias=None, name=None + ) + var_32 = var_29.reshape([0, 0, 8, 80]) + var_33 = var_32.transpose([0, 2, 1, 3]) + var_34 = var_30.reshape([0, 0, 8, 80]) + var_35 = var_34.transpose([0, 2, 1, 3]) + var_36 = var_31.reshape([0, 0, 8, 80]) + var_37 = var_36.transpose([0, 2, 1, 3]) + var_38 = paddle.tensor.linalg.matmul(var_33, var_35, transpose_y=True) + var_39 = var_38 * 0.11180339887498948 + var_40 = paddle.nn.functional.activation.softmax(var_39, axis=-1) + var_41 = paddle.tensor.linalg.matmul(var_40, var_37) + var_42 = var_41.transpose([0, 2, 1, 3]) + var_43 = var_42.reshape([0, 0, 640]) + var_44 = paddle.nn.functional.common.linear( + x=var_43, weight=self.parameter_18, bias=self.parameter_6, name=None + ) + var_45 = paddle.nn.functional.common.dropout( + var_44, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_46 = var_45 / 1.0 + var_47 = var_46 + var_27 + var_48 = paddle.nn.functional.norm.layer_norm( + var_47, + normalized_shape=[640], + weight=self.parameter_19, + bias=self.parameter_23, + epsilon=1e-05, + ) + var_49 = paddle.nn.functional.common.linear( + var_48, self.parameter_4, self.parameter_13 + ) + out = var_49.chunk(2, axis=-1) + var_50 = out[0] + var_51 = out[1] + var_52 = paddle.nn.functional.activation.gelu(var_51) + var_53 = var_50 * var_52 + var_54 = paddle.nn.functional.common.dropout( + var_53, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_55 = paddle.nn.functional.common.linear( + var_54, self.parameter_15, self.parameter_14 + ) + var_56 = var_55 + var_47 + var_57 = var_56.reshape([-1, var_1, var_2, 640]) + var_58 = var_57.transpose([0, 3, 1, 2]) + var_59 = paddle.nn.functional.conv.conv2d( + var_58, self.parameter_5, self.parameter_22, [1, 1], 0, [1, 1], 1 + ) + var_60 = var_59 + var_3 + return var_60 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 640, 1, 1], dtype=paddle.float32), + paddle.randint(low=1, high=2, shape=[1], dtype=paddle.int32), + paddle.randint(low=1, high=2, shape=[1], dtype=paddle.int32), + paddle.rand(shape=[1, 640, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 4, 768], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=False, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_14_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_14_st.py new file mode 100644 index 0000000000000..bd55b28623939 --- /dev/null 
+++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_14_st.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.conv.conv2d||api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.linear||method:__getitem__||method:__add__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[640, 640, 3, 3], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[1280, 640], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 640, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [1, 1280], dtype: paddle.float32, stop_gradient: False) + ): + var_2 = paddle.nn.functional.activation.silu(var_0, None) + var_3 = paddle.nn.functional.conv.conv2d( + var_2, self.parameter_0, self.parameter_1, [1, 1], 1, [1, 1], 1 + ) + var_4 = paddle.nn.functional.activation.silu(var_1, None) + var_5 = paddle.nn.functional.common.linear( + var_4, self.parameter_2, self.parameter_3 + ) + var_6 = var_5[ + ( + slice(None, None, None), + slice(None, None, None), + None, + None, + ) + ] + var_7 = var_3 + var_6 + return var_7, var_6 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 640, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 1280], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_15_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_15_st.py new file mode 100644 index 0000000000000..a78f2ea9ee538 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_15_st.py @@ 
-0,0 +1,99 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.dropout||api:paddle.nn.functional.conv.conv2d||method:__add__||method:__truediv__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[640, 640, 3, 3], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 640, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [1, 640, 1, 1], dtype: paddle.float32, stop_gradient: False) + ): + var_2 = paddle.nn.functional.activation.silu(var_0, None) + var_3 = paddle.nn.functional.common.dropout( + var_2, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_4 = paddle.nn.functional.conv.conv2d( + var_3, self.parameter_0, self.parameter_1, [1, 1], 1, [1, 1], 1 + ) + var_5 = var_1 + var_4 + var_6 = var_5 / 1.0 + return var_6 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 640, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 640, 1, 1], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=True + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_16_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_16_st.py new file mode 100644 index 0000000000000..054418b3f8d01 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_16_st.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.dropout||api:paddle.nn.functional.conv.conv2d||api:paddle.nn.functional.conv.conv2d||method:__add__||method:__truediv__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[1280, 1280, 3, 3], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[1280, 640, 1, 1], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 1280, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [1, 640, 1, 1], dtype: paddle.float32, stop_gradient: False) + ): + var_2 = paddle.nn.functional.activation.silu(var_0, None) + var_3 = paddle.nn.functional.common.dropout( + var_2, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_4 = paddle.nn.functional.conv.conv2d( + var_3, self.parameter_2, self.parameter_0, [1, 1], 1, [1, 1], 1 + ) + var_5 = paddle.nn.functional.conv.conv2d( + var_1, self.parameter_3, self.parameter_1, [1, 1], 0, [1, 1], 1 + ) + var_6 = var_5 + var_4 + var_7 = var_6 / 1.0 + return var_7 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 1280, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 640, 1, 1], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_17_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_17_st.py new file mode 100644 index 0000000000000..8b1f87d654e62 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_17_st.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# method:cast||api:paddle.tensor.attribute.shape||method:__getitem__||method:__getitem__||method:__getitem__||method:__getitem__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward( + self, + var_0, # (shape: [1, 1280, 1, 1], dtype: paddle.float32, stop_gradient: False) + ): + var_1 = var_0.cast('float32') + var_2 = paddle.tensor.attribute.shape(var_1) + var_3 = var_2[0] + var_4 = var_2[1] + var_5 = var_2[2] + var_6 = var_2[3] + return var_1, var_5, var_6 + + +def create_paddle_inputs(): + inputs = (paddle.rand(shape=[1, 1280, 1, 1], dtype=paddle.float32),) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() From 008d0ac49c7d1bd84e43d09aadf2e0306656b414 Mon Sep 17 00:00:00 2001 From: yulangz <53958801+yulangz@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:13:47 +0800 Subject: [PATCH 081/114] Add sub graph of stable diffusion-2 (#62512) --- .../test_sub_graph_stable_diffusion_10_st.py | 302 ++++++++++++++++++ .../test_sub_graph_stable_diffusion_11_st.py | 110 +++++++ .../test_sub_graph_stable_diffusion_12_st.py | 79 +++++ .../test_sub_graph_stable_diffusion_8_st.py | 99 ++++++ .../test_sub_graph_stable_diffusion_9_st.py | 79 +++++ 5 files changed, 669 insertions(+) create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_10_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_11_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_12_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_8_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_9_st.py diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_10_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_10_st.py new file mode 100644 index 0000000000000..1a46bae4fba36 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_10_st.py @@ -0,0 +1,302 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.conv.conv2d||method:transpose||method:flatten||api:paddle.nn.functional.norm.layer_norm||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||method:reshape||method:transpose||method:reshape||method:transpose||method:reshape||method:transpose||api:paddle.tensor.linalg.matmul||method:__mul__||api:paddle.nn.functional.activation.softmax||api:paddle.tensor.linalg.matmul||method:transpose||method:reshape||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.dropout||method:__truediv__||method:__add__||api:paddle.nn.functional.norm.layer_norm||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||method:reshape||method:transpose||method:reshape||method:transpose||method:reshape||method:transpose||api:paddle.tensor.linalg.matmul||method:__mul__||api:paddle.nn.functional.activation.softmax||api:paddle.tensor.linalg.matmul||method:transpose||method:reshape||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.dropout||method:__truediv__||method:__add__||api:paddle.nn.functional.norm.layer_norm||api:paddle.nn.functional.common.linear||method:chunk||api:paddle.nn.functional.activation.gelu||method:__mul__||api:paddle.nn.functional.common.dropout||api:paddle.nn.functional.common.linear||method:__add__||method:reshape||method:transpose||api:paddle.nn.functional.conv.conv2d||method:__add__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[320, 320], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[320, 320], + dtype=paddle.float32, + ) + self.parameter_4 = self.create_parameter( + shape=[768, 320], + dtype=paddle.float32, + ) + self.parameter_5 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_6 = self.create_parameter( + shape=[2560], + dtype=paddle.float32, + ) + self.parameter_7 = self.create_parameter( + shape=[320, 320], + dtype=paddle.float32, + ) + self.parameter_8 = self.create_parameter( + shape=[320, 2560], + dtype=paddle.float32, + ) + self.parameter_9 = self.create_parameter( + shape=[320, 320, 1, 1], + dtype=paddle.float32, + ) + self.parameter_10 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_11 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_12 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_13 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_14 = self.create_parameter( + 
shape=[320], + dtype=paddle.float32, + ) + self.parameter_15 = self.create_parameter( + shape=[1280, 320], + dtype=paddle.float32, + ) + self.parameter_16 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_17 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_18 = self.create_parameter( + shape=[768, 320], + dtype=paddle.float32, + ) + self.parameter_19 = self.create_parameter( + shape=[320, 320], + dtype=paddle.float32, + ) + self.parameter_20 = self.create_parameter( + shape=[320, 320], + dtype=paddle.float32, + ) + self.parameter_21 = self.create_parameter( + shape=[320, 320, 1, 1], + dtype=paddle.float32, + ) + self.parameter_22 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_23 = self.create_parameter( + shape=[320, 320], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 320, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [], dtype: paddle.int32, stop_gradient: True) + var_2, # (shape: [], dtype: paddle.int32, stop_gradient: True) + var_3, # (shape: [1, 320, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_4, # (shape: [1, 4, 768], dtype: paddle.float32, stop_gradient: True) + ): + var_5 = paddle.nn.functional.conv.conv2d( + var_0, self.parameter_21, self.parameter_17, [1, 1], 0, [1, 1], 1 + ) + var_6 = var_5.transpose([0, 2, 3, 1]) + var_7 = var_6.flatten(1, 2) + var_8 = paddle.nn.functional.norm.layer_norm( + var_7, + normalized_shape=[320], + weight=self.parameter_5, + bias=self.parameter_10, + epsilon=1e-05, + ) + var_9 = paddle.nn.functional.common.linear( + x=var_8, weight=self.parameter_7, bias=None, name=None + ) + var_10 = paddle.nn.functional.common.linear( + x=var_8, weight=self.parameter_3, bias=None, name=None + ) + var_11 = paddle.nn.functional.common.linear( + x=var_8, weight=self.parameter_19, bias=None, name=None + ) + var_12 = var_9.reshape([0, 0, 8, 40]) + var_13 = var_12.transpose([0, 2, 1, 3]) + var_14 = var_10.reshape([0, 0, 8, 40]) + var_15 = var_14.transpose([0, 2, 1, 3]) + var_16 = var_11.reshape([0, 0, 8, 40]) + var_17 = var_16.transpose([0, 2, 1, 3]) + var_18 = paddle.tensor.linalg.matmul(var_13, var_15, transpose_y=True) + var_19 = var_18 * 0.15811388300841897 + var_20 = paddle.nn.functional.activation.softmax(var_19, axis=-1) + var_21 = paddle.tensor.linalg.matmul(var_20, var_17) + var_22 = var_21.transpose([0, 2, 1, 3]) + var_23 = var_22.reshape([0, 0, 320]) + var_24 = paddle.nn.functional.common.linear( + x=var_23, + weight=self.parameter_20, + bias=self.parameter_14, + name=None, + ) + var_25 = paddle.nn.functional.common.dropout( + var_24, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_26 = var_25 / 1.0 + var_27 = var_26 + var_7 + var_28 = paddle.nn.functional.norm.layer_norm( + var_27, + normalized_shape=[320], + weight=self.parameter_22, + bias=self.parameter_13, + epsilon=1e-05, + ) + var_29 = paddle.nn.functional.common.linear( + x=var_28, weight=self.parameter_23, bias=None, name=None + ) + var_30 = paddle.nn.functional.common.linear( + x=var_4, weight=self.parameter_4, bias=None, name=None + ) + var_31 = paddle.nn.functional.common.linear( + x=var_4, weight=self.parameter_18, bias=None, name=None + ) + var_32 = var_29.reshape([0, 0, 8, 40]) + var_33 = var_32.transpose([0, 2, 1, 3]) + var_34 = var_30.reshape([0, 0, 8, 40]) + var_35 = var_34.transpose([0, 2, 1, 3]) + var_36 = var_31.reshape([0, 0, 8, 40]) + var_37 = var_36.transpose([0, 2, 1, 3]) 
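+        # Editorial comment (inferred from the op pattern): the block below is
+        # cross-attention over the text embedding var_4. var_33 holds the
+        # queries, var_35/var_37 the keys/values, split into 8 heads of size
+        # 40; the scale 0.15811388300841897 is 1/sqrt(40).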
+ var_38 = paddle.tensor.linalg.matmul(var_33, var_35, transpose_y=True) + var_39 = var_38 * 0.15811388300841897 + var_40 = paddle.nn.functional.activation.softmax(var_39, axis=-1) + var_41 = paddle.tensor.linalg.matmul(var_40, var_37) + var_42 = var_41.transpose([0, 2, 1, 3]) + var_43 = var_42.reshape([0, 0, 320]) + var_44 = paddle.nn.functional.common.linear( + x=var_43, weight=self.parameter_2, bias=self.parameter_0, name=None + ) + var_45 = paddle.nn.functional.common.dropout( + var_44, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_46 = var_45 / 1.0 + var_47 = var_46 + var_27 + var_48 = paddle.nn.functional.norm.layer_norm( + var_47, + normalized_shape=[320], + weight=self.parameter_12, + bias=self.parameter_16, + epsilon=1e-05, + ) + var_49 = paddle.nn.functional.common.linear( + var_48, self.parameter_8, self.parameter_6 + ) + out = var_49.chunk(2, axis=-1) + var_50 = out[0] + var_51 = out[1] + var_52 = paddle.nn.functional.activation.gelu(var_51) + var_53 = var_50 * var_52 + var_54 = paddle.nn.functional.common.dropout( + var_53, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_55 = paddle.nn.functional.common.linear( + var_54, self.parameter_15, self.parameter_1 + ) + var_56 = var_55 + var_47 + var_57 = var_56.reshape([-1, var_1, var_2, 320]) + var_58 = var_57.transpose([0, 3, 1, 2]) + var_59 = paddle.nn.functional.conv.conv2d( + var_58, self.parameter_9, self.parameter_11, [1, 1], 0, [1, 1], 1 + ) + var_60 = var_59 + var_3 + return var_60 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 320, 1, 1], dtype=paddle.float32), + paddle.randint(low=1, high=2, shape=[1], dtype=paddle.int32), + paddle.randint(low=1, high=2, shape=[1], dtype=paddle.int32), + paddle.rand(shape=[1, 320, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 4, 768], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=False, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_11_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_11_st.py new file mode 100644 index 0000000000000..88af233ed678a --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_11_st.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.dropout||api:paddle.nn.functional.conv.conv2d||api:paddle.nn.functional.conv.conv2d||method:__add__||method:__truediv__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[640, 320, 1, 1], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[640, 640, 3, 3], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[640], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 640, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [1, 320, 1, 1], dtype: paddle.float32, stop_gradient: False) + ): + var_2 = paddle.nn.functional.activation.silu(var_0, None) + var_3 = paddle.nn.functional.common.dropout( + var_2, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_4 = paddle.nn.functional.conv.conv2d( + var_3, self.parameter_2, self.parameter_0, [1, 1], 1, [1, 1], 1 + ) + var_5 = paddle.nn.functional.conv.conv2d( + var_1, self.parameter_1, self.parameter_3, [1, 1], 0, [1, 1], 1 + ) + var_6 = var_5 + var_4 + var_7 = var_6 / 1.0 + return var_7 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 640, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 320, 1, 1], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_12_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_12_st.py new file mode 100644 index 0000000000000..c00bc83ec80af --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_12_st.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# method:cast||api:paddle.tensor.attribute.shape||method:__getitem__||method:__getitem__||method:__getitem__||method:__getitem__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward( + self, + var_0, # (shape: [1, 640, 1, 1], dtype: paddle.float32, stop_gradient: False) + ): + var_1 = var_0.cast('float32') + var_2 = paddle.tensor.attribute.shape(var_1) + var_3 = var_2[0] + var_4 = var_2[1] + var_5 = var_2[2] + var_6 = var_2[3] + return var_1, var_5, var_6 + + +def create_paddle_inputs(): + inputs = (paddle.rand(shape=[1, 640, 1, 1], dtype=paddle.float32),) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_8_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_8_st.py new file mode 100644 index 0000000000000..5cef564d61a46 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_8_st.py @@ -0,0 +1,99 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
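+# Editorial comment (inferred): the test below exercises a ResNet-style
+# epilogue (SiLU -> dropout with p=0.0 -> 3x3 conv -> residual add -> divide
+# by 1.0); the no-op dropout and division are kept exactly as captured from
+# the source graph.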
+ +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.dropout||api:paddle.nn.functional.conv.conv2d||method:__add__||method:__truediv__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[320], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[320, 320, 3, 3], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 320, 1, 1], dtype: paddle.float32, stop_gradient: False) + var_1, # (shape: [1, 320, 1, 1], dtype: paddle.float32, stop_gradient: False) + ): + var_2 = paddle.nn.functional.activation.silu(var_0, None) + var_3 = paddle.nn.functional.common.dropout( + var_2, + p=0.0, + axis=None, + training=True, + mode='upscale_in_train', + name=None, + ) + var_4 = paddle.nn.functional.conv.conv2d( + var_3, self.parameter_1, self.parameter_0, [1, 1], 1, [1, 1], 1 + ) + var_5 = var_1 + var_4 + var_6 = var_5 / 1.0 + return var_6 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 320, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 320, 1, 1], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=True + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_9_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_9_st.py new file mode 100644 index 0000000000000..a03d352478fe1 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_9_st.py @@ -0,0 +1,79 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
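+# Editorial comment: the test below covers symbolic shape extraction. The
+# sub-graph casts the input to float32, reads its runtime shape, indexes out
+# all four dims, and returns only the tensor and its spatial dims (var_5,
+# var_6); var_3/var_4 are left unused, matching the captured trace.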
+ +# repo: diffusers_sub_grpah +# model: stable_diffusion +# method:cast||api:paddle.tensor.attribute.shape||method:__getitem__||method:__getitem__||method:__getitem__||method:__getitem__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward( + self, + var_0, # (shape: [1, 320, 1, 1], dtype: paddle.float32, stop_gradient: False) + ): + var_1 = var_0.cast('float32') + var_2 = paddle.tensor.attribute.shape(var_1) + var_3 = var_2[0] + var_4 = var_2[1] + var_5 = var_2[2] + var_6 = var_2[3] + return var_1, var_5, var_6 + + +def create_paddle_inputs(): + inputs = (paddle.rand(shape=[1, 320, 1, 1], dtype=paddle.float32),) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() From 1e3e19f6de94edf461fe7d6a31d8d2825fc55d96 Mon Sep 17 00:00:00 2001 From: yulangz <53958801+yulangz@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:14:31 +0800 Subject: [PATCH 082/114] Add sub graph of stable diffusion-1 (#62513) --- .../test_sub_graph_stable_diffusion_0_st.py | 110 +++++++++++++ .../test_sub_graph_stable_diffusion_1_st.py | 110 +++++++++++++ .../test_sub_graph_stable_diffusion_2_st.py | 148 ++++++++++++++++++ .../test_sub_graph_stable_diffusion_3_st.py | 80 ++++++++++ .../test_sub_graph_stable_diffusion_4_st.py | 102 ++++++++++++ .../test_sub_graph_stable_diffusion_5_st.py | 108 +++++++++++++ .../test_sub_graph_stable_diffusion_6_st.py | 96 ++++++++++++ .../test_sub_graph_stable_diffusion_7_st.py | 110 +++++++++++++ 8 files changed, 864 insertions(+) create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_0_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_1_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_2_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_3_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_4_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_5_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_6_st.py create mode 100644 test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_7_st.py diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_0_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_0_st.py new file mode 100644 index 0000000000000..0ab3a26743218 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_0_st.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.dropout||api:paddle.nn.functional.conv.conv2d||api:paddle.nn.functional.conv.conv2d||method:__add__||method:__truediv__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[256, 128, 1, 1], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[256], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[256], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[256, 256, 3, 3], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 256, 4, 4], dtype: paddle.float32, stop_gradient: True) + var_1, # (shape: [1, 128, 4, 4], dtype: paddle.float32, stop_gradient: True) + ): + var_2 = paddle.nn.functional.activation.silu(var_0, None) + var_3 = paddle.nn.functional.common.dropout( + var_2, + p=0.0, + axis=None, + training=False, + mode='upscale_in_train', + name=None, + ) + var_4 = paddle.nn.functional.conv.conv2d( + var_3, self.parameter_3, self.parameter_2, [1, 1], 1, [1, 1], 1 + ) + var_5 = paddle.nn.functional.conv.conv2d( + var_1, self.parameter_0, self.parameter_1, [1, 1], 0, [1, 1], 1 + ) + var_6 = var_5 + var_4 + var_7 = var_6 / 1.0 + return var_7 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 256, 4, 4], dtype=paddle.float32), + paddle.rand(shape=[1, 128, 4, 4], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_1_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_1_st.py new file mode 100644 index 0000000000000..d953b6ccd0669 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_1_st.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.dropout||api:paddle.nn.functional.conv.conv2d||api:paddle.nn.functional.conv.conv2d||method:__add__||method:__truediv__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[512], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[512, 512, 3, 3], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[512, 256, 1, 1], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[512], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 512, 2, 2], dtype: paddle.float32, stop_gradient: True) + var_1, # (shape: [1, 256, 2, 2], dtype: paddle.float32, stop_gradient: True) + ): + var_2 = paddle.nn.functional.activation.silu(var_0, None) + var_3 = paddle.nn.functional.common.dropout( + var_2, + p=0.0, + axis=None, + training=False, + mode='upscale_in_train', + name=None, + ) + var_4 = paddle.nn.functional.conv.conv2d( + var_3, self.parameter_1, self.parameter_3, [1, 1], 1, [1, 1], 1 + ) + var_5 = paddle.nn.functional.conv.conv2d( + var_1, self.parameter_2, self.parameter_0, [1, 1], 0, [1, 1], 1 + ) + var_6 = var_5 + var_4 + var_7 = var_6 / 1.0 + return var_7 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 512, 2, 2], dtype=paddle.float32), + paddle.rand(shape=[1, 256, 2, 2], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_2_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_2_st.py new file mode 100644 index 0000000000000..16363441da9c3 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_2_st.py @@ -0,0 +1,148 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# method:transpose||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.linear||method:reshape||method:transpose||method:reshape||method:transpose||method:reshape||method:transpose||api:paddle.tensor.linalg.matmul||method:__mul__||method:cast||api:paddle.nn.functional.activation.softmax||method:cast||api:paddle.tensor.linalg.matmul||method:transpose||method:reshape||api:paddle.nn.functional.common.linear||api:paddle.nn.functional.common.dropout||method:transpose||method:reshape||method:__add__||method:__truediv__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[512, 512], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[512], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[512, 512], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[512, 512], + dtype=paddle.float32, + ) + self.parameter_4 = self.create_parameter( + shape=[512], + dtype=paddle.float32, + ) + self.parameter_5 = self.create_parameter( + shape=[512, 512], + dtype=paddle.float32, + ) + self.parameter_6 = self.create_parameter( + shape=[512], + dtype=paddle.float32, + ) + self.parameter_7 = self.create_parameter( + shape=[512], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 512, 1], dtype: paddle.float32, stop_gradient: True) + var_1, # (shape: [1, 512, 1, 1], dtype: paddle.float32, stop_gradient: True) + ): + var_2 = var_0.transpose([0, 2, 1]) + var_3 = paddle.nn.functional.common.linear( + x=var_2, weight=self.parameter_0, bias=self.parameter_6, name=None + ) + var_4 = paddle.nn.functional.common.linear( + x=var_2, weight=self.parameter_2, bias=self.parameter_1, name=None + ) + var_5 = paddle.nn.functional.common.linear( + x=var_2, weight=self.parameter_5, bias=self.parameter_4, name=None + ) + var_6 = var_3.reshape([0, 0, 1, 512]) + var_7 = var_6.transpose([0, 2, 1, 3]) + var_8 = var_4.reshape([0, 0, 1, 512]) + var_9 = var_8.transpose([0, 2, 1, 3]) + var_10 = var_5.reshape([0, 0, 1, 512]) + var_11 = var_10.transpose([0, 2, 1, 3]) + var_12 = paddle.tensor.linalg.matmul(var_7, var_9, transpose_y=True) + var_13 = var_12 * 0.04419417382415922 + var_14 = var_13.cast('float32') + var_15 = paddle.nn.functional.activation.softmax(var_14, axis=-1) + var_16 = var_15.cast('float32') + var_17 = paddle.tensor.linalg.matmul(var_16, var_11) + var_18 = var_17.transpose([0, 2, 1, 3]) + var_19 = var_18.reshape([0, 0, 512]) + var_20 = paddle.nn.functional.common.linear( + x=var_19, weight=self.parameter_3, bias=self.parameter_7, name=None + ) + var_21 = paddle.nn.functional.common.dropout( + var_20, + p=0.0, + axis=None, + training=False, + mode='upscale_in_train', + name=None, + ) + var_22 = var_21.transpose([0, 2, 1]) + 
var_23 = var_22.reshape([1, 512, 1, 1]) + var_24 = var_23 + var_1 + var_25 = var_24 / 1 + return var_25 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 512, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 512, 1, 1], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_3_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_3_st.py new file mode 100644 index 0000000000000..4c292c0741358 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_3_st.py @@ -0,0 +1,80 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
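+# Editorial comment (inferred from the op pattern): the sub-graph below
+# matches a VAE-style posterior computation. It chunks the 8 channels into
+# mean and logvar, clamps logvar to [-30, 20], then computes
+# std = exp(0.5 * logvar) and var = exp(logvar).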
+ +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.tensor.manipulation.chunk||api:paddle.tensor.math.clip||method:__rmul__||api:paddle.tensor.ops.exp||api:paddle.tensor.ops.exp +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward( + self, + var_0, # (shape: [1, 8, 1, 1], dtype: paddle.float32, stop_gradient: True) + ): + out = paddle.tensor.manipulation.chunk(var_0, 2, axis=1) + var_1 = out[0] + var_2 = out[1] + var_3 = paddle.tensor.math.clip(var_2, -30.0, 20.0) + var_4 = 0.5 * var_3 + var_5 = paddle.tensor.ops.exp(var_4) + var_6 = paddle.tensor.ops.exp(var_3) + return var_1, var_2, var_3, var_5, var_6 + + +def create_paddle_inputs(): + inputs = (paddle.rand(shape=[1, 8, 1, 1], dtype=paddle.float32),) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=True + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_4_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_4_st.py new file mode 100644 index 0000000000000..034833070e33f --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_4_st.py @@ -0,0 +1,102 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
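+# Editorial comment (inferred): the sub-graph below follows a DDPM-style
+# forward-noising step. Latents are scaled by 0.18215, a timestep is sampled
+# with randint, and the noisy sample is a_t**0.5 * x0 + (1 - a_t)**0.5 * noise,
+# where a_t comes from indexing var_2 (playing the role of alphas_cumprod)
+# at the sampled timestep.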
+ +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.randn||method:__mul__||method:__add__||method:__mul__||api:paddle.randn||api:paddle.randint||method:cast||method:__getitem__||method:__pow__||method:flatten||method:unsqueeze||method:unsqueeze||method:unsqueeze||method:__getitem__||method:__rsub__||method:__pow__||method:flatten||method:unsqueeze||method:unsqueeze||method:unsqueeze||method:__mul__||method:__mul__||method:__add__ +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward( + self, + var_0, # (shape: [1, 4, 1, 1], dtype: paddle.float32, stop_gradient: True) + var_1, # (shape: [1, 4, 1, 1], dtype: paddle.float32, stop_gradient: True) + var_2, # (shape: [1000], dtype: paddle.float32, stop_gradient: True) + ): + var_3 = paddle.randn([1, 4, 1, 1], dtype='float32') + var_4 = var_1 * var_3 + var_5 = var_0 + var_4 + var_6 = var_5 * 0.18215 + var_7 = paddle.randn([1, 4, 1, 1]) + var_8 = paddle.randint(0, 1000, (1,)) + var_9 = var_8.cast('int64') + var_10 = var_2[var_9] + var_11 = var_10**0.5 + var_12 = var_11.flatten() + var_13 = var_12.unsqueeze(-1) + var_14 = var_13.unsqueeze(-1) + var_15 = var_14.unsqueeze(-1) + var_16 = var_2[var_9] + var_17 = 1 - var_16 + var_18 = var_17**0.5 + var_19 = var_18.flatten() + var_20 = var_19.unsqueeze(-1) + var_21 = var_20.unsqueeze(-1) + var_22 = var_21.unsqueeze(-1) + var_23 = var_15 * var_6 + var_24 = var_22 * var_7 + var_25 = var_23 + var_24 + return var_25, var_9, var_6, var_7 + + +def create_paddle_inputs(): + inputs = ( + paddle.rand(shape=[1, 4, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1, 4, 1, 1], dtype=paddle.float32), + paddle.rand(shape=[1000], dtype=paddle.float32), + ) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_5_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_5_st.py new file mode 100644 index 0000000000000..183a39d8dc9ed --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_5_st.py @@ -0,0 +1,108 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.tensor.creation.arange||method:__rmul__||method:__truediv__||api:paddle.tensor.ops.exp||method:__getitem__||method:cast||method:__getitem__||method:__mul__||method:__rmul__||api:paddle.tensor.ops.sin||api:paddle.tensor.ops.cos||api:paddle.tensor.manipulation.concat||method:__getitem__||method:__getitem__||api:paddle.tensor.manipulation.concat +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward( + self, + var_0, # (shape: [1], dtype: paddle.int64, stop_gradient: True) + ): + var_1 = paddle.tensor.creation.arange(start=0, end=160, dtype='float32') + var_2 = -9.210340371976184 * var_1 + var_3 = var_2 / 160 + var_4 = paddle.tensor.ops.exp(var_3) + var_5 = var_0[ + ( + slice(None, None, None), + None, + ) + ] + var_6 = var_5.cast('float32') + var_7 = var_4[ + ( + None, + slice(None, None, None), + ) + ] + var_8 = var_6 * var_7 + var_9 = 1 * var_8 + var_10 = paddle.tensor.ops.sin(var_9) + var_11 = paddle.tensor.ops.cos(var_9) + var_12 = paddle.tensor.manipulation.concat([var_10, var_11], axis=-1) + var_13 = var_12[ + ( + slice(None, None, None), + slice(160, None, None), + ) + ] + var_14 = var_12[ + ( + slice(None, None, None), + slice(None, 160, None), + ) + ] + var_15 = paddle.tensor.manipulation.concat([var_13, var_14], axis=-1) + return var_15 + + +def create_paddle_inputs(): + inputs = (paddle.randint(low=0, high=10, shape=[1], dtype=paddle.int64),) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_6_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_6_st.py new file mode 100644 index 0000000000000..825734b969840 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_6_st.py @@ -0,0 +1,96 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# repo: diffusers_sub_grpah +# model: stable_diffusion +# api:paddle.nn.functional.common.linear||api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.linear +import unittest + +import numpy as np + +import paddle + + +class LayerCase(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.parameter_0 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + self.parameter_1 = self.create_parameter( + shape=[1280, 1280], + dtype=paddle.float32, + ) + self.parameter_2 = self.create_parameter( + shape=[320, 1280], + dtype=paddle.float32, + ) + self.parameter_3 = self.create_parameter( + shape=[1280], + dtype=paddle.float32, + ) + + def forward( + self, + var_0, # (shape: [1, 320], dtype: paddle.float32, stop_gradient: True) + ): + var_1 = paddle.nn.functional.common.linear( + x=var_0, weight=self.parameter_2, bias=self.parameter_0, name=None + ) + var_2 = paddle.nn.functional.activation.silu(var_1, None) + var_3 = paddle.nn.functional.common.linear( + x=var_2, weight=self.parameter_1, bias=self.parameter_3, name=None + ) + return var_3 + + +def create_paddle_inputs(): + inputs = (paddle.rand(shape=[1, 320], dtype=paddle.float32),) + return inputs + + +class TestLayer(unittest.TestCase): + def setUp(self): + self.inputs = create_paddle_inputs() + self.net = LayerCase() + + def train(self, net, to_static, with_prim=False, with_cinn=False): + if to_static: + paddle.set_flags({'FLAGS_prim_all': with_prim}) + if with_cinn: + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = True + net = paddle.jit.to_static( + net, build_strategy=build_strategy, full_graph=True + ) + else: + net = paddle.jit.to_static(net, full_graph=True) + paddle.seed(123) + outs = net(*self.inputs) + return outs + + def test_ast_prim_cinn(self): + st_out = self.train(self.net, to_static=True) + cinn_out = self.train( + self.net, to_static=True, with_prim=True, with_cinn=False + ) + for st, cinn in zip( + paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) + ): + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_7_st.py b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_7_st.py new file mode 100644 index 0000000000000..fdff13f8f1b29 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_sub_graph_stable_diffusion_7_st.py @@ -0,0 +1,110 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
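+# Editorial comment (inferred): the sub-graph below injects a time embedding
+# into a ResNet block, computing conv(silu(var_0)) plus a linear projection
+# of silu(var_1) that is broadcast over the spatial dims through the
+# [:, :, None, None] slice indexing.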
+
+# repo: diffusers_sub_graph
+# model: stable_diffusion
+# api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.conv.conv2d||api:paddle.nn.functional.activation.silu||api:paddle.nn.functional.common.linear||method:__getitem__||method:__add__
+import unittest
+
+import numpy as np
+
+import paddle
+
+
+class LayerCase(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+        self.parameter_0 = self.create_parameter(
+            shape=[320, 320, 3, 3],
+            dtype=paddle.float32,
+        )
+        self.parameter_1 = self.create_parameter(
+            shape=[320],
+            dtype=paddle.float32,
+        )
+        self.parameter_2 = self.create_parameter(
+            shape=[320],
+            dtype=paddle.float32,
+        )
+        self.parameter_3 = self.create_parameter(
+            shape=[1280, 320],
+            dtype=paddle.float32,
+        )
+
+    def forward(
+        self,
+        var_0,  # (shape: [1, 320, 1, 1], dtype: paddle.float32, stop_gradient: False)
+        var_1,  # (shape: [1, 1280], dtype: paddle.float32, stop_gradient: False)
+    ):
+        var_2 = paddle.nn.functional.activation.silu(var_0, None)
+        var_3 = paddle.nn.functional.conv.conv2d(
+            var_2, self.parameter_0, self.parameter_2, [1, 1], 1, [1, 1], 1
+        )
+        var_4 = paddle.nn.functional.activation.silu(var_1, None)
+        var_5 = paddle.nn.functional.common.linear(
+            var_4, self.parameter_3, self.parameter_1
+        )
+        var_6 = var_5[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                None,
+                None,
+            )
+        ]
+        var_7 = var_3 + var_6
+        return var_7, var_6
+
+
+def create_paddle_inputs():
+    inputs = (
+        paddle.rand(shape=[1, 320, 1, 1], dtype=paddle.float32),
+        paddle.rand(shape=[1, 1280], dtype=paddle.float32),
+    )
+    return inputs
+
+
+class TestLayer(unittest.TestCase):
+    def setUp(self):
+        self.inputs = create_paddle_inputs()
+        self.net = LayerCase()
+
+    def train(self, net, to_static, with_prim=False, with_cinn=False):
+        if to_static:
+            paddle.set_flags({'FLAGS_prim_all': with_prim})
+            if with_cinn:
+                build_strategy = paddle.static.BuildStrategy()
+                build_strategy.build_cinn_pass = True
+                net = paddle.jit.to_static(
+                    net, build_strategy=build_strategy, full_graph=True
+                )
+            else:
+                net = paddle.jit.to_static(net, full_graph=True)
+        paddle.seed(123)
+        outs = net(*self.inputs)
+        return outs
+
+    def test_ast_prim_cinn(self):
+        st_out = self.train(self.net, to_static=True)
+        cinn_out = self.train(
+            self.net, to_static=True, with_prim=True, with_cinn=False
+        )
+        for st, cinn in zip(
+            paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out)
+        ):
+            np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8)
+
+
+if __name__ == '__main__':
+    unittest.main()

From c8cd35dbb7af8d2593e9ccd53018678441b9b94f Mon Sep 17 00:00:00 2001
From: 6clc
Date: Fri, 8 Mar 2024 16:17:26 +0800
Subject: [PATCH 083/114] cinn(dynamic): fix reshape op when accessing shape
 dialect across fusion op (#62503)

---
 .../transforms/cinn_group_cluster_pass.cc     |  4 +
 .../transforms/dynamic_reshape_pass.cc        |  3 +-
 .../hlir/framework/pir/op_lowering_impl.cc    |  2 +-
 paddle/cinn/hlir/framework/pir/utils.cc       |  5 +-
 paddle/cinn/hlir/op/elementwise.cc            | 79 +++++++++++++++++--
 paddle/pir/include/core/builtin_op.h          |  2 +
 6 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
index 542f73cb0811e..05268617ba149 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
@@ -353,6 +353,10 @@ ::pir::Operation* ReplaceWithGroupOp(
 bool CanFuse(const GroupClusterNode& first,
              const GroupClusterNode& second,
              ScheduleInfoNode* sch_node) {
+  if (!first.ops.empty() &&
+      (first.ops.front()->name() == "cinn_op.generate_shape")) {
+    return true;
+  }
   if ((second.ops.size() == 1) &&
       (second.ops.front()->name() == "cinn_op.reshape") &&
       (IsLastReshape(second.ops.front()))) {
diff --git a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc
index 834412f83364f..18aa1cf69003d 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/dynamic_reshape_pass.cc
@@ -120,7 +120,8 @@ class DynamicReshapeOpPass : public pir::PatternRewritePass {
   pir::RewritePatternSet InitializePatterns(pir::IrContext* context) override {
     pir::RewritePatternSet ps(context);
-    ps.Add<DynamicReshapeOpPattern>(context);
+    // Comment out the DynamicReshapeOpPattern to use pd_op.reshape in
+    // cinn.group ps.Add<DynamicReshapeOpPattern>(context);
     ps.Add<DynamicSqueezeOpPattern>(context);
     ps.Add<DynamicUnsqueezeOpPattern>(context);
     return ps;
diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
index 466733491cea7..db489a190ff1b 100644
--- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
+++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc
@@ -865,7 +865,7 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
           ir::_Var_::Make(symbol_name, cinn::common::Int(64)));
       group->int_args_map[non_tensor_arg_idx++] = {tensor_arg_idx,
                                                    tensor_arg_dim_idx};
-      VLOG(4) << "device kernel func's " << non_tensor_arg_idx << " is from "
+      VLOG(4) << "device kernel func's " << symbol_name << " is from "
               << tensor_arg_idx << ".shape(" << tensor_arg_dim_idx << ")";
     }
   }
diff --git a/paddle/cinn/hlir/framework/pir/utils.cc b/paddle/cinn/hlir/framework/pir/utils.cc
index 741c81d46463f..78b79f47d803e 100644
--- a/paddle/cinn/hlir/framework/pir/utils.cc
+++ b/paddle/cinn/hlir/framework/pir/utils.cc
@@ -347,7 +347,6 @@ const std::unordered_set<std::string> TOCINN_OPS = {
     PD_OP_NAME(ProdOp),
     PD_OP_NAME(PowOp),
     PD_OP_NAME(ScaleOp),
-    PD_OP_NAME(ReshapeOp),
     PD_OP_NAME(Pool2dOp),
     PD_OP_NAME(IscloseOp),
     PD_OP_NAME(SliceOp),
@@ -512,7 +511,9 @@ utils::AttributeMap CompatibleInfo::ConvertAttributes(
   utils::AttributeMap dst_attrs;
   for (auto& item : src_attrs) {
     VLOG(4) << "deal with " << item.first;
-    if (item.first == ::pir::kStopGradientAttrName) {
+    if (item.first == ::pir::kStopGradientAttrName ||
+        item.first == ::pir::kOutputDimExprs ||
+        item.first == ::pir::kSymbolBindings) {
       continue;
     } else if (item.second.isa<paddle::dialect::PlaceAttribute>()) {
       auto is_cpu =
diff --git a/paddle/cinn/hlir/op/elementwise.cc b/paddle/cinn/hlir/op/elementwise.cc
index 0f39d26b49d92..fc93d9f206684 100644
--- a/paddle/cinn/hlir/op/elementwise.cc
+++ b/paddle/cinn/hlir/op/elementwise.cc
@@ -18,6 +18,7 @@
 #include "absl/types/optional.h"
 #include "paddle/cinn/adt/op_equation_context.h"
+#include "paddle/cinn/common/type.h"
 #include "paddle/cinn/hlir/framework/node.h"
 #include "paddle/cinn/hlir/framework/op.h"
 #include "paddle/cinn/hlir/framework/op_strategy.h"
@@ -25,6 +26,7 @@
 #include "paddle/cinn/hlir/pe/ir_schedule_pe.h"
 #include "paddle/cinn/hlir/pe/nn.h"
 #include "paddle/cinn/hlir/pe/schedule.h"
+#include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/ir/op/ir_operators.h"
 #include "paddle/cinn/utils/functional.h"

@@ -1015,16 +1017,19 @@ std::shared_ptr<OpStrategy> StrategyForReshapeSymbolic(
         Expr A = pack_args[0];
         CHECK(A.as_tensor());
         CHECK(!output_shapes.empty());
-        auto attr_store = attrs.attr_store;
-        CHECK(attr_store.count("shape")) << "find no attr of shape";
         auto tensor_A = A.as_tensor_ref();
-        auto stages = CreateStages({tensor_A});
+        auto stages = CreateStages({});
         VLOG(3) << "A shape: " << utils::Join(tensor_A->shape, ", ")
                 << ", output_shapes: " << utils::Join(output_shapes[0], ", ");
-        CHECK_EQ(pack_args.size(), 2);
-        CHECK(pack_args[1].is_string());
-        std::string tensor_name = pack_args[1].operator std::string();
+        std::string tensor_name;
+        if (pack_args.size() == 4) {
+          CHECK(pack_args[2].is_string());
+          tensor_name = pack_args[2].operator std::string();
+        } else {
+          CHECK(pack_args[1].is_string());
+          tensor_name = pack_args[1].operator std::string();
+        }
         ir::Tensor out = pe::Reshape(tensor_A, output_shapes[0], tensor_name);
         std::vector<CINNValue> res;
@@ -1243,6 +1248,52 @@ std::shared_ptr<OpStrategy> StrategyForYieldStoreSymbolic(
   return strategy;
 }

+std::shared_ptr<OpStrategy> StrategyForGenerateShapeSymbolic(
+    const framework::NodeAttr &attrs,
+    const std::vector<ir::Tensor> &inputs,
+    const std::vector<Type> &out_type,
+    const std::vector<std::vector<ir::Dim>> &output_shapes,
+    const Target &target) {
+  framework::CINNCompute generate_shape_compute(
+      [=](lang::Args args, lang::RetValue *ret) {
+        CHECK(!args.empty()) << "The input arguments of GenerateShape "
+                                "compute are empty! Please check.\n";
+        CINNValuePack pack_args = args[0];
+        CHECK_GE(pack_args.size(), 1U)
+            << "at least 1 input tensor for GenerateShape compute\n";
+        Expr A = pack_args[0];
+        CHECK(A.as_tensor());
+        CHECK(!output_shapes.empty());
+        auto tensor_A = A.as_tensor_ref();
+        auto stages = CreateStages({tensor_A});
+        VLOG(3) << "A shape: " << utils::Join(tensor_A->shape, ", ")
+                << ", output_shapes: " << utils::Join(output_shapes[0], ", ");
+        CHECK_EQ(pack_args.size(), 2U);
+        std::string tensor_name = pack_args[1].operator std::string();
+        ir::Tensor out(ir::_Tensor_::Make(/*name=*/tensor_name,
+                                          /*dtype=*/tensor_A->type(),
+                                          /*shape=*/
+                                          {
+                                              Expr(1),
+                                          },
+                                          /*domain=*/
+                                          {
+                                              Expr(1),
+                                          }));
+        std::vector<CINNValue> res;
+        stages->InsertLazily(out);
+        res.push_back(CINNValue(out));
+        CHECK(!out_type.empty())
+            << "Output type of GenerateShape is empty! Please check.\n";
+        res.push_back(CINNValue(stages));
+        *ret = CINNValuePack{res};
+      });
+
+  auto strategy = std::make_shared<framework::OpStrategy>();
+  strategy->AddImpl(generate_shape_compute,
+                    lang::PackedFunc(),
+                    "strategy.generate_shape.x86",
+                    1);
+  return strategy;
+}
+
 std::vector<Type> InferDtypeForCast(const std::vector<Type> &inputs_type,
                                     const framework::AttrMapType &attrs) {
   CHECK(attrs.count("dtype"));
@@ -1584,6 +1635,22 @@ CINN_REGISTER_HELPER(elementwise_ops) {
           "OpPattern", cinn::hlir::framework::OpPatternKind::kElementWise)
       .set_support_level(4);

+  CINN_REGISTER_OP(generate_shape)
+      .describe("This operator is used to generate a shape tensor.")
+      .set_num_inputs(1)
+      .set_num_outputs(1)
+      .set_attr<cinn::hlir::framework::StrategyFunctionSymbolic>(
+          "CINNStrategySymbolic",
+          cinn::hlir::op::StrategyForGenerateShapeSymbolic)
+      .set_attr("infershape",
+                MakeOpFunction(cinn::hlir::op::InferShapeForElementwise))
+      .set_attr("inferdtype", MakeOpFunction(cinn::hlir::op::InferDtypeForCast))
+      .set_attr("inferlayout",
+                MakeOpFunction(cinn::hlir::op::InferLayoutForElementwise))
+      .set_attr<cinn::hlir::framework::OpPatternKind>(
+          "OpPattern", cinn::hlir::framework::OpPatternKind::kNonFusible)
+      .set_support_level(4);
+
   CINN_REGISTER_OP(arange)
       .describe("Returns evenly spaced values within a given interval.")
       .set_num_inputs(0)
diff --git a/paddle/pir/include/core/builtin_op.h b/paddle/pir/include/core/builtin_op.h
index add3e6a6a312d..f723eaa96b138 100644
--- a/paddle/pir/include/core/builtin_op.h
+++ b/paddle/pir/include/core/builtin_op.h
@@ -23,6 +23,8 @@ namespace pir {
 class Program;
 class Block;
 constexpr char kStopGradientAttrName[] = "stop_gradient";
+constexpr char kOutputDimExprs[] = "output_dim_exprs";
+constexpr char kSymbolBindings[] = "symbol_bindings";
 ///
 /// \brief ModuleOp
 ///

From 98aa58f8670ac06d59e08f835c77cf8a0c3157e6 Mon Sep 17 00:00:00 2001
From: wentao yu
Date: Fri, 8 Mar 2024 19:47:15 +0800
Subject: [PATCH 084/114] [DistDialect] add ShardTensor op (#62433)

* add shard_tensor_op

* update ut

* remove useless log and header file

* fix review comments
---
 .../dialect/distributed/ir/dist_dialect.cc    |   2 +
 .../pir/dialect/distributed/ir/dist_op.cc     | 169 ++++++++++++++++++
 .../pir/dialect/distributed/ir/dist_op.h      |  42 +++++
 test/cpp/pir/distributed/dist_dialect_test.cc | 164 +++++++++++++++++
 4 files changed, 377 insertions(+)
 create mode 100644 paddle/fluid/pir/dialect/distributed/ir/dist_op.cc
 create mode 100644 paddle/fluid/pir/dialect/distributed/ir/dist_op.h

diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc
index 7258a15b09816..4795b09b936e5 100644
--- a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc
+++ b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h"
 #include "paddle/fluid/pir/dialect/distributed/ir/attribute_storage.h"
 #include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h"
+#include "paddle/fluid/pir/dialect/distributed/ir/dist_op.h"
 #include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h"
 #include "paddle/fluid/pir/dialect/distributed/ir/type_storage.h"

@@ -32,6 +33,7 @@ void DistDialect::initialize() {
                      TensorDistAttribute,
                      OperationDistAttribute>();
   RegisterTypes<DistDenseTensorType>();
+  RegisterOps<ShardTensorOp>();
 }

 void DistDialect::PrintType(pir::Type type, std::ostream &os) const {
diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc
new file mode 100644
index 0000000000000..97bf0ce6ea122
--- /dev/null
+++ b/paddle/fluid/pir/dialect/distributed/ir/dist_op.cc
@@ -0,0 +1,169 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pir/dialect/distributed/ir/dist_op.h"
+#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h"
+#include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/pir/include/core/builtin_attribute.h"
+#include "paddle/pir/include/core/ir_context.h"
+
+namespace paddle {
+namespace dialect {
+
+const char* ShardTensorOp::attributes_name[1] = {"op_dist_attr"};
+
+void ShardTensorOp::VerifySig() {
+  VLOG(4)
+      << "Start Verifying inputs, outputs and attributes for: ShardTensorOp.";
+  VLOG(4) << "Verifying inputs:";
+  {
+    auto input_size = num_operands();
+    PADDLE_ENFORCE_EQ(
+        input_size,
+        1u,
+        phi::errors::PreconditionNotMet(
+            "The size %d of inputs must be equal to 1.", input_size));
+    PADDLE_ENFORCE((*this)
+                       ->operand_source(0)
+                       .type()
+                       .isa<paddle::dialect::DenseTensorType>(),
+                   phi::errors::PreconditionNotMet(
+                       "Type validation failed for the 0th input."));
+  }
+  VLOG(4) << "Verifying attributes:";
+  {
+    auto& attributes = this->attributes();
+    PADDLE_ENFORCE(attributes.count("op_dist_attr") > 0 &&
+                       attributes.at("op_dist_attr")
+                           .isa<paddle::dialect::OperationDistAttribute>(),
+                   phi::errors::PreconditionNotMet(
+                       "Type of attribute: op_dist_attr is not right."));
+  }
+  VLOG(4) << "Verifying outputs:";
+  {
+    auto output_size = num_results();
+    PADDLE_ENFORCE_EQ(
+        output_size,
+        1u,
+        phi::errors::PreconditionNotMet(
+            "The size %d of outputs must be equal to 1.", output_size));
+    PADDLE_ENFORCE(
+        (*this)->result(0).type().isa<paddle::dialect::DistDenseTensorType>(),
+        phi::errors::PreconditionNotMet(
+            "Type validation failed for the 0th output."));
+  }
+  VLOG(4) << "Verifying op dist attrs:";
+  {
+    auto op_dist_attr =
+        this->attribute<paddle::dialect::OperationDistAttribute>(
+            "op_dist_attr");
+    PADDLE_ENFORCE_EQ(op_dist_attr.num_operand_dist_attrs(),
+                      0u,
+                      phi::errors::PreconditionNotMet(
+                          "The op_dist_attr input size %d must be equal to 0.",
+                          op_dist_attr.num_operand_dist_attrs()));
+
+    PADDLE_ENFORCE_EQ(
+        op_dist_attr.num_result_dist_attrs(),
+        num_results(),
+        phi::errors::PreconditionNotMet("The op_dist_attr output size %d must "
+                                        "be equal to op output size %d.",
+                                        op_dist_attr.num_result_dist_attrs(),
+                                        num_results()));
+  }
+  VLOG(4) << "End Verifying for: ShardTensorOp.";
+}
+
+void ShardTensorOp::Build(pir::Builder& builder,
+                          pir::OperationArgument& argument,
+                          pir::Value input,
+                          pir::AttributeMap attributes) {
+  VLOG(4) << "Start build ShardOp";
+  // Temporary restriction, will support input use_empty false in the future
+  PADDLE_ENFORCE_EQ(
+      input.use_empty(),
+      true,
+      phi::errors::PreconditionNotMet("'input' use_empty is not true"));
+
+  paddle::dialect::DenseTensorType input_tensor_type;
+  if (input.type().isa<paddle::dialect::DenseTensorType>()) {
+    input_tensor_type =
+        input.type().dyn_cast<paddle::dialect::DenseTensorType>();
+  } else {
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "Only support paddle::dialect::DenseTensorType"));
+  }
+
+  PADDLE_ENFORCE(attributes.find("tensor_dist_attr") != attributes.end(),
+                 phi::errors::NotFound(
+                     "'tensor_dist_attr' Attribute is expected for ShardOp"));
+  paddle::dialect::TensorDistAttribute tensor_dist_attr =
+      attributes.at("tensor_dist_attr")
+          .dyn_cast<paddle::dialect::TensorDistAttribute>();
+
+  VLOG(4) << "Builder construction inputs";
+  argument.AddInput(input);
+
+  VLOG(4) << "Builder construction attributes";
+  auto process_mesh_attr = tensor_dist_attr.process_mesh_attr();
+  auto dims_mapping = tensor_dist_attr.dims_mapping();
+
+  pir::Attribute op_dist_attr = OperationDistAttribute::get(
+      pir::IrContext::Instance(),
+      process_mesh_attr,
+      std::vector<TensorDistAttribute>(),
+      std::vector<TensorDistAttribute>{tensor_dist_attr});
+  argument.AddAttribute("op_dist_attr", op_dist_attr);
+
+  VLOG(4) << "Builder construction outputs";
+  auto global_dims = input_tensor_type.dims();
+  auto process_mesh_shape = process_mesh_attr.shape();
+  PADDLE_ENFORCE(static_cast<int>(dims_mapping.size()) == global_dims.size(),
+                 phi::errors::PreconditionNotMet(
+                     "dims_mapping size %d does not match input size %d",
+                     dims_mapping.size(),
+                     global_dims.size()));
+  std::vector<int> local_shape(global_dims.size());
+  for (int i = 0; i < global_dims.size(); ++i) {
+    if (dims_mapping[i] == -1) {
+      local_shape[i] = global_dims[i];
+    } else {
+      auto shard_size = process_mesh_shape[dims_mapping[i]];
+      PADDLE_ENFORCE(
+          global_dims[i] % shard_size == 0,
+          phi::errors::PreconditionNotMet(
+              "global_dims size %d can't be evenly divided by shard_size %d",
+              global_dims[i],
+              shard_size));
+      local_shape[i] = global_dims[i] / shard_size;
+    }
+  }
+
+  pir::Type out_dist_tensor_type =
+      paddle::dialect::DistDenseTensorType::get(pir::IrContext::Instance(),
+                                                input_tensor_type,
+                                                tensor_dist_attr,
+                                                phi::make_ddim(local_shape));
+  argument.AddOutput(out_dist_tensor_type);
+}
+
+}  // namespace dialect
+}  // namespace paddle
+
+IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::ShardTensorOp)
diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_op.h b/paddle/fluid/pir/dialect/distributed/ir/dist_op.h
new file mode 100644
index 0000000000000..f8f79cbed6904
--- /dev/null
+++ b/paddle/fluid/pir/dialect/distributed/ir/dist_op.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
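+
+// Local-shape math implemented by ShardTensorOp::Build (dist_op.cc), shown
+// here as a sketch for reference; `mesh_shape` is the process-mesh shape:
+//
+//   local_shape[i] = dims_mapping[i] == -1
+//                        ? global_dims[i]
+//                        : global_dims[i] / mesh_shape[dims_mapping[i]];
+//
+// e.g. a [12, 6] tensor on a 2x3 mesh with dims_mapping {1, -1} gives a
+// local shape of [4, 6]: axis 0 is split across the 3-way mesh dimension.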
+ +#pragma once +#include + +#include "paddle/pir/include/core/builder.h" +#include "paddle/pir/include/core/builtin_type.h" +#include "paddle/pir/include/core/op_base.h" +#include "paddle/pir/include/core/operation_utils.h" + +namespace paddle { +namespace dialect { +class ShardTensorOp : public pir::Op { + public: + using Op::Op; + static const char* name() { return "dist_op.shard_tensor"; } + static const char* attributes_name[1]; + static constexpr uint32_t attributes_num = 1; + TEST_API static void Build(pir::Builder& builder, // NOLINT + pir::OperationArgument& argument, // NOLINT + pir::Value input, + pir::AttributeMap attributes); + pir::Value input() { return operand_source(0); } + pir::Value out() { return result(0); } + void VerifySig(); +}; +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::ShardTensorOp) diff --git a/test/cpp/pir/distributed/dist_dialect_test.cc b/test/cpp/pir/distributed/dist_dialect_test.cc index 31bf69ea77030..5bc6df02ce2b9 100644 --- a/test/cpp/pir/distributed/dist_dialect_test.cc +++ b/test/cpp/pir/distributed/dist_dialect_test.cc @@ -16,9 +16,13 @@ #include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h" +#include "paddle/fluid/pir/dialect/distributed/ir/dist_op.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/include/core/builtin_type.h" +#include "paddle/pir/include/core/program.h" using namespace paddle::dialect; // NOLINT @@ -228,3 +232,163 @@ TEST(operation_dist_attr_test, base) { EXPECT_EQ(op_attr.result_dist_attr(0), result_dist_attrs.at(0)); EXPECT_EQ(op_attr.num_result_dist_attrs(), (uint32_t)1); } + +TEST(shard_tensor_op_replicate_test, base) { + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + pir::Block* block = program.block(); + pir::Builder builder(ctx, block); + + std::vector mesh_shape = {2, 3}; + std::vector process_ids = {0, 1, 2, 3, 4, 5}; + std::vector dim_names = {"x", "y"}; + phi::distributed::ProcessMesh process_mesh( + mesh_shape, process_ids, dim_names); + auto mesh_attr = ProcessMeshAttribute::get(ctx, process_mesh); + + std::vector data_shape = {12, 6}; + paddle::flat_hash_map partial_status; + + // construct a replicated + std::vector dims_mapping = {-1, -1}; + + auto data_op = builder.Build( + "w0", data_shape, phi::DataType::FLOAT32, phi::CPUPlace()); + + std::vector local_shape = {12, 6}; + auto tensor_dist_attr = + TensorDistAttribute::get(ctx, mesh_attr, dims_mapping, partial_status); + + pir::AttributeMap attr_map = {{"tensor_dist_attr", tensor_dist_attr}}; + + paddle::dialect::ShardTensorOp shard_op = + builder.Build(data_op.result(0), + attr_map); + + EXPECT_TRUE(shard_op.out().type().isa()); + auto op_out_type = shard_op.out().type().dyn_cast(); + EXPECT_EQ(op_out_type.global_ddim(), phi::make_ddim(data_shape)); + EXPECT_EQ(op_out_type.local_ddim(), phi::make_ddim(local_shape)); + EXPECT_EQ(op_out_type.process_mesh_attr(), mesh_attr); + EXPECT_EQ(op_out_type.dims_mapping(), dims_mapping); + EXPECT_EQ(op_out_type.partial_dims().size(), (size_t)0); + + EXPECT_EQ(shard_op.attribute("op_dist_attr") + .num_operand_dist_attrs(), + (uint32_t)0); + EXPECT_EQ(shard_op.attribute("op_dist_attr") + 
.num_result_dist_attrs(), + (uint32_t)1); + EXPECT_EQ(shard_op.attribute("op_dist_attr") + .process_mesh_attr(), + mesh_attr); +} + +TEST(shard_tensor_op_shard_row_test, base) { + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + pir::Block* block = program.block(); + pir::Builder builder(ctx, block); + + std::vector mesh_shape = {2, 3}; + std::vector process_ids = {0, 1, 2, 3, 4, 5}; + std::vector dim_names = {"x", "y"}; + phi::distributed::ProcessMesh process_mesh( + mesh_shape, process_ids, dim_names); + auto mesh_attr = ProcessMeshAttribute::get(ctx, process_mesh); + + std::vector data_shape = {12, 6}; + paddle::flat_hash_map partial_status; + + // construct a row shard + std::vector dims_mapping = {1, -1}; + auto data_op = builder.Build( + "w1", data_shape, phi::DataType::FLOAT32, phi::CPUPlace()); + + std::vector local_shape = {4, 6}; + auto tensor_dist_attr = + TensorDistAttribute::get(ctx, mesh_attr, dims_mapping, partial_status); + + pir::AttributeMap attr_map = {{"tensor_dist_attr", tensor_dist_attr}}; + + paddle::dialect::ShardTensorOp shard_op = + builder.Build(data_op.result(0), + attr_map); + + EXPECT_TRUE(shard_op.out().type().isa()); + auto op_out_type = shard_op.out().type().dyn_cast(); + EXPECT_EQ(op_out_type.global_ddim(), phi::make_ddim(data_shape)); + EXPECT_EQ(op_out_type.local_ddim(), phi::make_ddim(local_shape)); + EXPECT_EQ(op_out_type.process_mesh_attr(), mesh_attr); + EXPECT_EQ(op_out_type.dims_mapping(), dims_mapping); + EXPECT_EQ(op_out_type.partial_dims().size(), (size_t)0); + + EXPECT_EQ(shard_op.attribute("op_dist_attr") + .num_operand_dist_attrs(), + (uint32_t)0); + EXPECT_EQ(shard_op.attribute("op_dist_attr") + .num_result_dist_attrs(), + (uint32_t)1); + EXPECT_EQ(shard_op.attribute("op_dist_attr") + .process_mesh_attr(), + mesh_attr); +} + +TEST(shard_tensor_op_shard_col_test, base) { + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + pir::Program program(ctx); + pir::Block* block = program.block(); + pir::Builder builder(ctx, block); + + std::vector mesh_shape = {2, 3}; + std::vector process_ids = {0, 1, 2, 3, 4, 5}; + std::vector dim_names = {"x", "y"}; + phi::distributed::ProcessMesh process_mesh( + mesh_shape, process_ids, dim_names); + auto mesh_attr = ProcessMeshAttribute::get(ctx, process_mesh); + + std::vector data_shape = {12, 6}; + paddle::flat_hash_map partial_status; + + // construct a col shard + std::vector dims_mapping = {-1, 0}; + + auto data_op = builder.Build( + "w2", data_shape, phi::DataType::FLOAT32, phi::CPUPlace()); + + std::vector local_shape = {12, 3}; + auto tensor_dist_attr = + TensorDistAttribute::get(ctx, mesh_attr, dims_mapping, partial_status); + + pir::AttributeMap attr_map = {{"tensor_dist_attr", tensor_dist_attr}}; + paddle::dialect::ShardTensorOp shard_op = + builder.Build(data_op.result(0), + attr_map); + + EXPECT_TRUE(shard_op.out().type().isa()); + auto op_out_type = shard_op.out().type().dyn_cast(); + EXPECT_EQ(op_out_type.global_ddim(), phi::make_ddim(data_shape)); + EXPECT_EQ(op_out_type.local_ddim(), phi::make_ddim(local_shape)); + EXPECT_EQ(op_out_type.process_mesh_attr(), mesh_attr); + EXPECT_EQ(op_out_type.dims_mapping(), dims_mapping); + EXPECT_EQ(op_out_type.partial_dims().size(), (size_t)0); + + EXPECT_EQ(shard_op.attribute("op_dist_attr") + .num_operand_dist_attrs(), + (uint32_t)0); + EXPECT_EQ(shard_op.attribute("op_dist_attr") + 
.num_result_dist_attrs(), + (uint32_t)1); + EXPECT_EQ(shard_op.attribute("op_dist_attr") + .process_mesh_attr(), + mesh_attr); +} From 6255e8b66d7409f971080512b0d21543f2998cb4 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Fri, 8 Mar 2024 21:56:39 +0800 Subject: [PATCH 085/114] [CustomDevice] fix ToCDataType (#62562) --- paddle/phi/backends/custom/custom_device.cc | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index 30282eac79afb..2f0da05d43c4a 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -592,13 +592,21 @@ class CustomDevice : public DeviceInterface { case in: \ return C_DataType::ret switch (data_type) { - return_result(phi::DataType::FLOAT64, FLOAT64); - return_result(phi::DataType::FLOAT32, FLOAT32); - return_result(phi::DataType::FLOAT16, FLOAT16); - return_result(phi::DataType::INT64, INT64); - return_result(phi::DataType::INT32, INT32); - return_result(phi::DataType::INT16, INT16); + return_result(phi::DataType::BOOL, BOOL); + return_result(phi::DataType::UINT8, UINT8); + return_result(phi::DataType::UINT16, UINT16); + return_result(phi::DataType::UINT32, UINT32); + return_result(phi::DataType::UINT64, UINT64); return_result(phi::DataType::INT8, INT8); + return_result(phi::DataType::INT16, INT16); + return_result(phi::DataType::INT32, INT32); + return_result(phi::DataType::INT64, INT64); + return_result(phi::DataType::FLOAT16, FLOAT16); + return_result(phi::DataType::FLOAT32, FLOAT32); + return_result(phi::DataType::FLOAT64, FLOAT64); + return_result(phi::DataType::BFLOAT16, BFLOAT16); + return_result(phi::DataType::COMPLEX64, COMPLEX64); + return_result(phi::DataType::COMPLEX128, COMPLEX128); default: { PADDLE_THROW(phi::errors::Unavailable( "DataType is not supported on %s.", Type())); From b11f7f5719977f0297a519b31cc98e42ce0a2dd5 Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Sat, 9 Mar 2024 00:16:28 +0800 Subject: [PATCH 086/114] [PIR] support infer spmd auto gen. 
(#62547) --- paddle/fluid/pir/dialect/CMakeLists.txt | 5 +- .../dialect/op_generator/op_all_func_gen.py | 39 +++++++++++ .../fluid/pir/dialect/op_generator/op_gen.py | 54 +++++++-------- .../op_generator/op_infer_spmd_func_gen.py | 68 +++++++++++++++++++ .../dialect/op_generator/op_infermeta_gen.py | 10 +++ ...nc_gen.py => op_member_access_func_gen.py} | 12 ++-- .../op_generator/op_vjp_interface_func_gen.py | 26 +++++++ 7 files changed, 180 insertions(+), 34 deletions(-) create mode 100644 paddle/fluid/pir/dialect/op_generator/op_all_func_gen.py create mode 100644 paddle/fluid/pir/dialect/op_generator/op_infer_spmd_func_gen.py rename paddle/fluid/pir/dialect/op_generator/{op_member_func_gen.py => op_member_access_func_gen.py} (79%) create mode 100644 paddle/fluid/pir/dialect/op_generator/op_vjp_interface_func_gen.py diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt index b0606b59b28f8..380c7c72d8028 100644 --- a/paddle/fluid/pir/dialect/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -95,7 +95,8 @@ execute_process( --op_compat_yaml_file ${op_compat_yaml_file} --namespaces ${op_namespace} --dialect_name ${dialect_name} --op_def_h_file ${op_header_file_tmp} --op_info_file ${op_info_file_tmp} --op_def_cc_file ${op_src_files_tmp} - --op_vjp_cc_file ${op_vjp_src_file_tmp}) + --op_vjp_cc_file ${op_vjp_src_file_tmp} --with_distributed + ${WITH_DISTRIBUTE}) set(generated_files_pd_op "${op_header_file}" @@ -141,7 +142,7 @@ if(WITH_MKLDNN) --op_def_h_file ${onednn_op_header_file_tmp} --op_info_file ${op_onednn_info_file_tmp} --op_def_cc_file ${onednn_op_source_file_tmp} --onednn_yaml_file ${pir_op_onednn_yaml} --ops_onednn_extra_yaml_file - ${pd_ops_onednn_extra_yaml_file}) + ${pd_ops_onednn_extra_yaml_file} --with_distributed ${WITH_DISTRIBUTE}) set(generated_files_onednn_pd_op "${onednn_op_header_file}" "${onednn_op_source_file}" diff --git a/paddle/fluid/pir/dialect/op_generator/op_all_func_gen.py b/paddle/fluid/pir/dialect/op_generator/op_all_func_gen.py new file mode 100644 index 0000000000000..2c87a55e540d9 --- /dev/null +++ b/paddle/fluid/pir/dialect/op_generator/op_all_func_gen.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
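+
+# Every generator aggregated below shares one contract (a sketch; the real
+# signatures live in the sibling op_*_gen.py modules):
+#
+#   def gen_op_xxx_func(args, op_info, op_info_items):
+#       # returns (interface_list, declare_str_or_None, impl_str_or_None)
+#       ...
+#
+# gen_op_all_func simply concatenates the interfaces, declarations and
+# implementations collected from each generator.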
+ +from op_infer_spmd_func_gen import gen_op_infer_spmd_func +from op_infermeta_gen import gen_op_infermeta_func +from op_member_access_func_gen import gen_op_member_access_func +from op_vjp_interface_func_gen import gen_op_vjp_interface_func + +all_gen_op_func_list = [ + gen_op_infer_spmd_func, + gen_op_infermeta_func, + gen_op_member_access_func, + gen_op_vjp_interface_func, +] + + +def gen_op_all_func(args, op_info, op_info_items): + interface_list = [] + declare_list = [] + impl_list = [] + for func in all_gen_op_func_list: + interface, declare, impl = func(args, op_info, op_info_items) + interface_list += interface + if declare is not None: + declare_list.append(declare) + if impl is not None: + impl_list.append(impl) + return interface_list, declare_list, impl_list diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index 5513bbb3f5552..976d5a9d53728 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -17,10 +17,12 @@ import os import pathlib import sys +from distutils.util import strtobool import yaml from decomp_interface_gen_op_list import decomp_interface_declare_gen_op_list from infer_symbolic_shape_gen import gen_infer_symbolic_shape_str +from op_all_func_gen import gen_op_all_func from op_build_gen import gen_build_func_str, gen_build_func_str_by_invoke from op_infermeta_gen import ( gen_infermeta_by_invoke_func_str, @@ -32,7 +34,6 @@ gen_op_vjp_str, ) from op_kerneltype_gen import gen_kernel_type_for_var_str -from op_member_func_gen import gen_op_get_inputs_outputs_str from op_verify_gen import gen_verify_func_str from ops_onednn_extra_parser import parse_data_format_tensors, parse_extra_args from parse_kernel_key_gen import gen_parse_kernel_key_str @@ -107,6 +108,9 @@ #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/pir/dialect/operator/trait/custom_vjp.h" #include "paddle/phi/core/infermeta_utils.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/phi/infermeta/spmd_rules/rules.h" +#endif {only_pd_op_header_files} {other_info} @@ -147,7 +151,6 @@ class {TEST_API} {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{ {get_kernel_type_for_var_declare} {parse_kernel_key_declare} {infer_symbolic_shape_declare} -{get_inputs_and_outputs} {exclusive_interface} }}; """ @@ -503,8 +506,13 @@ def __init__(self, op_yaml_item, op_compat_item): # parse infermeta && kernel self.infer_meta_map = self.parse_infer_meta_map() self.invoke_map = self.parse_invoke_map() + self.spmd_rule_func = None if 'infer_meta' in self.op_yaml_item: self.infer_meta_func = self.op_yaml_item['infer_meta']["func"] + if 'spmd_rule' in self.op_yaml_item['infer_meta']: + self.spmd_rule_func = self.op_yaml_item['infer_meta'][ + 'spmd_rule' + ] else: self.infer_meta_func = None @@ -1233,7 +1241,9 @@ def GenOneDnnExtraAttrsDefaultValue(onednn_extra_args): return attr_str -def AutoCodeGen(op_info_items, all_op_info_items, namespaces, dialect_name): +def AutoCodeGen( + args, op_info_items, all_op_info_items, namespaces, dialect_name +): # (3) CodeGen: Traverse op_info_items and generate ops_name_list = [] # all op class name store in this list ops_declare_list = [] # all op class declare store in this list @@ -1291,23 +1301,17 @@ def AutoCodeGen(op_info_items, all_op_info_items, namespaces, dialect_name): op_traits = op_info.traits_list op_interfaces = op_info.interfaces_list op_interfaces += ["paddle::dialect::OpYamlInfoInterface"] - - if 
op_info.infer_meta_func: - op_interfaces += ["paddle::dialect::InferMetaInterface"] - elif op_invoke_map and op_invoke_map['func'] in op_info_items: - if op_info_items[op_invoke_map['func']].infer_meta_func: - op_interfaces += ["paddle::dialect::InferMetaInterface"] - - if ( - op_info.backward_name - and op_info.op_phi_name[0] not in vjp_interface_black_list - and dialect_name != "onednn_op" - ): - op_interfaces += ["paddle::dialect::VjpInterface"] exclusive_interface_str = gen_exclusive_interface_str( op_info, op_info_items ) + interface_list, declare_list, impl_list = gen_op_all_func( + args, op_info, op_info_items + ) + op_interfaces += interface_list + exclusive_interface_str += '\n' + '\n'.join(declare_list) + ops_defined_list += impl_list + if dialect_name == "pd_op" or dialect_name == "onednn_op": op_interfaces += ["paddle::dialect::GetKernelTypeForVarInterface"] @@ -1409,15 +1413,6 @@ def AutoCodeGen(op_info_items, all_op_info_items, namespaces, dialect_name): kernel_func_name ] - # =================================== # - # gen get input/output methods str # - # =================================== # - op_get_inputs_outputs_str = gen_op_get_inputs_outputs_str( - op_input_name_list, - op_mutable_attribute_name_list, - op_output_name_list, - ) - # =================================== # # gen Build methods str # # =================================== # @@ -1581,7 +1576,6 @@ def AutoCodeGen(op_info_items, all_op_info_items, namespaces, dialect_name): build_mutable_attr_is_input=build_mutable_attr_is_input, build_attr_num_over_1=build_attr_num_over_1, build_mutable_attr_is_input_attr_num_over_1=build_mutable_attr_is_input_attr_num_over_1, - get_inputs_and_outputs=op_get_inputs_outputs_str, exclusive_interface=exclusive_interface_str, get_kernel_type_for_var_declare=get_kernel_type_for_var_declare_str, parse_kernel_key_declare=parse_kernel_key_str, @@ -1605,7 +1599,6 @@ def AutoCodeGen(op_info_items, all_op_info_items, namespaces, dialect_name): build_mutable_attr_is_input=build_mutable_attr_is_input, build_attr_num_over_1=build_attr_num_over_1, build_mutable_attr_is_input_attr_num_over_1=build_mutable_attr_is_input_attr_num_over_1, - get_inputs_and_outputs=op_get_inputs_outputs_str, exclusive_interface=exclusive_interface_str, get_kernel_type_for_var_declare=get_kernel_type_for_var_declare_str, parse_kernel_key_declare=parse_kernel_key_str, @@ -2059,6 +2052,7 @@ def AutoCodeGen(op_info_items, all_op_info_items, namespaces, dialect_name): def OpGenerator( + args, op_yaml_files, op_compat_yaml_file, namespaces, @@ -2206,7 +2200,9 @@ def OpGenerator( source_file_str, op_to_multi_kernels_list, vjp_source_file_str, - ) = AutoCodeGen(items, all_op_info_items, namespaces, dialect_name) + ) = AutoCodeGen( + args, items, all_op_info_items, namespaces, dialect_name + ) op_list_strs.append(op_list_str) declare_type_id_strs.append(declare_type_id_str) define_type_id_strs.append(define_type_id_str) @@ -2360,6 +2356,7 @@ def ParseArguments(): parser.add_argument('--op_vjp_cc_file', type=str) parser.add_argument('--onednn_yaml_file', type=str) parser.add_argument('--ops_onednn_extra_yaml_file', type=str) + parser.add_argument('--with_distributed', type=strtobool) return parser.parse_args() @@ -2384,6 +2381,7 @@ def ParseArguments(): # auto code generate OpGenerator( + args, op_yaml_files, op_compat_yaml_file, namespaces, diff --git a/paddle/fluid/pir/dialect/op_generator/op_infer_spmd_func_gen.py b/paddle/fluid/pir/dialect/op_generator/op_infer_spmd_func_gen.py new file mode 100644 index 
0000000000000..b14453f44236c --- /dev/null +++ b/paddle/fluid/pir/dialect/op_generator/op_infer_spmd_func_gen.py @@ -0,0 +1,68 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +OP_INFER_SPMD_TEMPLATE = """ + static phi::distributed::SpmdInfo InferSpmd({infer_spmd_args}) {{ + return phi::distributed::{func}({args}); + }} +""" + + +def gen_op_infer_spmd_func(args, op_info, op_info_items): + if not args.with_distributed or not op_info.spmd_rule_func: + return [], None, None + input_types_map = { + 'paddle::dialect::DenseTensorType': 'const phi::distributed::DistMetaTensor&', + 'pir::VectorType': 'const std::vector&', + } + input_name_list = op_info.input_name_list + input_type_list = op_info.input_type_list + input_name_type_dict = {} + for attr_idx in range(len(input_name_list)): + input_name_type_dict[input_name_list[attr_idx]] = input_types_map[ + input_type_list[attr_idx] + ] + + attr_name_list = op_info.attribute_name_list + attr_type_list = op_info.attribute_gen_arg_type_list + attr_name_type_dict = {} + for attr_idx in range(len(attr_type_list)): + attr_name_type_dict[attr_name_list[attr_idx]] = attr_type_list[attr_idx] + + spmd_params = input_name_list + attr_name_list + if op_info.kernel_map is not None: + spmd_params = op_info.kernel_map['param'] + args_list_with_type = [] + args_list = [] + for param in spmd_params: + # is input + if param in op_info.input_name_list: + args_list_with_type.append( + input_name_type_dict[param] + " " + param + ) + args_list.append(param) + # is attribute + else: + param_type = attr_name_type_dict[param] + if param_type == "phi::IntArray": + param_type = "const std::vector&" + args_list_with_type.append(param_type + " " + param) + args_list.append(param) + + declare_str = OP_INFER_SPMD_TEMPLATE.format( + infer_spmd_args=', '.join(args_list_with_type), + func=op_info.infer_meta_map["spmd_rule"], + args=', '.join(args_list), + ) + return [], declare_str, None diff --git a/paddle/fluid/pir/dialect/op_generator/op_infermeta_gen.py b/paddle/fluid/pir/dialect/op_generator/op_infermeta_gen.py index 50648daeeec30..1d1c3cda8071d 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_infermeta_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_infermeta_gen.py @@ -674,3 +674,13 @@ def gen_infermeta_by_invoke_func_str(op_class_name, invoke_class_name): return OP_INFERMETA_BY_INVOKE_TEMPLATE.format( op_name=op_class_name, invoke_class=invoke_class_name ) + + +def gen_op_infermeta_func(args, op_info, op_info_items): + interface = [] + if op_info.infer_meta_func: + interface = ["paddle::dialect::InferMetaInterface"] + elif op_info.invoke_map and op_info.invoke_map['func'] in op_info_items: + if op_info_items[op_info.invoke_map['func']].infer_meta_func: + interface = ["paddle::dialect::InferMetaInterface"] + return interface, None, None diff --git a/paddle/fluid/pir/dialect/op_generator/op_member_func_gen.py b/paddle/fluid/pir/dialect/op_generator/op_member_access_func_gen.py similarity index 79% 
rename from paddle/fluid/pir/dialect/op_generator/op_member_func_gen.py rename to paddle/fluid/pir/dialect/op_generator/op_member_access_func_gen.py index dd060692bd078..98e4e8de66e80 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_member_func_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_member_access_func_gen.py @@ -20,9 +20,13 @@ """ -def gen_op_get_inputs_outputs_str( - op_input_name_list, op_mutable_attribute_name_list, op_output_name_list -): +# =================================== # +# gen get input/output methods str # +# =================================== # +def gen_op_member_access_func(args, op_info, op_info_items): + op_input_name_list = op_info.input_name_list + op_mutable_attribute_name_list = op_info.mutable_attribute_name_list + op_output_name_list = op_info.output_name_list op_get_inputs_outputs_str = "" for idx in range(len(op_input_name_list)): op_get_inputs_outputs_str += OP_GET_INPUT_TEMPLATE.format( @@ -39,4 +43,4 @@ def gen_op_get_inputs_outputs_str( output_name=op_output_name_list[idx], output_index=idx, ) - return op_get_inputs_outputs_str + return [], op_get_inputs_outputs_str, None diff --git a/paddle/fluid/pir/dialect/op_generator/op_vjp_interface_func_gen.py b/paddle/fluid/pir/dialect/op_generator/op_vjp_interface_func_gen.py new file mode 100644 index 0000000000000..53ff6b8e50eb4 --- /dev/null +++ b/paddle/fluid/pir/dialect/op_generator/op_vjp_interface_func_gen.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
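+
+# An op is given paddle::dialect::VjpInterface only when it has a backward
+# op, is not in vjp_interface_black_list, and is not being generated for the
+# onednn_op dialect. For example (assuming `tanh` is not blacklisted), a yaml
+# entry with `backward: tanh_grad` would yield:
+#
+#   gen_op_vjp_interface_func(args, op_info, op_info_items)
+#   # -> (["paddle::dialect::VjpInterface"], None, None)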
+ +from vjp_interface_black_list import vjp_interface_black_list + + +def gen_op_vjp_interface_func(args, op_info, op_info_items): + if ( + op_info.backward_name + and op_info.op_phi_name[0] not in vjp_interface_black_list + and args.dialect_name != "onednn_op" + ): + return ["paddle::dialect::VjpInterface"], None, None + else: + return [], None, None From bb86d5184b15f6b5219831b11e15ddeb23ebf563 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Sat, 9 Mar 2024 09:12:53 +0800 Subject: [PATCH 087/114] Support empty reduce axis (#62542) * support spatial dynamic * fix bug * fix dyshape buffer resize * update * update * fix bug * polish code * fix bug * polish code * fix test while dy bug --------- Co-authored-by: BiynXu <244524405@qq.com> --- .../transforms/cinn_group_cluster_pass.cc | 40 +++++++++++++++++++ .../hlir/framework/pir/op_lowering_impl.cc | 18 ++++++++- paddle/cinn/hlir/pe/reduction.cc | 7 ++++ .../group_schedule/tactic/schedule_tactic.h | 2 + .../tactic/tile_first_general_tactic.cc | 39 ++++++++++++++---- paddle/cinn/optim/resize_buffer.cc | 17 +++++++- test/cpp/pir/cinn/group_op_test.cc | 3 +- test/cpp/pir/cinn/jit_instruction_test.cc | 7 ++-- test/ir/pir/cinn/CMakeLists.txt | 13 +++++- test/ir/pir/cinn/inference/CMakeLists.txt | 2 +- .../ir/pir/cinn/inference/test_llama_while.py | 20 ++++------ test/ir/pir/cinn/symbolic/CMakeLists.txt | 13 +----- test/ir/pir/cinn/symbolic/test_while_dy.py | 12 +++--- test/ir/pir/cinn/test_cinn_ops.py | 16 ++++---- 14 files changed, 153 insertions(+), 56 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc index 05268617ba149..0c6e3bf864404 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc @@ -339,6 +339,7 @@ ::pir::Operation* ReplaceWithGroupOp( group_ops.end()); std::vector<::pir::Value> new_output; + for (size_t i = 0; i < output_value.size(); ++i) { new_output.push_back(ir_mapping->Lookup<::pir::Value>(output_value[i])); } @@ -526,6 +527,11 @@ void GetClusterNodeBasicInfo(::pir::Operation* op, .type() .dyn_cast() .dims()); + if (cluster_node->reduce_axis.size() == 0) { + for (size_t i = 0; i < cluster_node->loop_ranges.size(); ++i) { + cluster_node->reduce_axis.push_back(i); + } + } } else if (cluster_node->group_kind == cinn::hlir::framework::kElementWise) { cluster_node->loop_ranges = phi::vectorize(op->result(0) @@ -577,6 +583,19 @@ bool CanOpMergeNode( return false; } + if (cinn::hlir::framework::pir::CompatibleInfo::OpKind(*cur_op) == + cinn::hlir::framework::kReduction) { + if (cinn::dialect::ir::GetVectorAttr(cur_op, "dim").size() == 0 || + cinn::dialect::ir::GetVectorAttr(cur_op, "dim").size() == + cur_op->operand_source(0) + .type() + .dyn_cast() + .dims() + .size()) { + return false; + } + } + // TODO(phlrain): need update here // different loop range can merge, like [128, 128, 1], with [128, 128] if ((cinn::hlir::framework::pir::CompatibleInfo::OpKind(*cur_op) != @@ -598,6 +617,19 @@ bool ShouldOutputPreNode( return false; } + if (cinn::hlir::framework::pir::CompatibleInfo::OpKind(*cur_op) == + cinn::hlir::framework::kReduction) { + if (cinn::dialect::ir::GetVectorAttr(cur_op, "dim").size() == 0 || + cinn::dialect::ir::GetVectorAttr(cur_op, "dim").size() == + cur_op->operand_source(0) + .type() + .dyn_cast() + .dims() + .size()) { + return true; + } + } + // 
TODO(phlrain): need update here // different loop range can merge, like [128, 128, 1], with [128, 128] if ((cinn::hlir::framework::pir::CompatibleInfo::OpKind(*cur_op) != @@ -841,9 +873,17 @@ class CinnGroupClusterPattern auto new_group_op = ReplaceWithGroupOp( &rewriter, uniq_ops, node, output_values, &ir_mapping); + auto& shape_analysis = pir::ShapeAnalysisManager::Instance().Get( + group_op->GetParentProgram()); // update ir mapping for (size_t i = 0; i < output_values.size(); ++i) { ir_mapping.Add(output_values[i], new_group_op->result(i)); + + if (shape_analysis.HasShapeOrDataForValue(output_values[i])) { + shape_analysis.SetShapeOrDataForValue( + new_group_op->result(i), + shape_analysis.GetShapeOrDataForValue(output_values[i])); + } } for (size_t i = 0; i < output_values.size(); ++i) { diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index db489a190ff1b..110616885b768 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -114,6 +114,13 @@ std::shared_ptr OpLowererImpl::GetGroupTileInfo( } } + bool is_reduce_all = + (group_tile_info->reduce_axis_.size() == group_tile_info->data_rank); + + if (is_reduce_all) { + reduce_is_dynamic = false; + } + PADDLE_ENFORCE_EQ( reduce_is_dynamic, false, @@ -125,8 +132,17 @@ std::shared_ptr OpLowererImpl::GetGroupTileInfo( int64_t reduce_inner_num = 1; int64_t spatial_inner_num = 1; int warp_num = 1; + group_tile_info->is_reduce_all = is_reduce_all; + + if (is_reduce_all) { + // warp reduce + reduce_block = 1024; + spatial_block = 1; + spatial_inner_num = 1; + reduce_inner_num = 4; + warp_num = 8; - if (reduce_numel == 1) { + } else if (reduce_numel == 1) { reduce_block = 1; if (spatial_is_dynamic) { spatial_block = 1024; diff --git a/paddle/cinn/hlir/pe/reduction.cc b/paddle/cinn/hlir/pe/reduction.cc index 605a1b3d6443f..a6b444f9865bd 100644 --- a/paddle/cinn/hlir/pe/reduction.cc +++ b/paddle/cinn/hlir/pe/reduction.cc @@ -129,6 +129,13 @@ void GetOutputShape(const std::vector& real_axes, if (output_shape->empty()) { output_shape->push_back(cinn::common::make_one()); } + + CHECK(!tensor->shape.empty()); + if (tensor->shape[0]->type() == Int(64)) { + for (auto& shape_item : *output_shape) { + shape_item->convert_int32_to_int64(); + } + } } /*! diff --git a/paddle/cinn/ir/group_schedule/tactic/schedule_tactic.h b/paddle/cinn/ir/group_schedule/tactic/schedule_tactic.h index ef3d4817949b2..c4e37ca7df613 100644 --- a/paddle/cinn/ir/group_schedule/tactic/schedule_tactic.h +++ b/paddle/cinn/ir/group_schedule/tactic/schedule_tactic.h @@ -85,6 +85,8 @@ struct GroupTileInfo { int64_t reduce_inner_num; int64_t reduce_block; + bool is_reduce_all{false}; + std::set reduce_tensor_names; std::set temp_var_names; diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc index 165242258ef1b..035a59ae9582c 100644 --- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc @@ -71,6 +71,9 @@ void TileFirstGeneralTactic::Init(ScheduleContext* context) { context_ = context; reduce_current_axis_ = IsInnerThreadSpatialLoopGT(context_->group_tile_info, 1) ? 
2 : 1; + if (context_->group_tile_info->is_reduce_all) { + reduce_current_axis_ = 0; + } // reduce axis have be re-order to last vec_flatten_axis_.clear(); vec_reduce_axis_.clear(); @@ -135,9 +138,12 @@ void TileFirstGeneralTactic::MergeReduceAxis(ir::IRSchedule* sch, std::vector fuse_axis = vec_reduce_axis_; if (vec_reduce_axis_.size() >= 2) { for (size_t i = 0; i < fuse_axis.size(); ++i) { - fuse_axis[i] -= (vec_flatten_axis_.size() - 1); + if (vec_flatten_axis_.size() > 2) { + fuse_axis[i] -= (vec_flatten_axis_.size() - 1); + } } } + if (vec_reduce_axis_.size() >= 2 && !ir::IsReduceInitTensorName(block_id)) { sch->Fuse(block_id, fuse_axis); } @@ -160,7 +166,8 @@ void TileFirstGeneralTactic::SplitReduceInner(ir::IRSchedule* sch, auto loops = sch->GetLoops(block_id); auto reduce_loop = loops[reduce_current_axis_].As(); - if (ir::GetLoopExtent(reduce_loop) == 1) { + if (reduce_loop->extent.is_constant() && + ir::GetLoopExtent(reduce_loop) == 1) { return; } @@ -168,7 +175,10 @@ void TileFirstGeneralTactic::SplitReduceInner(ir::IRSchedule* sch, return context_->group_tile_info->reduce_block >= num; }; std::vector split_factors; - if (IsReduceBlockGE(2048)) { + if (context_->group_tile_info->is_reduce_all) { + split_factors.push_back(256); + split_factors.push_back(-1); + } else if (IsReduceBlockGE(2048)) { split_factors.emplace_back( std::ceil(context_->group_tile_info->reduce_numel * 1.0 / context_->group_tile_info->reduce_inner_num)); @@ -241,19 +251,27 @@ void TileFirstGeneralTactic::Unroll(ir::IRSchedule* sch, const std::string& block_id) { auto loops = sch->GetLoops(block_id); if (loops.size() > 2) { - sch->Unroll(loops[2]); + if (loops[2].As()->extent.is_constant()) { + sch->Unroll(loops[2]); + } } if (loops.size() > 3) { - sch->Unroll(loops[3]); + if (loops[3].As()->extent.is_constant()) { + sch->Unroll(loops[3]); + } } if (IsReduceBlock(context_->group_tile_info, block_id)) { auto loops = sch->GetLoops(block_id + "_rf"); if (loops.size() > 2) { - sch->Unroll(loops[2]); + if (loops[2].As()->extent.is_constant()) { + sch->Unroll(loops[2]); + } } if (loops.size() > 3) { - sch->Unroll(loops[3]); + if (loops[3].As()->extent.is_constant()) { + sch->Unroll(loops[3]); + } } } } @@ -289,7 +307,7 @@ void TileFirstGeneralTactic::SetReduceType(ir::IRSchedule* sch, void TileFirstGeneralTactic::BindCudaInfo(ir::IRSchedule* sch, const std::string& block_id) { auto loops = sch->GetLoops(block_id); - if (loops.size() == 1) { + if (loops.size() == 1 || context_->group_tile_info->is_reduce_all) { sch->Split(loops[0], std::vector({1, -1})); } @@ -299,6 +317,11 @@ void TileFirstGeneralTactic::BindCudaInfo(ir::IRSchedule* sch, if (IsReduceBlock(context_->group_tile_info, block_id)) { auto loops = sch->GetLoops(block_id + "_rf"); + if (context_->group_tile_info->is_reduce_all) { + sch->Split(loops[0], std::vector({1, -1})); + } + + loops = sch->GetLoops(block_id + "_rf"); sch->Bind(loops[0], "blockIdx.x"); sch->Bind(loops[1], "threadIdx.x"); } diff --git a/paddle/cinn/optim/resize_buffer.cc b/paddle/cinn/optim/resize_buffer.cc index e73929a97aa57..1f925f653b492 100644 --- a/paddle/cinn/optim/resize_buffer.cc +++ b/paddle/cinn/optim/resize_buffer.cc @@ -16,6 +16,7 @@ #include #include "paddle/cinn/common/cas.h" +#include "paddle/cinn/common/integer_set.h" #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/ir_printer.h" @@ -168,8 +169,20 @@ class AnalyzeLoopVarRange : public ir::IRMutator<> { } } ir::Expr tmp = ir::Add::Make(copy, ir::Expr(1)); - ir::Expr simplify 
= common::AutoSimplify(tmp); - return simplify; + ir::Expr simplified = common::AutoSimplify(tmp); + if (simplified.As()) { + ir::Expr lhs = simplified.As()->a(); + ir::Expr rhs = simplified.As()->b(); + common::cas_intervals_t var_intervals = + common::CollectVarIntervalsOfExprs({lhs, rhs}); + common::SymbolicExprAnalyzer analyzer(var_intervals); + if (analyzer.ProveLE(lhs, rhs)) { + return lhs; + } else if (analyzer.ProveGE(lhs, rhs)) { + return rhs; + } + } + return simplified; } public: diff --git a/test/cpp/pir/cinn/group_op_test.cc b/test/cpp/pir/cinn/group_op_test.cc index e4ac41a7b9c52..5be7a107b4c60 100644 --- a/test/cpp/pir/cinn/group_op_test.cc +++ b/test/cpp/pir/cinn/group_op_test.cc @@ -19,6 +19,7 @@ #include "paddle/cinn/hlir/dialect/operator/ir/manual_op.h" #include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/divide_group_op_to_fusion_op_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/lower_cinn_fusion_op_pass.h" #include "paddle/cinn/hlir/framework/pir/group.h" @@ -209,7 +210,7 @@ TEST(GroupOp, CINNLowering) { pir::IrContext* ctx = pir::IrContext::Instance(); pir::PassManager pass_manager(ctx); - pass_manager.AddPass(cinn::dialect::ir::CreateDivideGroupOpToFusionOpPass()); + pass_manager.AddPass(cinn::dialect::ir::CreateCinnGroupClusterPass()); pass_manager.AddPass(cinn::dialect::ir::CreateLowerCinnFusionOpPass()); pass_manager.Run(program.get()); diff --git a/test/cpp/pir/cinn/jit_instruction_test.cc b/test/cpp/pir/cinn/jit_instruction_test.cc index 418cad2a7d96e..e13bf1965a592 100644 --- a/test/cpp/pir/cinn/jit_instruction_test.cc +++ b/test/cpp/pir/cinn/jit_instruction_test.cc @@ -48,18 +48,18 @@ std::unique_ptr<::pir::Program> BuildProgram() { const float value = 0.5; auto full_op_x = - builder.Build(std::vector{2, 2}, + builder.Build(std::vector{8, 8}, value, phi::DataType::FLOAT32, phi::GPUPlace()); auto full_op_y = - builder.Build(std::vector{2, 2}, + builder.Build(std::vector{8, 8}, value, phi::DataType::FLOAT32, phi::GPUPlace()); auto full_op_z = - builder.Build(std::vector{2, 2}, + builder.Build(std::vector{8, 8}, value, phi::DataType::FLOAT32, phi::GPUPlace()); @@ -103,6 +103,7 @@ TEST(CinnJitInstruction, Run) { std::vector<::pir::Operation*> ops = {it}; auto group = std::make_shared(ops); + group->loop_ranges = std::vector{8, 8}; group->output_values.push_back(it->result(0)); auto fn_ptr_res = ir_compiler->BuildCUDAJITInfo({group}); std::unordered_map op_attrs{ diff --git a/test/ir/pir/cinn/CMakeLists.txt b/test/ir/pir/cinn/CMakeLists.txt index 800a132f6d124..0ff3662fe190c 100644 --- a/test/ir/pir/cinn/CMakeLists.txt +++ b/test/ir/pir/cinn/CMakeLists.txt @@ -11,7 +11,8 @@ if(WITH_GPU) string(REPLACE ".py" "" CINN_PIR_TEST "${CINN_PIR_TEST}") # The following UT is enabled manually by add_test - list(REMOVE_ITEM CINN_PIR_TEST test_subgraph_checker test_rms_norm test_rope) + list(REMOVE_ITEM CINN_PIR_TEST test_subgraph_checker test_rms_norm test_rope + test_cinn_ops) foreach(cinn_pir_test_name ${CINN_PIR_TEST}) add_test( @@ -36,6 +37,16 @@ if(WITH_GPU) WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(test_subgraph_checker PROPERTIES LABELS "RUN_TYPE=CINN") + add_test( + NAME test_cinn_ops + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} + FLAGS_enable_pir_api=1 FLAGS_group_schedule_tiling_first=1 + 
FLAGS_cinn_bucket_compile=True ${PYTHON_EXECUTABLE} + ${CMAKE_CURRENT_SOURCE_DIR}/test_cinn_ops.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + set_tests_properties(test_subgraph_checker PROPERTIES LABELS "RUN_TYPE=CINN") # add_test( # NAME test_rms_norm_seq_len_symbolic # COMMAND diff --git a/test/ir/pir/cinn/inference/CMakeLists.txt b/test/ir/pir/cinn/inference/CMakeLists.txt index c5ff7c9573d5e..e75440eecd599 100644 --- a/test/ir/pir/cinn/inference/CMakeLists.txt +++ b/test/ir/pir/cinn/inference/CMakeLists.txt @@ -13,7 +13,7 @@ if(WITH_GPU) PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} FLAGS_prim_enable_dynamic=True FLAGS_prim_all=True FLAGS_enable_pir_api=1 FLAGS_cinn_bucket_compile=True - ${PYTHON_EXECUTABLE} + FLAGS_group_schedule_tiling_first=1 ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/${cinn_pir_test_name}.py WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(${cinn_pir_test_name} PROPERTIES LABELS diff --git a/test/ir/pir/cinn/inference/test_llama_while.py b/test/ir/pir/cinn/inference/test_llama_while.py index 0afa041f5baa3..27a241dc016f6 100644 --- a/test/ir/pir/cinn/inference/test_llama_while.py +++ b/test/ir/pir/cinn/inference/test_llama_while.py @@ -33,10 +33,9 @@ def __init__(self): def forward(self, logits, input_ids): batch_size, cur_len = paddle.shape(input_ids) - unfinished_flag = paddle.full([batch_size, 1], True, dtype="bool") + unfinished_flag = paddle.full([batch_size, 1], True, dtype="float32") max_new_tokens = paddle.full([1], 16, dtype="int64") while cur_len < max_new_tokens and paddle.any(unfinished_flag): - last_token = input_ids[:, -1] # [batch_size, vocab_size] probs = F.softmax(logits[:, -1, :]) @@ -48,9 +47,9 @@ def forward(self, logits, input_ids): ) _, next_tokens = paddle.tensor.top_p_sampling(probs, top_ps_tensor) input_ids = paddle.concat([input_ids, next_tokens], axis=1) - paddle.increment(cur_len) + cur_len += 1 - return input_ids, last_token + return input_ids class TestLlamaPostProcess(unittest.TestCase): @@ -75,18 +74,15 @@ def eval(self, use_cinn): ] net = utils.apply_to_static(net, use_cinn, input_spec) net.eval() - out, _ = net(self.logits, self.input_ids) - if use_cinn: - self.check_jit_kernel_info(net.forward) + out = net(self.logits, self.input_ids) return out def test_eval(self): dy_out = self.eval(use_cinn=False) - if utils.unittest_use_cinn(): - cinn_out = self.eval(use_cinn=True) - np.testing.assert_allclose( - cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 - ) + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) if __name__ == '__main__': diff --git a/test/ir/pir/cinn/symbolic/CMakeLists.txt b/test/ir/pir/cinn/symbolic/CMakeLists.txt index 97d918e0832b1..5bd1991ac971b 100644 --- a/test/ir/pir/cinn/symbolic/CMakeLists.txt +++ b/test/ir/pir/cinn/symbolic/CMakeLists.txt @@ -21,8 +21,7 @@ if(WITH_GPU) test_multiple_subgraph_dy.py test_llama_mlp_st.py test_llama_mlp_dy.py - test_while_st.py - test_while_dy.py) + test_while_st.py) foreach(cinn_pir_test_name ${CINN_PIR_SYMBOLIC_TEST}) string(REGEX REPLACE ".py" "" cinn_pir_test_name ${cinn_pir_test_name}) @@ -217,14 +216,4 @@ if(WITH_GPU) WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(test_while_st PROPERTIES LABELS "RUN_TYPE=CINN") - add_test( - NAME test_while_dy - COMMAND - ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} - FLAGS_cinn_bucket_compile=True FLAGS_enable_pir_api=1 ${PYTHON_EXECUTABLE} - 
${CMAKE_CURRENT_SOURCE_DIR}/test_while_dy.py - WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) - set_tests_properties(test_while_dy PROPERTIES LABELS "RUN_TYPE=CINN") - endif() diff --git a/test/ir/pir/cinn/symbolic/test_while_dy.py b/test/ir/pir/cinn/symbolic/test_while_dy.py index 627d03ab838c5..bb50ef67bdbb6 100644 --- a/test/ir/pir/cinn/symbolic/test_while_dy.py +++ b/test/ir/pir/cinn/symbolic/test_while_dy.py @@ -39,6 +39,7 @@ def forward(self, x): x = paddle.exp(x) - x loop_count += 1 x = paddle.exp(x) + return x @@ -64,17 +65,14 @@ def eval(self, use_cinn): net = utils.apply_to_static(net, use_cinn, input_spec) net.eval() out = net(self.x) - if use_cinn: - self.check_jit_kernel_info(net.forward) return out def test_eval(self): dy_out = self.eval(use_cinn=False) - if utils.unittest_use_cinn(): - cinn_out = self.eval(use_cinn=True) - np.testing.assert_allclose( - cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 - ) + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) if __name__ == '__main__': diff --git a/test/ir/pir/cinn/test_cinn_ops.py b/test/ir/pir/cinn/test_cinn_ops.py index 9e756c23680fd..c2fc0fa0d8a4b 100644 --- a/test/ir/pir/cinn/test_cinn_ops.py +++ b/test/ir/pir/cinn/test_cinn_ops.py @@ -67,14 +67,14 @@ def test_eval(self): self.check_eval() -class TestIsCloseOp(TestOpsBase): - def prepare_info(self): - self.fn = paddle.isclose - self.expected_jit_kernel_number = 1 - self.expected_jit_kernel_structure = {utils.JIT_KERNEL_NAME: 1} - - def test_eval(self): - self.check_eval() +# class TestIsCloseOp(TestOpsBase): +# def prepare_info(self): +# self.fn = paddle.isclose +# self.expected_jit_kernel_number = 1 +# self.expected_jit_kernel_structure = {utils.JIT_KERNEL_NAME: 1} + +# def test_eval(self): +# self.check_eval() if __name__ == '__main__': From bc56513ce46c5122d67c544711ef764104ae909d Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sat, 9 Mar 2024 22:55:20 +0800 Subject: [PATCH 088/114] dist.to_static support pir program (#62560) * auto_parallel engine build pir program * skip prepare_op_amp_options in build_program * add ut * fix cmake * remove print --- .../dialect/distributed/ir/dist_dialect.cc | 35 +++++- .../auto_parallel/static/dist_input_spec.py | 3 + .../auto_parallel/static/engine.py | 19 ++- python/paddle/jit/dy2static/function_spec.py | 35 ++++++ test/auto_parallel/CMakeLists.txt | 1 + test/auto_parallel/pir/CMakeLists.txt | 5 + .../pir/test_to_static_pir_program.py | 115 ++++++++++++++++++ 7 files changed, 209 insertions(+), 4 deletions(-) create mode 100644 test/auto_parallel/pir/CMakeLists.txt create mode 100644 test/auto_parallel/pir/test_to_static_pir_program.py diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc index 4795b09b936e5..4907cf033d560 100644 --- a/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc +++ b/paddle/fluid/pir/dialect/distributed/ir/dist_dialect.cc @@ -13,11 +13,13 @@ // limitations under the License. 
#include "paddle/fluid/pir/dialect/distributed/ir/dist_dialect.h" + #include "paddle/fluid/pir/dialect/distributed/ir/attribute_storage.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_op.h" #include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h" #include "paddle/fluid/pir/dialect/distributed/ir/type_storage.h" +#include "paddle/phi/core/distributed/auto_parallel/utils.h" REGISTER_FILE_SYMBOLS(dist_dialect); namespace paddle { @@ -39,7 +41,19 @@ void DistDialect::initialize() { void DistDialect::PrintType(pir::Type type, std::ostream &os) const { if (auto dist_dense_tensor_type = type.dyn_cast()) { // Todo: Design the dist dense tensor type print format. - os << dist_dense_tensor_type.dense_tensor_type(); + os << type.dialect().name(); + os << '.'; + if (auto tensor_type = type.dyn_cast()) { + os << "tensor<"; + for (auto d : common::vectorize(tensor_type.dims())) { + os << d; + os << "x"; + } + tensor_type.dtype().Print(os); + os << ", "; + PrintAttribute(dist_dense_tensor_type.tensor_dist_attr(), os); + os << ">"; + } } else { os << "error_type!"; } @@ -47,10 +61,25 @@ void DistDialect::PrintType(pir::Type type, std::ostream &os) const { void DistDialect::PrintAttribute(pir::Attribute attr, std::ostream &os) const { if (auto process_mesh_attr = attr.dyn_cast()) { - os << process_mesh_attr.process_mesh(); + os << "mesh: " << process_mesh_attr.process_mesh(); } else if (auto tensor_dist_attr = attr.dyn_cast()) { // Todo: Design the tensor dist attr print format. - os << tensor_dist_attr.process_mesh_attr().process_mesh(); + os << "mesh: " << tensor_dist_attr.process_mesh_attr().process_mesh(); + os << ", dims_mappings: [" + + phi::distributed::auto_parallel::str_join( + tensor_dist_attr.dims_mapping()) + + "]"; + if (tensor_dist_attr.partial_status().size() > 0) { + std::vector partial_status_strs; + for (auto &itr : tensor_dist_attr.partial_status()) { + std::string s = "partial(" + std::to_string(itr.first) + "," + + phi::ReduceTypeStrings[static_cast(itr.second)] + + ")"; + partial_status_strs.emplace_back(s); + } + os << ", " + << phi::distributed::auto_parallel::str_join(partial_status_strs); + } } else { os << "error_attribute_type"; } diff --git a/python/paddle/distributed/auto_parallel/static/dist_input_spec.py b/python/paddle/distributed/auto_parallel/static/dist_input_spec.py index 65fc963937ecb..5bb15901f277a 100644 --- a/python/paddle/distributed/auto_parallel/static/dist_input_spec.py +++ b/python/paddle/distributed/auto_parallel/static/dist_input_spec.py @@ -29,11 +29,13 @@ def __init__( stop_gradient=False, mesh=None, placements=None, + local_shape=None, ): super().__init__(shape, dtype, name, stop_gradient) self.mesh = copy.deepcopy(mesh) sharding_specs = get_shard_spec(mesh, placements, len(self.shape)) self.dims_mapping = convert_to_dims_mapping(sharding_specs, mesh) + self.local_shape = local_shape @classmethod def from_dtensor(cls, dtensor, name=None): @@ -53,6 +55,7 @@ def from_dtensor(cls, dtensor, name=None): stop_gradient=dtensor.stop_gradient, mesh=dtensor.process_mesh, placements=dtensor.placements, + local_shape=dtensor._local_value().shape, ) def __repr__(self): diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py index 2215dc9475117..3400ba2dc8983 100644 --- a/python/paddle/distributed/auto_parallel/static/engine.py +++ b/python/paddle/distributed/auto_parallel/static/engine.py @@ -239,6 +239,9 @@ def 
__init__(
         self._dygraph_mode = False
         self._tuning = self._strategy.tuning
         self._acc_steps = 1
+        self._in_pir_mode = paddle.base.framework.get_flags(
+            "FLAGS_enable_pir_api"
+        )["FLAGS_enable_pir_api"]
         if self._strategy.gradient_merge.enable:
             self._acc_steps = self._strategy.gradient_merge.k_steps
         elif self._strategy.pipeline.enable:
@@ -618,6 +621,9 @@ def _prepare_logger(
     def _prepare_program(self, mode, init_parameters=True):
         # Do the build process
         self._build(mode)
+        # TODO(zhiqiu): fit the processes below for pir
+        if self._in_pir_mode:
+            return
         # Do the planning process
         self._plan(mode)
         # Do the parallel process
@@ -676,7 +682,7 @@ def _build(self, mode):
 
             self._inputs = self.program_helper.input_vars
             self._labels = self.program_helper.label_vars
-            self._process_dist_input_specs()
+            # self._process_dist_input_specs()
             outputs = self.program_helper.output_vars
             self._losses = self.program_helper.loss_vars
             metrics = self.program_helper.metric_vars
@@ -729,6 +735,17 @@ def _build(self, mode):
             ), "the type of `loss` of the Engine arguments should be Variable."
             self._losses = auto_utils.to_list(self._loss)
 
+        # TODO(zhiqiu): distributed_context is no longer used in pir_program,
+        # so just return here; the logic below still needs to be reimplemented
+        # for pir
+        if self._in_pir_mode:
+            if mode != "train":
+                self._fwd_main_progs[mode] = serial_main_prog.clone(
+                    for_test=True
+                )
+            else:
+                self._fwd_main_progs[mode] = serial_main_prog
+            return
+
         default_ctx = get_default_distributed_context()
         if not default_ctx.has_annotation:
             # We build the world process group because the data parallel
diff --git a/python/paddle/jit/dy2static/function_spec.py b/python/paddle/jit/dy2static/function_spec.py
index 2e1752eb8f9f3..65e1b7f4c0481 100644
--- a/python/paddle/jit/dy2static/function_spec.py
+++ b/python/paddle/jit/dy2static/function_spec.py
@@ -194,6 +194,20 @@ def pir_to_static_inputs_with_spec(self, input_with_spec, main_program):
                     dtype=convert_dtype(var_spec.dtype),
                 )
                 feed_value.stop_gradient = stop_gradient
+
+                # wrap dist tensor
+                from paddle.distributed.auto_parallel.static.dist_input_spec import (
+                    DistributedInputSpec,
+                )
+
+                if isinstance(var_spec, DistributedInputSpec):
+                    dist_dense_tensor_type = paddle.base.libpaddle.pir.create_dist_dense_tensor_type_by_dense_tensor(
+                        feed_value.type(),
+                        var_spec.local_shape,
+                        var_spec.mesh,
+                        var_spec.dims_mapping,
+                    )
+                    feed_value.set_type(dist_dense_tensor_type)
             else:
                 feed_value = var_spec
             inputs.append(feed_value)
@@ -225,8 +239,29 @@ def to_static_inputs_with_spec(self, input_with_spec, main_program):
                     need_check_feed=False,
                     stop_gradient=stop_gradient,
                 )
+                # wrap dist tensor
+                from paddle.distributed.auto_parallel.static.dist_input_spec import (
+                    DistributedInputSpec,
+                )
+                from paddle.distributed.auto_parallel.static.dist_tensor import (
+                    DistributedTensor,
+                )
+
+                if isinstance(var_spec, DistributedInputSpec):
+                    from paddle.distributed.auto_parallel.static.dist_context import (
+                        get_default_distributed_context,
+                    )
+
+                    default_dist_ctx = get_default_distributed_context()
+                    dist_tensor = DistributedTensor(feed_layer)
+                    dist_tensor.dist_attr.process_mesh = var_spec.mesh
+                    dist_tensor.dist_attr.dims_mapping = var_spec.dims_mapping
+                    dist_tensor.dist_attr.mark_annotated("process_mesh")
+                    dist_tensor.dist_attr.mark_annotated("dims_mapping")
+                    default_dist_ctx.add_dist_tensor_for_program(dist_tensor)
             else:
                 feed_layer = var_spec
+
             inputs.append(feed_layer)
 
         return paddle.utils.pack_sequence_as(input_with_spec, inputs)
diff --git a/test/auto_parallel/CMakeLists.txt
b/test/auto_parallel/CMakeLists.txt index 1d448cb5f6ecb..ca1bd30aa03ae 100644 --- a/test/auto_parallel/CMakeLists.txt +++ b/test/auto_parallel/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(spmd_rules) add_subdirectory(hybrid_strategy) add_subdirectory(custom_op) +add_subdirectory(pir) if(WITH_DISTRIBUTE AND WITH_GPU) diff --git a/test/auto_parallel/pir/CMakeLists.txt b/test/auto_parallel/pir/CMakeLists.txt new file mode 100644 index 0000000000000..65e827d046313 --- /dev/null +++ b/test/auto_parallel/pir/CMakeLists.txt @@ -0,0 +1,5 @@ +if(WITH_DISTRIBUTE AND WITH_GPU) + py_test_modules(test_to_static_pir_program MODULES test_to_static_pir_program) + set_tests_properties(test_to_static_pir_program + PROPERTIES ENVIRONMENT "FLAGS_enable_pir_api=1") +endif() diff --git a/test/auto_parallel/pir/test_to_static_pir_program.py b/test/auto_parallel/pir/test_to_static_pir_program.py new file mode 100644 index 0000000000000..dc980a6cb8f8d --- /dev/null +++ b/test/auto_parallel/pir/test_to_static_pir_program.py @@ -0,0 +1,115 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.distributed as dist +from paddle import nn +from paddle.distributed import Shard +from paddle.io import DataLoader + +BATCH_SIZE = 4 +BATCH_NUM = 4 +IMAGE_SIZE = 16 +CLASS_NUM = 8 +np.random.seed(2024) + + +class RandomDataset(paddle.io.Dataset): + def __init__(self, images, labels, num_samples): + self.images = images + self.labels = labels + self.num_samples = num_samples + + def __getitem__(self, idx): + return self.images[idx], self.labels[idx] + + def __len__(self): + return self.num_samples + + +class DemoNet(nn.Layer): + def __init__(self, mesh): + super().__init__() + self._mesh = mesh + self.linear_0 = nn.Linear(IMAGE_SIZE, IMAGE_SIZE) + self.linear_1 = nn.Linear(IMAGE_SIZE, CLASS_NUM) + self.relu = nn.ReLU() + # shard the weights of this layer + self.linear_0.weight = dist.shard_tensor( + self.linear_0.weight, + self._mesh, + [Shard(1)], + stop_gradient=False, + ) + self.linear_1.weight = dist.shard_tensor( + self.linear_1.weight, + self._mesh, + [Shard(0)], + stop_gradient=False, + ) + + def forward(self, x): + out = self.linear_0(x) + out = self.relu(out) + out = self.linear_1(out) + return out + + +def create_data_loader(): + images = np.random.rand(BATCH_SIZE, IMAGE_SIZE).astype('float32') + labels = np.random.rand(BATCH_SIZE, CLASS_NUM).astype('float32') + dataset = RandomDataset(images, labels, BATCH_SIZE) + loader = DataLoader(dataset, batch_size=BATCH_SIZE) + return loader + + +class TestToStaticPirProgram(unittest.TestCase): + def test_to_static_program(self): + paddle.base.set_flags({'FLAGS_enable_pir_api': 1}) + mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + layer = DemoNet(mesh) + opt = paddle.optimizer.SGD( + learning_rate=0.1, parameters=layer.parameters() + ) + loss_fn = nn.MSELoss() + loader = create_data_loader() + dist_loader = dist.shard_dataloader(loader, 
meshes=[mesh]) + dist_model = dist.to_static(layer, dist_loader, loss_fn, opt) + + dist_model.train() + main_program = dist_model._engine._fwd_main_progs["train"] + for op in main_program.global_block().ops: + tensor = op.result(0) + if op.name() == 'pd_op.data': + self.assertTrue(tensor.is_dist_dense_tensor_type()) + self.assertEqual(tensor.process_mesh.shape, [2]) + self.assertEqual(tensor.process_mesh.process_ids, [0, 1]) + self.assertEqual(tensor.dims_mapping, [-1, -1]) + self.assertEqual(tensor.partial_dims, set()) + else: + self.assertTrue(tensor.is_dense_tensor_type()) + self.assertFalse(tensor.is_dist_dense_tensor_type()) + + # training + # dist_model.train() + # for batch_id, (image, label) in enumerate(dist_loader()): + # loss = dist_model(image, label) + + +if __name__ == "__main__": + unittest.main() From 4117a52c06dbc0e18b24b0eb12854f3876678639 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Sun, 10 Mar 2024 09:27:23 +0800 Subject: [PATCH 089/114] fix group cluster shape dialect bug (#62545) From 8de49de7f4125d677302ef40838fbbcb4fa6c778 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Sun, 10 Mar 2024 10:15:28 +0800 Subject: [PATCH 090/114] [CINN] EliminateCommonGlobalVar pass, optimize performance (#62517) * [CINN] EliminateCommonGlobalVar pass, optimize performance * std::cerr->VLOG * Fix trick codes * CHECK->PADDLE_ENFORCE * Fix typo --- .../hlir/framework/pir/op_lowering_impl.cc | 2 + paddle/cinn/optim/CMakeLists.txt | 3 +- .../eliminate_common_global_memory_read.cc | 284 ++++++++++++++++++ .../eliminate_common_global_memory_read.h | 28 ++ 4 files changed, 316 insertions(+), 1 deletion(-) create mode 100644 paddle/cinn/optim/eliminate_common_global_memory_read.cc create mode 100644 paddle/cinn/optim/eliminate_common_global_memory_read.h diff --git a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc index 110616885b768..1ff0a452634ae 100644 --- a/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc +++ b/paddle/cinn/hlir/framework/pir/op_lowering_impl.cc @@ -30,6 +30,7 @@ #include "paddle/cinn/ir/group_schedule/st_shape_group_scheduler.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/lang/placeholder.h" +#include "paddle/cinn/optim/eliminate_common_global_memory_read.h" #include "paddle/cinn/optim/schedule_block_dce.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" #include "paddle/common/ddim.h" @@ -890,6 +891,7 @@ std::vector OpLowererImpl::PostProcess( for (ir::Expr func_body : func_bodies) { optim::EliminateDeadScheduleBlock(&(func_body), group->output_names); #ifdef CINN_WITH_CUDA + optim::EliminateCommonGlobalMemoryRead(&(func_body)); optim::OptimizeExprGPU(&(func_body)); #endif diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt index c4935d1a8eecb..36744a516bd95 100755 --- a/paddle/cinn/optim/CMakeLists.txt +++ b/paddle/cinn/optim/CMakeLists.txt @@ -30,7 +30,8 @@ gather_srcs( update_buffer_axis_pass.cc trans_buffer_with_dynamic_shape.cc schedule_block_dce.cc - eliminate_common_factor_of_local_index.cc) + eliminate_common_factor_of_local_index.cc + eliminate_common_global_memory_read.cc) if(WITH_CUDA) gather_srcs(cinnapi_src SRCS transform_gpu_forloop.cc) diff --git a/paddle/cinn/optim/eliminate_common_global_memory_read.cc b/paddle/cinn/optim/eliminate_common_global_memory_read.cc new file mode 100644 index 0000000000000..52c0e8cd1bb6f --- /dev/null +++ b/paddle/cinn/optim/eliminate_common_global_memory_read.cc @@ -0,0 
+1,284 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/cinn/optim/eliminate_common_global_memory_read.h" + +#include "paddle/cinn/common/cas.h" +#include "paddle/cinn/ir/ir_mutator.h" +#include "paddle/cinn/ir/ir_printer.h" +#include "paddle/cinn/ir/utils/ir_compare.h" +#include "paddle/cinn/ir/utils/ir_copy.h" +#include "paddle/cinn/optim/replace_var_with_expr.h" +#include "paddle/common/enforce.h" + +namespace cinn { +namespace optim { + +namespace { + +struct ForVarExtent { + ir::Var loop_var; + ir::Expr extent; +}; + +struct IndicesAndExtent { + std::vector indices; + std::vector for_var_extents; +}; + +std::unordered_map ConstructForVarReplaceMap( + const std::vector& lhs_extents, + const std::vector& rhs_extents) { + std::unordered_map ret; + std::unordered_set visited_rhs_index; + for (const auto& [lhs_var, lhs_extent] : lhs_extents) { + for (std::size_t i = 0; i < rhs_extents.size(); ++i) { + const auto& [rhs_var, rhs_extent] = rhs_extents[i]; + if (cinn::common::AutoSimplify(ir::Sub::Make(lhs_extent, rhs_extent)) == + ir::Expr(0) && + visited_rhs_index.count(i) == 0) { + ret[lhs_var] = rhs_var; + visited_rhs_index.insert(i); + break; + } + } + } + return ret; +} + +struct GlobalTensorInfoCollector : public ir::IRMutator { + public: + void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + + std::unordered_set GetEliminateBufferNames() const { + auto IndiceToExprWithForVar = + [&](ir::Expr indice, + const std::unordered_map& for_var_map) + -> ir::Expr { + ir::Expr ret = ir::ir_utils::IRCopy(indice); + for (const auto& [lhs_var, rhs_var] : for_var_map) { + ReplaceVarWithExpr(&ret, lhs_var, ir::ir_utils::IRCopy(rhs_var)); + } + return ret; + }; + + auto IndiceAndExtentEqual = + [&](const IndicesAndExtent& indice_and_extent1, + const IndicesAndExtent& indice_and_extent2) -> bool { + const auto& indice1 = indice_and_extent1.indices; + const auto& indice2 = indice_and_extent2.indices; + if (indice1.size() != indice2.size()) return false; + + std::unordered_map for_var_map = + ConstructForVarReplaceMap(indice_and_extent1.for_var_extents, + indice_and_extent2.for_var_extents); + + for (size_t i = 0; i < indice1.size(); ++i) { + ir::Expr lhs = IndiceToExprWithForVar(indice1.at(i), for_var_map); + ir::Expr rhs = IndiceToExprWithForVar(indice2.at(i), for_var_map); + if (cinn::common::AutoSimplify(ir::Sub::Make(lhs, rhs)) != + ir::Expr(0)) { + return false; + } + } + return true; + }; + + auto AllIndiceAndExtentEqual = + [&](const std::vector& indice_and_extent) -> bool { + PADDLE_ENFORCE_GE( + indice_and_extent.size(), + 2, + ::common::errors::InvalidArgument( + "The size of indice_and_extent should greater_equal to 2")); + for (size_t i = 1; i < indice_and_extent.size(); ++i) { + if (!IndiceAndExtentEqual(indice_and_extent[0], indice_and_extent[i])) + return false; + } + return true; + }; + + auto IsGlobalTensorNeedEliminate = + [&](const std::vector& indice_and_extent) -> bool 
{ + if (indice_and_extent.size() <= 1) return false; + return AllIndiceAndExtentEqual(indice_and_extent); + }; + + std::unordered_set global_buffer_name; + for (const auto& [buffer_name, indice_and_extent] : + buffer_to_indice_and_extent_) { + if (IsGlobalTensorNeedEliminate(indice_and_extent)) { + global_buffer_name.insert(buffer_name); + } + } + return global_buffer_name; + } + + private: + void Visit(const ir::ScheduleBlockRealize* op, ir::Expr* expr) override { + const auto* sbr_node = expr->As(); + CHECK(sbr_node); + const auto& iter_values = sbr_node->iter_values; + const auto* sb_node = sbr_node->schedule_block.As(); + const auto& iter_vars = sb_node->iter_vars; + PADDLE_ENFORCE_EQ( + iter_values.size(), + iter_vars.size(), + ::common::errors::InvalidArgument( + "The size of iter_values should equal to the size of iter_vars, as " + "they comes from the same ScheduleBlockRealize")); + + for (std::size_t i = 0; i < iter_values.size(); ++i) { + var_to_sb_expr_[iter_vars[i]] = iter_values[i]; + } + ir::IRMutator<>::Visit(op, expr); + } + + void Visit(const ir::For* op, ir::Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + for_var_extents_.push_back( + {node->loop_var, ir::ir_utils::IRCopy(node->extent)}); + ir::IRMutator<>::Visit(op, expr); + for_var_extents_.pop_back(); + } + + void Visit(const ir::Load* op, ir::Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + const auto& load_buffer = node->tensor.as_tensor_ref()->buffer; + if (load_buffer->memory_type == ir::MemoryType::Heap) { + std::vector tensor_indices; + for (const auto& indice : node->indices) { + ir::Expr new_indice = ir::ir_utils::IRCopy(indice); + for (const auto& [var, sb_expr] : var_to_sb_expr_) { + ReplaceVarWithExpr(&new_indice, var, ir::ir_utils::IRCopy(sb_expr)); + } + tensor_indices.push_back(new_indice); + } + buffer_to_indice_and_extent_[load_buffer->name].push_back( + {tensor_indices, for_var_extents_}); + } + } + + std::vector for_var_extents_; + std::unordered_map var_to_sb_expr_; + std::unordered_map> + buffer_to_indice_and_extent_; +}; + +struct CommonGlobalMemoryEliminator : public ir::IRMutator { + CommonGlobalMemoryEliminator( + const std::unordered_set& eliminate_buffer_names) + : eliminate_buffer_names_(eliminate_buffer_names) {} + + void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } + + private: + void Visit(const ir::Block* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + current_block_ = node; + IRMutator<>::Visit(op, expr); + } + + void Visit(const ir::ScheduleBlockRealize* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + current_sbr_ = node; + IRMutator<>::Visit(op, expr); + } + + void Visit(const ir::Load* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + const auto& buffer_name = node->tensor.as_tensor_ref()->buffer->name; + if (eliminate_buffer_names_.count(buffer_name) == 0) { + return; + } + + if (global_buffer_to_local_buffer_.count(buffer_name) == 0) { + InsertLocalTensorBlock(node, buffer_name); + } + SubstituteGlobalTensor(node, buffer_name); + } + + void InsertLocalTensorBlock(ir::Load* load_node, + const std::string& buffer_name) { + ir::Expr sb = ir::ir_utils::IRCopy(current_sbr_->schedule_block); + ir::ScheduleBlock* sb_node = sb.As(); + CHECK(sb_node); + + const auto& old_tensor = load_node->tensor.as_tensor_ref(); + ir::Expr new_tensor = + ir::_Tensor_::Make(old_tensor->name + "_local", + old_tensor->type(), + ir::ir_utils::IRCopy(old_tensor->shape), + 
ir::ir_utils::IRCopy(old_tensor->domain),
+                           old_tensor->reduce_axis);
+    new_tensor.as_tensor_ref()->WithBuffer(
+        "local", new_tensor.as_tensor_ref()->name + "_buffer");
+    ir::Expr new_body =
+        ir::Store::Make(new_tensor,
+                        ir::ir_utils::IRCopy(ir::Expr(load_node)),
+                        ir::ir_utils::IRCopy(load_node->indices));
+    ir::Expr new_sb = ir::ScheduleBlock::Make(
+        sb_node->iter_vars, {}, {}, sb_node->name + "_local", new_body);
+
+    ir::Expr new_sbr = ir::ScheduleBlockRealize::Make(
+        ir::ir_utils::IRCopy(current_sbr_->iter_values), new_sb);
+    PADDLE_ENFORCE_EQ(
+        global_buffer_to_local_buffer_.count(buffer_name),
+        0,
+        ::common::errors::InvalidArgument(
+            "buffer_name %s should not be in global_buffer_to_local_buffer_",
+            buffer_name));
+    global_buffer_to_local_buffer_[buffer_name] = new_tensor;
+    current_block_->stmts.insert(current_block_->stmts.begin(), new_sbr);
+  }
+
+  void SubstituteGlobalTensor(ir::Load* load_node,
+                              const std::string& buffer_name) {
+    PADDLE_ENFORCE_GT(
+        global_buffer_to_local_buffer_.count(buffer_name),
+        0,
+        ::common::errors::InvalidArgument(
+            "global_buffer_to_local_buffer_ should contain buffer_name %s",
+            buffer_name));
+    load_node->tensor = global_buffer_to_local_buffer_[buffer_name];
+  }
+
+  std::unordered_set<std::string> eliminate_buffer_names_;
+  std::unordered_map<std::string, ir::Expr> global_buffer_to_local_buffer_;
+
+  ir::Block* current_block_;
+  ir::ScheduleBlockRealize* current_sbr_;
+};
+
+}  // namespace
+
+void EliminateCommonGlobalMemoryRead(Expr* e) {
+  VLOG(4) << "Before EliminateCommonGlobalMemoryRead: \n" << *e;
+  GlobalTensorInfoCollector collector;
+  collector(e);
+
+  const auto& eliminate_buffer_names = collector.GetEliminateBufferNames();
+
+  CommonGlobalMemoryEliminator eliminator(eliminate_buffer_names);
+  eliminator(e);
+  VLOG(4) << "After EliminateCommonGlobalMemoryRead: \n" << *e;
+}
+
+}  // namespace optim
+}  // namespace cinn
diff --git a/paddle/cinn/optim/eliminate_common_global_memory_read.h b/paddle/cinn/optim/eliminate_common_global_memory_read.h
new file mode 100644
index 0000000000000..0db44e2b25444
--- /dev/null
+++ b/paddle/cinn/optim/eliminate_common_global_memory_read.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/cinn/ir/ir.h"
+
+namespace cinn {
+namespace optim {
+
+/**
+ * Remove common global memory read and substitute them with local memory read.
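+ *
+ * A rough sketch of the effect (illustrative tensor names, assuming two
+ * schedule blocks read buffer A at provably equal indices under equal loop
+ * extents):
+ *   var_0[i, j] = A[i, j] + 1
+ *   var_1[i, j] = A[i, j] * 2
+ * is rewritten so that A[i, j] is first stored once into a local buffer
+ * A_local, and both blocks then read A_local instead of A.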
+ */ +void EliminateCommonGlobalMemoryRead(Expr* e); + +} // namespace optim +} // namespace cinn From 72c4f15ba346e9642eade296910c9c8d26e77a38 Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Sun, 10 Mar 2024 10:19:30 +0800 Subject: [PATCH 091/114] fix dyshape buffer resize (#62490) * fix dyshape buffer resize * add flags in cmake of unittest * remove flags in unittest cmake * delete excess free stmt --- paddle/cinn/backends/codegen_cuda_dev.cc | 2 ++ test/ir/pir/cinn/symbolic/CMakeLists.txt | 6 ++++-- test/ir/pir/cinn/symbolic/test_multiple_subgraph_dy.py | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc index eb70ebe8fff8e..aa58470ef93de 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.cc +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -21,6 +21,7 @@ #include #include +#include "paddle/cinn/common/cas.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/utils/ir_verify.h" @@ -124,6 +125,7 @@ std::vector FilterDeallocTempBuffers(const std::vector &frees) { bool has_symbolic_constant = false; const ir::_Buffer_ *buffer = op->destination.As(); for (Expr shape : buffer->shape) { + shape = common::AutoSimplify(shape); ir::ir_utils::CollectIRNodes(shape, [&](const Expr *x) { if (x->as_var()) { CHECK(x->as_var()->is_symbolic_constant) diff --git a/test/ir/pir/cinn/symbolic/CMakeLists.txt b/test/ir/pir/cinn/symbolic/CMakeLists.txt index 5bd1991ac971b..728d4f15dc5e6 100644 --- a/test/ir/pir/cinn/symbolic/CMakeLists.txt +++ b/test/ir/pir/cinn/symbolic/CMakeLists.txt @@ -166,7 +166,8 @@ if(WITH_GPU) ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} FLAGS_cinn_convert_static_dim_to_dynamic_dim=2048:S0 - FLAGS_cinn_bucket_compile=True FLAGS_enable_pir_api=1 ${PYTHON_EXECUTABLE} + FLAGS_group_schedule_tiling_first=1 FLAGS_cinn_bucket_compile=True + FLAGS_enable_pir_api=1 ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_multiple_subgraph_st.py WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(test_multiple_subgraph_st PROPERTIES LABELS @@ -177,7 +178,8 @@ if(WITH_GPU) COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} - FLAGS_cinn_bucket_compile=True FLAGS_enable_pir_api=1 ${PYTHON_EXECUTABLE} + FLAGS_group_schedule_tiling_first=1 FLAGS_cinn_bucket_compile=True + FLAGS_enable_pir_api=1 ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_multiple_subgraph_dy.py WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(test_multiple_subgraph_dy PROPERTIES LABELS diff --git a/test/ir/pir/cinn/symbolic/test_multiple_subgraph_dy.py b/test/ir/pir/cinn/symbolic/test_multiple_subgraph_dy.py index b8dcee9e00605..6ebcad30f5623 100644 --- a/test/ir/pir/cinn/symbolic/test_multiple_subgraph_dy.py +++ b/test/ir/pir/cinn/symbolic/test_multiple_subgraph_dy.py @@ -81,5 +81,5 @@ def test_eval(self): ) -# if __name__ == '__main__': -# unittest.main() +if __name__ == '__main__': + unittest.main() From 6c2378f163bdaa5721a2fa258449bb90993fe17f Mon Sep 17 00:00:00 2001 From: 6clc Date: Sun, 10 Mar 2024 14:49:34 +0800 Subject: [PATCH 092/114] cinn(op): add fill constant symblic compute (#62478) --- paddle/cinn/hlir/op/elementwise.cc | 3 +-- paddle/cinn/hlir/op/op_util.cc | 9 +++++++++ paddle/cinn/hlir/op/op_util.h | 3 +++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git 
a/paddle/cinn/hlir/op/elementwise.cc b/paddle/cinn/hlir/op/elementwise.cc index fc93d9f206684..19201a623baaf 100644 --- a/paddle/cinn/hlir/op/elementwise.cc +++ b/paddle/cinn/hlir/op/elementwise.cc @@ -533,8 +533,7 @@ std::shared_ptr StrategyForFillConstantSymbolic( CHECK(!args.empty()) << "The input argument of fill_constant compute " "is empty! Please check."; bool force_cpu = false; - CHECK(attrs.attr_store.count("shape")); - auto shape = absl::get>(attrs.attr_store.at("shape")); + auto shape = output_shapes[0]; CHECK(attrs.attr_store.count("value")); auto value = GetScalarExpr(attrs.attr_store.at("value")); CHECK(attrs.attr_store.count("force_cpu")); diff --git a/paddle/cinn/hlir/op/op_util.cc b/paddle/cinn/hlir/op/op_util.cc index 6cad9f4cb75f1..cddbbba8cf14a 100644 --- a/paddle/cinn/hlir/op/op_util.cc +++ b/paddle/cinn/hlir/op/op_util.cc @@ -144,5 +144,14 @@ std::string GetExternFuncName(const cinn::common::Target& target, return func_proto_name; } +std::vector ToCinnExprs(const std::vector& args) { + std::vector exprs; + std::transform(args.begin(), + args.end(), + std::back_inserter(exprs), + [](const ir::Dim& arg) { return arg->dim_expr; }); + return exprs; +} + } // namespace hlir } // namespace cinn diff --git a/paddle/cinn/hlir/op/op_util.h b/paddle/cinn/hlir/op/op_util.h index a0521e26f1b72..5c946239c835c 100644 --- a/paddle/cinn/hlir/op/op_util.h +++ b/paddle/cinn/hlir/op/op_util.h @@ -20,6 +20,7 @@ #include "paddle/cinn/common/target.h" #include "paddle/cinn/hlir/framework/node.h" +#include "paddle/cinn/ir/dim.h" #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/lang/packed_func.h" #include "paddle/cinn/utils/type_defs.h" @@ -60,6 +61,8 @@ std::vector ToCinnExprs(const std::vector &args) { return exprs; } +std::vector ToCinnExprs(const std::vector &args); + template std::vector ToPodVector(const std::vector &args) { if (args.empty()) { From d27c2ea30d7d68eb2eddaedabe3e8f9c3a57fb06 Mon Sep 17 00:00:00 2001 From: 6clc Date: Sun, 10 Mar 2024 15:45:46 +0800 Subject: [PATCH 093/114] cinn(op): add broadcast compute (#62488) --- paddle/cinn/hlir/op/broadcast.cc | 7 +------ paddle/cinn/hlir/pe/broadcast.cc | 25 +++++++------------------ paddle/cinn/hlir/pe/broadcast.h | 1 - 3 files changed, 8 insertions(+), 25 deletions(-) diff --git a/paddle/cinn/hlir/op/broadcast.cc b/paddle/cinn/hlir/op/broadcast.cc index c6c7ee00a9449..444a6f69c5d52 100644 --- a/paddle/cinn/hlir/op/broadcast.cc +++ b/paddle/cinn/hlir/op/broadcast.cc @@ -307,12 +307,7 @@ std::shared_ptr StrategyForBroadcastToSymbolic( output_shapes[0].end(), out_shape.begin(), [](const ir::Dim &dim) { return dim->dim_expr; }); - std::vector broadcast_axes; - CHECK_GT(attrs.attr_store.count("broadcast_axes"), 0); - broadcast_axes = - absl::get>(attrs.attr_store.at("broadcast_axes")); VLOG(3) << "broadcast out shape: " << utils::Join(out_shape, ", "); - VLOG(3) << "broadcast_axes shape: " << utils::Join(broadcast_axes, ", "); framework::CINNCompute broadcast_to_compute([=](lang::Args args, lang::RetValue *ret) { @@ -328,7 +323,7 @@ std::shared_ptr StrategyForBroadcastToSymbolic( Expr A_expr = pack_args[0]; CHECK(A_expr.as_tensor()); ir::Tensor A = A_expr.as_tensor_ref(); - auto out = pe::BroadcastTo(A, out_shape, broadcast_axes, tensor_name); + auto out = pe::BroadcastTo(A, out_shape, tensor_name); auto stages = CreateStages({A, out}); *ret = CINNValuePack{{CINNValue(out), CINNValue(stages)}}; }); diff --git a/paddle/cinn/hlir/pe/broadcast.cc b/paddle/cinn/hlir/pe/broadcast.cc index 29189a5b1987c..9ab00fc8ce5da 100644 --- 
a/paddle/cinn/hlir/pe/broadcast.cc +++ b/paddle/cinn/hlir/pe/broadcast.cc @@ -374,36 +374,25 @@ Tensor BroadcastTo(const Tensor& A, Tensor BroadcastTo(const Tensor& A, const std::vector& out_shape, - const std::vector& broadcast_axes, const std::string& out_name) { auto A_shape = A->shape; - CHECK_EQ(A_shape.size(), broadcast_axes.size()) - << "broadcast_axes's size should be same with the input shape's size"; - CHECK_GE(out_shape.size(), broadcast_axes.size()) - << "broadcast_axes's size should be no more than out_shape's size"; - auto axes = broadcast_axes; - for (auto& axis : axes) { - // if axis < 0, plus out_shape.size - if (axis < 0) { - axis = out_shape.size() + axis; - } - CHECK_LT(axis, out_shape.size()); - } - std::sort(axes.begin(), axes.end()); + CHECK_EQ(A_shape.size(), out_shape.size()) + << "broadcast_to's out_shape's size should be same with the input " + "shape's size"; return Compute( ToCinnExprs(out_shape), [=](const std::vector& indice) { std::vector broadcast_indice; - for (int idx = 0; idx < axes.size(); ++idx) { + for (int idx = 0; idx < out_shape.size(); ++idx) { ir::Expr a_shape_i = A_shape[idx]; if (MathEqual(a_shape_i, ir::Expr(1))) { broadcast_indice.push_back(ir::Expr(0)); - } else if (MathEqual(a_shape_i, out_shape[axes[idx]])) { - broadcast_indice.push_back(indice[axes[idx]]); + } else if (MathEqual(a_shape_i, out_shape[idx])) { + broadcast_indice.push_back(indice[idx]); } else { LOG(FATAL) << "fail to broad cast input shape " << a_shape_i - << " to output shape " << out_shape[axes[idx]]; + << " to output shape " << out_shape[idx]; } } return A(broadcast_indice); diff --git a/paddle/cinn/hlir/pe/broadcast.h b/paddle/cinn/hlir/pe/broadcast.h index efdafee9c9dce..f2cb2649ad499 100644 --- a/paddle/cinn/hlir/pe/broadcast.h +++ b/paddle/cinn/hlir/pe/broadcast.h @@ -118,7 +118,6 @@ ir::Tensor BroadcastTo( ir::Tensor BroadcastTo( const ir::Tensor& A, const std::vector& out_shape, - const std::vector& broadcast_axes, const std::string& out_name = cinn::common::UniqName("T_broadcast_to_out")); // This operator checks if all x and y satisfy the condition: |x - y| <= atol + From 00266ae3638cb5ebbe1e3f9b6aa510b1d4d997fa Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Sun, 10 Mar 2024 15:54:47 +0800 Subject: [PATCH 094/114] [Dynamic Shape]Fix SubstituteDimExprBasedOnConstraintsPass invalid bug (#62570) * [Dynamic Shape]Fix SubstituteDimExprBasedOnConstraintsPass invalid bug * simplify substituted dim_expr --- ...tute_dim_expr_based_on_constraints_pass.cc | 71 +++++++++++-------- 1 file changed, 40 insertions(+), 31 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc index bb6a3bbf23bbf..da2b2dda74deb 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc @@ -19,6 +19,7 @@ #include "paddle/cinn/common/dim_expr_util.h" #include "paddle/cinn/common/union_find.h" #include "paddle/pir/include/dialect/shape/ir/shape_attribute.h" +#include "paddle/pir/include/dialect/shape/utils/dim_expr_simplify.h" namespace cinn { namespace dialect { @@ -27,26 +28,19 @@ namespace ir { namespace { template -void VisitEachOp(pir::Operation* op, const DoEachT& DoEach) { - for (uint32_t i = 0; i < op->num_regions(); i++) { - for (pir::Block& 
block : op->region(i)) { - for (pir::Operation& sub_op : block) { - DoEach(sub_op); - if (sub_op.num_regions() > 0) { - VisitEachOp(&sub_op, DoEach); - } - } - } +void VisitEachOp(cinn::dialect::GroupOp op, const DoEachT& DoEach) { + for (pir::Operation* sub_op : op.GetOperators()) { + DoEach(sub_op); } } template -void VisitEachValue(const pir::Operation& op, const DoEachT& DoEach) { - for (std::size_t i = 0; i < op.num_operands(); ++i) { - DoEach(op.operand_source(i)); +void VisitEachValue(const pir::Operation* op, const DoEachT& DoEach) { + for (std::size_t i = 0; i < op->num_operands(); ++i) { + DoEach(op->operand_source(i)); } - for (std::size_t i = 0; i < op.num_results(); ++i) { - DoEach(op.result(i)); + for (std::size_t i = 0; i < op->num_results(); ++i) { + DoEach(op->result(i)); } } @@ -60,8 +54,9 @@ symbol::TensorShapeOrDataDimExprs SubstituteTensorShapeOrData( substitution_pattern) -> std::vector { std::vector substituted_dim_expr{}; for (const symbol::DimExpr& dim_expr : original_dim_expr) { - substituted_dim_expr.push_back( - cinn::common::SubstituteDimExpr(dim_expr, substitution_pattern)); + const auto& tmp_dim_expr = + cinn::common::SubstituteDimExpr(dim_expr, substitution_pattern); + substituted_dim_expr.push_back(symbol::SimplifyDimExpr(tmp_dim_expr)); } return substituted_dim_expr; }; @@ -99,6 +94,22 @@ symbol::ShapeOrDataDimExprs SubstituteShapeOrData( return std::visit(lambdas, shape_or_data.variant()); } +int GetDimExprPriority(const symbol::DimExpr& dim_expr) { + return std::visit( + symbol::Overloaded{ + [&](std::int64_t) { return 0; }, + [&](const std::string&) { return 1; }, + [&](const symbol::Negative&) { return 2; }, + [&](const symbol::Reciprocal&) { return 2; }, + [&](const symbol::Add&) { return 2; }, + [&](const symbol::Mul&) { return 2; }, + [&](const symbol::Max&) { return 2; }, + [&](const symbol::Min&) { return 2; }, + [&](const symbol::Broadcast&) { return 2; }, + }, + dim_expr.variant()); +} + std::unordered_map GetDimExprSubstitution( pir::ShapeConstraintIRAnalysis* shape_analysis) { const std::vector& dim_expr_constraints = @@ -123,9 +134,8 @@ std::unordered_map GetDimExprSubstitution( CHECK(!dim_expr_cluster.empty()); auto dim_expr_root = dim_expr_cluster[0]; for (const auto& dim_expr : dim_expr_cluster) { - if (std::holds_alternative(dim_expr)) { + if (GetDimExprPriority(dim_expr) < GetDimExprPriority(dim_expr_root)) { dim_expr_root = dim_expr; - break; } } for (const auto& dim_expr : dim_expr_cluster) { @@ -137,40 +147,39 @@ std::unordered_map GetDimExprSubstitution( return substitution_pattern; } -void SubstituteDimExprBasedOnConstraints(pir::Operation* module_op) { +void SubstituteDimExprBasedOnConstraints(pir::Operation* op) { VLOG(4) << "SubstituteDimExprBasedOnConstraints start"; + auto group_op = op->dyn_cast(); pir::ShapeConstraintIRAnalysis* shape_analysis = - &pir::ShapeAnalysisManager::Instance().Get( - module_op->dyn_cast().program()); + &pir::ShapeAnalysisManager::Instance().Get(group_op->GetParentProgram()); const std::unordered_map& substitution_pattern = GetDimExprSubstitution(shape_analysis); - VisitEachOp(module_op, [&](pir::Operation& op) { + VisitEachOp(group_op, [&](pir::Operation* op) { VisitEachValue(op, [&](pir::Value value) { if (!shape_analysis->HasShapeOrDataForValue(value)) { - VLOG(4) << "Can not find ShapeOrData for value of op(" << op.name() + VLOG(4) << "Can not find ShapeOrData for value of op(" << op->name() << ") in shape_analysis"; } else { const symbol::ShapeOrDataDimExprs& origin_shape_or_data = 
shape_analysis->GetShapeOrDataForValue(value); - VLOG(8) << op.name() + VLOG(8) << op->name() << " origin_shape_or_data: " << origin_shape_or_data; const symbol::ShapeOrDataDimExprs& substituted_shape_or_data = SubstituteShapeOrData(origin_shape_or_data, substitution_pattern); - VLOG(8) << op.name() + VLOG(8) << op->name() << " substituted_shape_or_data: " << substituted_shape_or_data; shape_analysis->SetShapeOrDataForValue(value, substituted_shape_or_data); } }); - if (op.num_results() > 0) { + if (op->num_results() > 0) { pir::shape::SetShapeAttrForOp( - &op, shape_analysis->GetShapeOrDataForValue(op.result(0))); + op, shape_analysis->GetShapeOrDataForValue(op->result(0))); } else { pir::shape::SetShapeAttrForOp( - &op, shape_analysis->GetShapeOrDataForValue(op.operand_source(0))); + op, shape_analysis->GetShapeOrDataForValue(op->operand_source(0))); } - // TODO(JiaWenxuan): substitute the attribute "sym_shape_str" of the op }); VLOG(4) << "SubstituteDimExprBasedOnConstraints end"; } @@ -185,7 +194,7 @@ class SubstituteDimExprBasedOnConstraintsPass : public pir::Pass { } bool CanApplyOn(pir::Operation* op) const override { - return op->isa() && op->num_regions() > 0; + return op->isa() && op->num_regions() > 0; } }; From 24178136d9a12d0e779701094fc2800b0068e235 Mon Sep 17 00:00:00 2001 From: jiahy0825 Date: Sun, 10 Mar 2024 09:26:12 +0000 Subject: [PATCH 095/114] ReversedInferShardableAxes support sinks --- paddle/cinn/frontend/group_pattern_util.cc | 28 ++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/paddle/cinn/frontend/group_pattern_util.cc b/paddle/cinn/frontend/group_pattern_util.cc index c5660222cf0af..44d757a1ab867 100644 --- a/paddle/cinn/frontend/group_pattern_util.cc +++ b/paddle/cinn/frontend/group_pattern_util.cc @@ -208,13 +208,18 @@ ShardableAxesSignature MakeShardableAxesSignature4Op(const pir::Operation* op) { LOG(FATAL) << "Dead code"; } +template std::unordered_map ReversedInferShardableAxes( - common::TopoWalker& reversed_walker, - const pir::Operation* sink, - const ShardableAxes& init_sa) { - std::unordered_map value2shardable_axes{ - {sink->result(0), init_sa} - }; + const common::TopoWalker& reversed_walker, + InputIt sink_and_init_begin, InputIt sink_and_init_end) { + std::unordered_map value2shardable_axes; + std::list sinks; + for (auto iter = sink_and_init_begin; iter != sink_and_init_end; ++iter) { + const pir::Operation* sink = iter->first; + CHECK_EQ(sink->num_results(), 1); + sinks.push_back(sink); + value2shardable_axes[sink->result(0)] = iter->second; + } const auto& UpdateValue2ShardableAxes = [&](pir::Value value, const ShardableAxes& sa) { auto iter = value2shardable_axes.find(value); if (iter != value2shardable_axes.end()) { @@ -223,7 +228,7 @@ std::unordered_map ReversedInferShardableAxes( iter->second = sa; } }; - reversed_walker(sink, [&](const auto* op){ + reversed_walker(sinks.begin(), sinks.end(), [&](const auto* op){ auto shardable_axes_sig = MakeShardableAxesSignature4Op(op); const auto& old2new = ShardableAxesUtil::GetOldName2NewName(shardable_axes_sig.output_shardable_axes, value2shardable_axes.at(op->result(0))); @@ -239,6 +244,15 @@ std::unordered_map ReversedInferShardableAxes( return value2shardable_axes; } +std::unordered_map ReversedInferShardableAxes( + const common::TopoWalker& reversed_walker, + const pir::Operation* sink, + const ShardableAxes& init_sa) { + using OpAndInitValue = std::pair; + std::array sinks{OpAndInitValue{sink, init_sa}}; + return ReversedInferShardableAxes(reversed_walker, 
sinks.begin(), sinks.end());
+}
+
 common::TopoWalker<const pir::Operation*> GetOpsTopoWalker(const std::unordered_set<const pir::Operation*>& ops) {
   const auto* ops_set = &ops;
   const auto VisitUpStreamInOps = [ops_set](const pir::Operation* op, const OpVisitor& DoEach) {

From b8e79397f8f896207bada0c3a4df95a9c99ae40b Mon Sep 17 00:00:00 2001
From: feifei-111 <2364819892@qq.com>
Date: Sun, 10 Mar 2024 09:39:29 +0000
Subject: [PATCH 096/114] update op lower

---
 paddle/cinn/hlir/framework/pir/trivial_op.cc | 165 ++++++++++++++++++-
 1 file changed, 164 insertions(+), 1 deletion(-)

diff --git a/paddle/cinn/hlir/framework/pir/trivial_op.cc b/paddle/cinn/hlir/framework/pir/trivial_op.cc
index aaba127989b40..16f3c9f76786d 100644
--- a/paddle/cinn/hlir/framework/pir/trivial_op.cc
+++ b/paddle/cinn/hlir/framework/pir/trivial_op.cc
@@ -32,6 +32,8 @@
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/pir/include/dialect/control_flow/ir/cf_op.h"
 
+// #include "paddle/cinn/frontend/group_pattern_util.h"
+
 namespace cinn {
 namespace hlir {
 namespace framework {
@@ -280,12 +282,166 @@ ir::Expr TrivialFusion(ir::Expr upper, ir::Expr down) {
 struct FusionNode {
   // Function bodies lose the kind information which is needed in trivial-op
   // fusion.
-  ir::Expr op_compute_body;
+  std::vector<ir::Expr> op_compute_body;
   OpPatternKind op_pattern;
+
+  std::vector<::pir::Operation*> output_ops;
+
+  // Adjacent fusion nodes, keyed by node; the mapped value is the pir::Value
+  // that connects the two nodes.
+  std::unordered_map<FusionNode*, ::pir::Value> upstream;
+  std::unordered_map<FusionNode*, ::pir::Value> downstream;
 
-  explicit FusionNode(ir::Expr op_compute_body, OpPatternKind op_pattern)
-      : op_compute_body(op_compute_body), op_pattern(op_pattern) {}
+  explicit FusionNode(ir::Expr op_compute_body, OpPatternKind op_pattern)
+      : op_compute_body({op_compute_body}), op_pattern(op_pattern) {}
+
+  void init_topo_info(FusionNode* upstream_node, FusionNode* downstream_node) {
+    // Take the union of both nodes' neighbors and drop the edge between the
+    // two nodes that were fused together.
+    upstream.insert(upstream_node->upstream.begin(),
+                    upstream_node->upstream.end());
+    upstream.insert(downstream_node->upstream.begin(),
+                    downstream_node->upstream.end());
+    upstream.erase(upstream_node);
+
+    downstream.insert(upstream_node->downstream.begin(),
+                      upstream_node->downstream.end());
+    downstream.insert(downstream_node->downstream.begin(),
+                      downstream_node->downstream.end());
+    downstream.erase(downstream_node);
+
+    output_ops.insert(output_ops.end(),
+                      upstream_node->output_ops.begin(),
+                      upstream_node->output_ops.end());
+    output_ops.insert(output_ops.end(),
+                      downstream_node->output_ops.begin(),
+                      downstream_node->output_ops.end());
+  }
 };
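+
+// NOTE: a rough sketch of the intended flow of this (still work-in-progress)
+// pass, assuming the commented-out pieces above are wired up later:
+// FusionGraph builds one FusionNode per op of the group, derives the
+// upstream/downstream edges from the SSA def-use relations, and then walks
+// from the entrance nodes, fusing every trivial node into each of its
+// consumers via TrivialFusion until only non-trivial nodes remain.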
+struct FusionGraph {
+  explicit FusionGraph(const std::vector<::pir::Operation*>& ops,
+                       const std::vector<ir::Expr>& op_compute_bodies) {
+    // shardable_axes_ = InferShardableAxes(ops);
+
+    const auto& op_patterns = trivial_fusion_detail::GetOpPatternKindVector(ops);
+    trivial_fusion_detail::CheckFusionInputValid(op_compute_bodies, op_patterns);
+
+    std::unordered_map<::pir::Operation*, FusionNode*> op_to_node_map;
+
+    for (std::size_t i = 0; i < ops.size(); ++i) {
+      if (ops[i]->isa<::pir::YieldOp>()) continue;
+      FusionNode* node = new FusionNode(op_compute_bodies[i], op_patterns[i]);
+      op_to_node_map[ops[i]] = node;
+      all_fusion_nodes_.emplace(node);
+      node->output_ops.emplace_back(ops[i]);
+    }
+
+    for (::pir::Operation* op : ops) {
+      if (op->isa<::pir::YieldOp>()) continue;
+      FusionNode* node = op_to_node_map[op];
+
+      // add upstream nodes
+      for (std::size_t i = 0; i < op->num_operands(); ++i) {
+        ::pir::Value input_value = op->operand_source(i);
+        ::pir::Operation* input_op = input_value.defining_op();
+        if (op_to_node_map.find(input_op) != op_to_node_map.end()) {
+          node->upstream[op_to_node_map[input_op]] = input_value;
+        }
+      }
+
+      // add downstream nodes
+      for (std::size_t i = 0; i < op->num_results(); ++i) {
+        ::pir::Value output_value = op->result(i);
+        for (auto consumer_it = output_value.use_begin();
+             consumer_it != output_value.use_end();
+             ++consumer_it) {
+          ::pir::Operation* output_op = consumer_it->owner();
+          if (op_to_node_map.find(output_op) != op_to_node_map.end()) {
+            node->downstream[op_to_node_map[output_op]] = output_value;
+          }
+        }
+      }
+
+      if (node->upstream.empty()) {
+        entrance_nodes_.emplace(node);
+      }
+      if (node->downstream.empty()) {
+        exit_nodes_.emplace(node);
+      }
+    }
+  }
+
+  ~FusionGraph() {
+    for (FusionNode* node : all_fusion_nodes_) {
+      delete node;
+    }
+  }
+
+  std::vector<ir::Expr> DoFusion() {
+    trivial_op_fusion();
+    return get_expr_results();
+  }
+
+ private:
+  void trivial_op_fusion() {
+    std::queue<FusionNode*> candidates;
+    for (FusionNode* node : entrance_nodes_) {
+      candidates.push(node);
+    }
+
+    while (!candidates.empty()) {
+      FusionNode* upstream = candidates.front();
+      candidates.pop();
+
+      const bool need_fusion = IsTrivialKind(upstream->op_pattern);
+
+      for (const auto& pair_data : upstream->downstream) {
+        FusionNode* downstream = pair_data.first;
+        if (need_fusion) {
+          FusionNode* new_node = new FusionNode(
+              TrivialFusion(upstream->op_compute_body.back(),
+                            downstream->op_compute_body.back()),
+              downstream->op_pattern);
+          new_node->init_topo_info(upstream, downstream);
+          all_fusion_nodes_.emplace(new_node);
+          candidates.push(new_node);
+          remove_fusion_node(downstream);
+        } else {
+          candidates.push(downstream);
+        }
+      }
+      // Only drop the upstream node after its body has been fused into every
+      // consumer; otherwise it still owns a compute body of its own.
+      if (need_fusion) {
+        remove_fusion_node(upstream);
+      }
+    }
+  }
+
+  std::vector<ir::Expr> get_expr_results() {
+    std::vector<ir::Expr> output_exprs;
+    for (const auto& node : all_fusion_nodes_) {
+      output_exprs.insert(output_exprs.end(),
+                          node->op_compute_body.begin(),
+                          node->op_compute_body.end());
+    }
+    return output_exprs;
+  }
+
+  void remove_fusion_node(FusionNode* node) {
+    all_fusion_nodes_.erase(node);
+    entrance_nodes_.erase(node);
+    exit_nodes_.erase(node);
+    delete node;
+  }
+
+ private:
+  std::unordered_set<FusionNode*> all_fusion_nodes_;
+  std::unordered_set<FusionNode*> entrance_nodes_;
+  std::unordered_set<FusionNode*> exit_nodes_;
+
+  // std::unordered_map<pir::Value, ShardableAxes> shardable_axes_;
+};
+
 std::vector<FusionNode> ConstructFusionNodeElementwisely(
     const std::vector<ir::Expr>& op_compute_bodies,
     const std::vector<OpPatternKind>& op_kinds) {
@@ -389,6 +545,13 @@ void CheckFusionInputValid(const std::vector<ir::Expr>& op_compute_bodies,
 std::vector<ir::Expr> TrivialOpFusion(
     const std::vector<::pir::Operation*>& ops,
     const std::vector<ir::Expr>& op_compute_bodies) {
+  trivial_fusion_detail::FusionGraph graph(ops, op_compute_bodies);
+  return graph.DoFusion();
+}
+
+std::vector<ir::Expr> TrivialOpFusion_(
+    const std::vector<::pir::Operation*>& ops,
+    const std::vector<ir::Expr>& op_compute_bodies) {
   const auto& op_patterns = trivial_fusion_detail::GetOpPatternKindVector(ops);
   trivial_fusion_detail::CheckFusionInputValid(op_compute_bodies, op_patterns);
   const auto& before_fused_nodes =

From e22f81ddaf116ce1bd2a10bf6c4435a44276a584 Mon Sep 17 00:00:00 2001
From: jiahy0825
Date: Sun, 10 Mar 2024 11:35:34 +0000
Subject: [PATCH 097/114] support multiple sinks in
 group_pattern_util.InferShardableAxes

---
 paddle/cinn/frontend/group_pattern_util.cc | 149 ++++++++++++++++++---
 1 file changed, 131 insertions(+), 18 deletions(-)

diff --git a/paddle/cinn/frontend/group_pattern_util.cc b/paddle/cinn/frontend/group_pattern_util.cc
index 44d757a1ab867..b277c3018269b 100644
--- a/paddle/cinn/frontend/group_pattern_util.cc
+++ 
b/paddle/cinn/frontend/group_pattern_util.cc @@ -215,10 +215,8 @@ std::unordered_map ReversedInferShardableAxes( std::unordered_map value2shardable_axes; std::list sinks; for (auto iter = sink_and_init_begin; iter != sink_and_init_end; ++iter) { - const pir::Operation* sink = iter->first; - CHECK_EQ(sink->num_results(), 1); - sinks.push_back(sink); - value2shardable_axes[sink->result(0)] = iter->second; + sinks.push_back(iter->first.defining_op()); + value2shardable_axes[iter->first] = iter->second; } const auto& UpdateValue2ShardableAxes = [&](pir::Value value, const ShardableAxes& sa) { auto iter = value2shardable_axes.find(value); @@ -228,7 +226,7 @@ std::unordered_map ReversedInferShardableAxes( iter->second = sa; } }; - reversed_walker(sinks.begin(), sinks.end(), [&](const auto* op){ + reversed_walker(sinks.begin(), sinks.end(), [&](const auto* op) { auto shardable_axes_sig = MakeShardableAxesSignature4Op(op); const auto& old2new = ShardableAxesUtil::GetOldName2NewName(shardable_axes_sig.output_shardable_axes, value2shardable_axes.at(op->result(0))); @@ -248,8 +246,9 @@ std::unordered_map ReversedInferShardableAxes( const common::TopoWalker& reversed_walker, const pir::Operation* sink, const ShardableAxes& init_sa) { - using OpAndInitValue = std::pair; - std::array sinks{OpAndInitValue{sink, init_sa}}; + using OpAndInitValue = std::pair; + CHECK_EQ(sink->num_results(), 1); + std::array sinks{OpAndInitValue{sink->result(0), init_sa}}; return ReversedInferShardableAxes(reversed_walker, sinks.begin(), sinks.end()); } @@ -293,6 +292,128 @@ std::list GetSinks( return sinks; } +std::unordered_map +GetOp2ShardableAxesSignature(const std::unordered_set& ops) { + std::unordered_map ret; + for (const auto* op : ops) { + ret[op] = MakeShardableAxesSignature4Op(op); + } + return ret; +} + +std::map> +GetAxisName2BoundAxisName( + const std::unordered_set& ops, + const std::unordered_map& op2shardable_axes_signature) { + const auto GetInputShardableAxes = [&](const OpAndOperandIndex& op_and_idx) -> std::optional { + const auto& [op, idx] = op_and_idx; + const auto* input_op = op->operand_source(idx).defining_op(); + if (ops.count(input_op) == 0) return std::nullopt; + const auto& iter = op2shardable_axes_signature.find(input_op); + if (iter == op2shardable_axes_signature.end()) return std::nullopt; + const auto& output_sa = iter->second.output_shardable_axes; + return &output_sa; + }; + std::map> axis_name2bound_axis_name; + const auto UpdateAxisName2BoundAxisName = [&](const ShardableAxes& input_sa, const ShardableAxes& sa) { + for (const auto& [input_axis, input_axis_name] : input_sa) { + for (const auto& [axis, axis_name] : sa) { + if (input_axis != axis) continue; + axis_name2bound_axis_name[axis_name].push_back(input_axis_name); + axis_name2bound_axis_name[input_axis_name].push_back(axis_name); + } + } + }; + for (const auto& [op, signature] : op2shardable_axes_signature) { + for (const auto& [op_and_idx, sa] : signature.input_shardable_axes) { + const auto& input_sa = GetInputShardableAxes(op_and_idx); + if (!input_sa.has_value()) continue; + UpdateAxisName2BoundAxisName(*input_sa.value(), sa); + } + } + return axis_name2bound_axis_name; +} + +std::unordered_map +GetAxisName2UnionFindSetRoot( + const std::unordered_set& ops, + const std::unordered_map& op2shardable_axes_signature) { + const auto axis_name2bound_axis_name = GetAxisName2BoundAxisName(ops, op2shardable_axes_signature); + using NodeVisitor = std::function; + const auto VisitNext = [&](const std::string& axis_name, const 
NodeVisitor& DoEach) { + const auto& iter = axis_name2bound_axis_name.find(axis_name); + if (iter == axis_name2bound_axis_name.end()) return; + for (const auto& input_axis_name : iter->second) { + DoEach(input_axis_name); + } + }; + common::BfsWalker walk(VisitNext); + std::unordered_map axis_name2root; + for (const auto& [union_find_root, _] : axis_name2bound_axis_name) { + if (axis_name2root.count(union_find_root) > 0) continue; + walk(union_find_root, [&](const std::string& axis_name){ + CHECK(axis_name2root.emplace(axis_name, union_find_root).second); + }); + } + return axis_name2root; +} + +std::unordered_map +GetSinkAndInitShardableAxes( + const std::list& sinks, + const std::unordered_map& op2shardable_axes_signature, + const std::unordered_map& axis_name2union_find_set_root) { + const auto& ConvertByBoundAxisName = [&](const ShardableAxes& sa) { + ShardableAxes ret_sa; + for (const auto& [axis, axis_name] : sa) { + const auto& iter = axis_name2union_find_set_root.find(axis_name); + CHECK(iter != axis_name2union_find_set_root.end()); + ret_sa.emplace_back(ShardableAxis{ + .axis=axis, + .axis_name=iter->second, + }); + } + return ret_sa; + }; + std::unordered_map sink2sa; + for (const auto* sink : sinks) { + const auto& sig_iter = op2shardable_axes_signature.find(sink); + CHECK(sig_iter != op2shardable_axes_signature.end()); + const auto& output_shardable_axes = sig_iter->second.output_shardable_axes; + CHECK_EQ(sink->num_results(), 1); + sink2sa[sink->result(0)] = ConvertByBoundAxisName(output_shardable_axes); + } + return sink2sa; +} + +void RenameDuplicatedAxisName(std::unordered_map* sink2sa) { + const auto& RenameDuplicated = [&](ShardableAxes* sa) { + std::set existed_axis_name; + for (auto& [_, axis_name] : *sa) { + if (!existed_axis_name.emplace(axis_name).second) { + axis_name = axis_name + "_" + std::to_string(ShardableAxis::UnqiueSeqNo()); + } else { + // do nothing. 
+ } + } + }; + for (auto& [_, sa] : *sink2sa) { + RenameDuplicated(&sa); + } +} + +std::unordered_map GetSinkAndInitValues( + const common::TopoWalker& reverse_walker, + const std::unordered_set& ops, + const std::list& sinks) { + const auto& op2shardable_axes_signature = GetOp2ShardableAxesSignature(ops); + const auto& axis_name2union_find_set_root = GetAxisName2UnionFindSetRoot(ops, op2shardable_axes_signature); + std::unordered_map sink_and_inits = + GetSinkAndInitShardableAxes(sinks, op2shardable_axes_signature, axis_name2union_find_set_root); + RenameDuplicatedAxisName(&sink_and_inits); + return sink_and_inits; +} + class StmtFusionHelper { public: explicit StmtFusionHelper(const cinn::dialect::FusionOp& fusion_op) @@ -703,17 +824,9 @@ std::unordered_map InferShardableAxesFromSink( std::unordered_map InferShardableAxes(const std::unordered_set& ops) { auto reversed_walker = GetOpsTopoWalker(ops); - const pir::Operation* sink = [&]{ - const auto& sinks = GetSinks(ops); - CHECK_EQ(sinks.size(), 1) << "ops must have only one sink node."; - return *sinks.begin(); - }(); - const auto& value2shardable_axes = [&]{ - size_t rank = GetRank(sink->result(0)); - const auto& init_sa = ShardableAxesUtil::GetFullyShardableAxes(rank); - return ReversedInferShardableAxes(reversed_walker, sink, init_sa); - }(); - return value2shardable_axes; + const auto& sinks = GetSinks(ops); + const auto& sink_and_init_value = GetSinkAndInitValues(reversed_walker, ops, sinks); + return ReversedInferShardableAxes(reversed_walker, sink_and_init_value.begin(), sink_and_init_value.end()); } } \ No newline at end of file From 04f5f5902d9dec38084618db41a75438e250a2d8 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Sun, 10 Mar 2024 21:03:18 +0800 Subject: [PATCH 098/114] [PIR+CINN]Fix cinn_op.GroupOp insert bug for WriteAfterRead (#62529) * [PIR+CINN]Fix cinn_op.GroupOp insert bug for WriteAfterRead * fix bug * refine code * fix cond typo * fix std::distance * add strong verify after build_cinn_pass * fix typo --- .../hlir/dialect/operator/ir/manual_op.cc | 8 ++- .../cinn/hlir/dialect/operator/ir/manual_op.h | 3 +- .../fluid/pir/transforms/build_cinn_pass.cc | 48 +++++++++++++ .../pir/transforms/sub_graph_detector.cc | 70 +++++++++++++++++++ paddle/pir/include/core/operation.h | 2 +- 5 files changed, 128 insertions(+), 3 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc index 0def6a8491e9e..2fe01d4e373d3 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc @@ -81,7 +81,13 @@ pir::Block* GroupOp::block() { return ®ion.front(); } -std::vector GroupOp::GetOperators() { +pir::Block* GroupOp::block() const { + pir::Region& region = (*this)->region(0); + CHECK(!region.empty()); + return ®ion.front(); +} + +std::vector GroupOp::GetOperators() const { std::vector rt_ops; for (auto& op : *block()) { rt_ops.push_back(&op); diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h index 9273a722e25c5..4badd14dbc2d5 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h @@ -50,7 +50,8 @@ class IR_API GroupOp const cinn::dialect::GroupInfo &group_info); pir::Block *block(); - std::vector GetOperators(); + pir::Block *block() const; + std::vector GetOperators() const; bool InferSymbolicShape(pir::ShapeConstraintIRAnalysis *shape_analysis); diff --git 
From 04f5f5902d9dec38084618db41a75438e250a2d8 Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Sun, 10 Mar 2024 21:03:18 +0800
Subject: [PATCH 098/114] [PIR+CINN]Fix cinn_op.GroupOp insert bug for
 WriteAfterRead (#62529)

* [PIR+CINN]Fix cinn_op.GroupOp insert bug for WriteAfterRead

* fix bug

* refine code

* fix cond typo

* fix std::distance

* add strong verify after build_cinn_pass

* fix typo
---
 .../hlir/dialect/operator/ir/manual_op.cc     |  8 ++-
 .../cinn/hlir/dialect/operator/ir/manual_op.h |  3 +-
 .../fluid/pir/transforms/build_cinn_pass.cc   | 48 +++++++++++++
 .../pir/transforms/sub_graph_detector.cc      | 70 +++++++++++++++++++
 paddle/pir/include/core/operation.h           |  2 +-
 5 files changed, 128 insertions(+), 3 deletions(-)

diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc
index 0def6a8491e9e..2fe01d4e373d3 100644
--- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc
+++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc
@@ -81,7 +81,13 @@ pir::Block* GroupOp::block() {
   return &region.front();
 }
 
-std::vector GroupOp::GetOperators() {
+pir::Block* GroupOp::block() const {
+  pir::Region& region = (*this)->region(0);
+  CHECK(!region.empty());
+  return &region.front();
+}
+
+std::vector GroupOp::GetOperators() const {
   std::vector rt_ops;
   for (auto& op : *block()) {
     rt_ops.push_back(&op);
diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h
index 9273a722e25c5..4badd14dbc2d5 100644
--- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.h
+++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.h
@@ -50,7 +50,8 @@ class IR_API GroupOp
                     const cinn::dialect::GroupInfo &group_info);
 
   pir::Block *block();
-  std::vector GetOperators();
+  pir::Block *block() const;
+  std::vector GetOperators() const;
 
   bool InferSymbolicShape(pir::ShapeConstraintIRAnalysis *shape_analysis);
 
diff --git a/paddle/fluid/pir/transforms/build_cinn_pass.cc b/paddle/fluid/pir/transforms/build_cinn_pass.cc
index bce67a08c612c..4daa4be6445b2 100644
--- a/paddle/fluid/pir/transforms/build_cinn_pass.cc
+++ b/paddle/fluid/pir/transforms/build_cinn_pass.cc
@@ -25,6 +25,8 @@ namespace {
 using GroupOpsVec = std::vector;
 using CompatibleInfo = cinn::hlir::framework::pir::CompatibleInfo;
 
+void VerifyOperationOrder(const pir::Block& block);
+
 class BuildCinnPass : public pir::Pass {
  public:
  BuildCinnPass() : pir::Pass("build_cinn_pass", /*opt_level=*/1) {}
@@ -33,6 +35,7 @@ class BuildCinnPass : public pir::Pass {
     for (uint32_t i = 0; i < op->num_regions(); ++i) {
       for (auto& block : op->region(i)) {
         ProcessBlock(&block);
+        VerifyOperationOrder(block);
       }
     }
   }
@@ -56,6 +59,51 @@ class BuildCinnPass : public pir::Pass {
       }
     }
   }
 };
+
+void VerifyOperationOrder(const pir::Block& block) {
+  auto order_info =
+      [&]() -> std::unordered_map {
+    std::unordered_map map;
+    // initialize the position index with block size by default.
+    const int64_t block_size = block.size();
+    for (auto& op : block) map[&op] = block_size;
+    return map;
+  }();
+  const auto& CheckOpOrder = [&](const pir::Operation* op) -> void {
+    const pir::Operation* current_op = op;
+    for (auto& value : op->operands_source()) {
+      if (!value || !value.defining_op()) continue;
+      pir::Operation* defining_op = value.defining_op();
+      if (order_info.count(defining_op) == 0) continue;
+      if (op->GetParentOp() &&
+          op->GetParentOp()->isa()) {
+        current_op = op->GetParentOp();
+      }
+      CHECK(order_info.at(defining_op) < order_info.at(current_op))
+          << "The order of operations is not correct!"
+          << " Received defining_op(" << defining_op->id() << " "
+          << order_info.at(defining_op) << ") is behind current_op("
+          << current_op->id() << " " << order_info.at(current_op) << ")";
+    }
+  };
+  const auto& CheckGroupOpOrder = [&](pir::Operation* op) -> void {
+    auto group_op = op->dyn_cast();
+    for (auto& inner_op : *group_op.block()) {
+      CheckOpOrder(&inner_op);
+    }
+  };
+
+  int64_t index = 0;
+  for (auto& op : block) {
+    order_info[&op] = index++;
+    if (op.isa()) {
+      CheckGroupOpOrder(&op);
+    } else {
+      CheckOpOrder(&op);
+    }
+  }
+}
+
 } // namespace
 
 namespace pir {
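The strong verification added after the pass boils down to: every operand's defining op must carry a smaller position index than its consumer (with ops inside a GroupOp checked against the GroupOp's own position). A stripped-down, std-only version of that def-before-use check (hypothetical structs, not the PIR API):

#include <cassert>
#include <map>
#include <vector>

struct Op {
  int id;
  std::vector<const Op*> operands;  // defining ops of the inputs
};

// Returns true iff every op in `block` only consumes results of ops
// that appear earlier in the block (def-before-use).
bool VerifyOrder(const std::vector<Op>& block) {
  std::map<const Op*, size_t> pos;
  for (size_t i = 0; i < block.size(); ++i) pos[&block[i]] = i;
  for (const Op& op : block) {
    for (const Op* def : op.operands) {
      auto it = pos.find(def);
      if (it != pos.end() && it->second >= pos[&op]) return false;
    }
  }
  return true;
}

int main() {
  std::vector<Op> block(2);
  block[0] = {0, {}};
  block[1] = {1, {&block[0]}};      // uses an earlier op: fine
  assert(VerifyOrder(block));
  block[0].operands = {&block[1]};  // write-after-read style violation
  assert(!VerifyOrder(block));
}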
diff --git a/paddle/fluid/pir/transforms/sub_graph_detector.cc b/paddle/fluid/pir/transforms/sub_graph_detector.cc
index 24d2c61f98d4c..c9d12e9f498d0 100644
--- a/paddle/fluid/pir/transforms/sub_graph_detector.cc
+++ b/paddle/fluid/pir/transforms/sub_graph_detector.cc
@@ -16,6 +16,7 @@
 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -513,6 +514,74 @@ pir::Operation* FindInsertPoint(const GroupOpsVec& group_ops,
   }
   return insert_point_op;
 }
+
+struct IncrementalOrder {
+  bool operator()(const pir::Operation* lhs, const pir::Operation* rhs) const {
+    CHECK(lhs->GetParent() == rhs->GetParent())
+        << "lhs and rhs should have same parent block.";
+    auto lhs_iter = lhs->operator Block::ConstIterator();
+    auto rhs_iter = rhs->operator Block::ConstIterator();
+    auto end_iter = lhs->GetParent()->end();
+    while (lhs_iter != end_iter) {
+      lhs_iter++;
+      if (lhs_iter == rhs_iter) return true;
+      if (lhs_iter == end_iter) return false;
+    }
+    CHECK(false) << "rhs " << rhs->id() << " is not reachable from lhs "
+                 << lhs->id();
+    return false;
+  }
+};
+
+std::unordered_set GetUpstreamOpsAfterPosition(
+    const pir::Operation* position_op,
+    const pir::Block* block,
+    const pir::Operation* op,
+    std::unordered_set* visited_ops) {
+  std::unordered_set ops;
+  const auto& IsInBlock = [](const pir::Operation* src_op,
+                             const pir::Block* block) {
+    for (auto& op : *block) {
+      if (src_op == &op) return true;
+    }
+    return false;
+  };
+
+  for (auto value : op->operands_source()) {
+    if (!value || !value.defining_op()) continue;
+    pir::Operation* defining_op = value.defining_op();
+    if (visited_ops->count(defining_op)) continue;
+    visited_ops->insert(defining_op);
+    if (!IsInBlock(defining_op, block)) continue;
+    if (IncrementalOrder()(defining_op, position_op)) continue;
+
+    ops.insert(defining_op);
+    auto recursive_ops = GetUpstreamOpsAfterPosition(
+        position_op, block, defining_op, visited_ops);
+    ops.insert(recursive_ops.begin(), recursive_ops.end());
+  }
+  return ops;
+}
+
+void MoveUpstreamOpBeforeGroup(const GroupOpsVec& group_ops,
+                               pir::Block* block,
+                               pir::Operation* insert_point_op) {
+  const auto moved_ops = [&]() {
+    std::set ops_set;
+    std::unordered_set visited_ops;
+    for (auto& op : group_ops) {
+      auto upstream_ops =
+          GetUpstreamOpsAfterPosition(insert_point_op, block, op, &visited_ops);
+      ops_set.insert(upstream_ops.begin(), upstream_ops.end());
+    }
+    return ops_set;
+  }();
+
+  for (auto& op : moved_ops) {
+    VLOG(5) << "Move " << op->name() << " before " << insert_point_op->name();
+    op->MoveTo(block, insert_point_op->operator Block::Iterator());
+  }
+}
 } // namespace
 
 void ReplaceWithGroupOp(pir::Block* block,
@@ -527,6 +596,7 @@ void ReplaceWithGroupOp(pir::Block* block,
 
   // step 1: Analysis and insert group op before insert_point.
   auto* insert_point = FindInsertPoint(group_ops, outputs);
+  MoveUpstreamOpBeforeGroup(group_ops, block, insert_point);
   builder.set_insertion_point(insert_point);
   VLOG(6) << "Insert GroupOp after " << insert_point->name();
 
diff --git a/paddle/pir/include/core/operation.h b/paddle/pir/include/core/operation.h
index 66d5da9d0d8ab..282de9b03d7e7 100644
--- a/paddle/pir/include/core/operation.h
+++ b/paddle/pir/include/core/operation.h
@@ -229,7 +229,7 @@ class IR_API alignas(8) Operation final
 
   void Verify();
 
-  uint64_t id() { return id_; }
+  uint64_t id() const { return id_; }
 
  private:
  DISABLE_COPY_AND_ASSIGN(Operation);
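The actual WriteAfterRead fix is the hoist: producers of grouped ops that currently sit after the chosen insert point are recursively collected (IncrementalOrder keeps the set sorted by block position) and moved in front of the GroupOp, so no operand ends up defined behind its use. A rough stand-alone illustration of the hoist with plain integers instead of block iterators (assumed simplification, not the real MoveTo mechanics):

#include <cassert>
#include <set>
#include <vector>

// Move every element of `to_move` directly in front of `insert_val`,
// preserving the relative order of both the moved and the remaining ops.
std::vector<int> HoistBefore(const std::vector<int>& block,
                             const std::set<int>& to_move,
                             int insert_val) {
  std::vector<int> moved, rest;
  for (int v : block) (to_move.count(v) ? moved : rest).push_back(v);
  std::vector<int> out;
  for (int v : rest) {
    if (v == insert_val) out.insert(out.end(), moved.begin(), moved.end());
    out.push_back(v);
  }
  return out;
}

int main() {
  // ops 5 and 6 are upstream of the group but placed after insert point 3
  std::vector<int> block = {1, 2, 3, 4, 5, 6};
  auto out = HoistBefore(block, {5, 6}, /*insert_val=*/3);
  assert((out == std::vector<int>{1, 2, 5, 6, 3, 4}));
}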
From c84c50c2e3b0ddde90fe005c1c5c4f873ad19c89 Mon Sep 17 00:00:00 2001
From: feifei-111 <2364819892@qq.com>
Date: Sun, 10 Mar 2024 13:31:51 +0000
Subject: [PATCH 099/114] update

---
 paddle/cinn/frontend/CMakeLists.txt          |   4 +-
 paddle/cinn/hlir/framework/pir/trivial_op.cc | 202 +++++++++++--------
 2 files changed, 118 insertions(+), 88 deletions(-)

diff --git a/paddle/cinn/frontend/CMakeLists.txt b/paddle/cinn/frontend/CMakeLists.txt
index 3360b9620edb5..9171de8f62769 100755
--- a/paddle/cinn/frontend/CMakeLists.txt
+++ b/paddle/cinn/frontend/CMakeLists.txt
@@ -10,8 +10,8 @@ gather_srcs(
   op_mapper_registry.cc
   paddle_model_convertor.cc
   program_pass.cc
-  optimize.cc
-  group_pattern_util.cc)
+  optimize.cc)
+  # group_pattern_util.cc)
 
 if(NOT WITH_CUDA)
   cinn_cc_test(
diff --git a/paddle/cinn/hlir/framework/pir/trivial_op.cc b/paddle/cinn/hlir/framework/pir/trivial_op.cc
index 16f3c9f76786d..3d8a45f495c66 100644
--- a/paddle/cinn/hlir/framework/pir/trivial_op.cc
+++ b/paddle/cinn/hlir/framework/pir/trivial_op.cc
@@ -279,33 +279,73 @@ ir::Expr TrivialFusion(ir::Expr upper, ir::Expr down) {
   return fused.GetFuncBody();
 }
 
+
+void CheckFusionInputValid(const std::vector& op_compute_bodies,
+                           const std::vector& op_patterns) {
+  if (VLOG_IS_ON(4)) {
+    for (const auto& func : op_compute_bodies) {
+      VLOG(4) << "TrivialOpFusion: {FuncBody is} :" << func;
+    }
+    for (const auto& op_ptn : op_patterns) {
+      VLOG(4) << "OpPattern is :" << op_ptn;
+    }
+  }
+  VLOG(4) << " op_patterns.size() = " << op_compute_bodies.size();
+  VLOG(4) << "op_compute_bodies.size() = " << op_patterns.size();
+  PADDLE_ENFORCE_EQ(
+      op_patterns.size(), op_compute_bodies.size(), "ops and size not equal");
+}
+
 struct FusionNode {
   // Function bodies losses the kind information which needed in trivialop
   // fusion.
   std::vector op_compute_body;
   OpPatternKind op_pattern;
-  std::vector<::pir::Operator*> output_ops;
+  ::pir::Operation* expr_related_op;
 
-  std::unordered_map upstream;
-  std::unordered_map downstream;
+  std::unordered_map upstream;
+  std::unordered_map downstream;
 
   explicit FusionNode(ir::Expr op_compute_body, OpPatternKind op_pattern)
-      : op_compute_body(op_compute_body), op_pattern(op_pattern) {}
+      : op_compute_body({op_compute_body}), op_pattern(op_pattern) {}
+
+  void replace_topo_structure_of_fused_nodes(FusionNode* fused_up_node, FusionNode* fused_down_node){
+    upstream.insert(fused_up_node->upstream.begin(), fused_up_node->upstream.end());
+    upstream.insert(fused_down_node->upstream.begin(), fused_down_node->upstream.end());
+    upstream.erase(fused_up_node);
+
+    downstream.insert(fused_up_node->downstream.begin(), fused_up_node->downstream.end());
+    downstream.insert(fused_down_node->downstream.begin(), fused_down_node->downstream.end());
+    downstream.erase(fused_down_node);
 
-  void init_topo_info(FusionNode* upstream_node, FusionNode* downstream_node){
-    upstream.insert(upstream.end(), upstream_node.upstream.begin(), upstream_node.upstream.end());
-    upstream.insert(upstream.end(), downstream_node.upstream.begin(), downstream_node.upstream.end());
-    upstream.erase(upstream_node);
+    expr_related_op = fused_down_node->expr_related_op;
 
-    downstream.insert(downstream.end(), upstream_node.upstream.begin(), upstream_node.upstream.end());
-    downstream.insert(downstream.end(), downstream_node.upstream.begin(), downstream_node.upstream.end());
-    downstream.erase(downstream_node);
+    for (const auto& pair_data: upstream){
+      FusionNode* upstream_node = pair_data.first;
+      ::pir::Value related_value = pair_data.second;
+      if (upstream_node->downstream.find(fused_up_node) != upstream_node->downstream.end()){
+        upstream_node->downstream.erase(fused_up_node);
+        upstream_node->downstream[this] = related_value;
+      }
+      if (upstream_node->downstream.find(fused_down_node) != upstream_node->downstream.end()){
+        upstream_node->downstream.erase(fused_down_node);
+        upstream_node->downstream[this] = related_value;
+      }
+    }
 
-    output_ops.insert(output_ops.end(), upstream_node.output_ops.begin(), upstream_node.output_ops.end());
-    output_ops.insert(output_ops.end(), downstream_node.output_ops.begin(), downstream_node.output_ops.end());
-    upstream_node->downstream[downstream_node].defining_op();
-    output_ops.erase();
+    for (const auto& pair_data: downstream){
+      FusionNode* downstream_node = pair_data.first;
+      ::pir::Value related_value = pair_data.second;
+      if (downstream_node->upstream.find(fused_up_node) != downstream_node->upstream.end()){
+        downstream_node->upstream.erase(fused_up_node);
+        downstream_node->upstream[this] = related_value;
+      }
+      if (downstream_node->upstream.find(fused_down_node) != downstream_node->upstream.end()){
+        downstream_node->upstream.erase(fused_down_node);
+        downstream_node->upstream[this] = related_value;
+      }
+    }
   }
 };
@@ -318,51 +358,51 @@ struct FusionGraph {
 
     // shardable_axes_ = InferShardableAxes(ops);
 
-    const auto& op_patterns = trivial_fusion_detail::GetOpPatternKindVector(ops);
-    trivial_fusion_detail::CheckFusionInputValid(op_compute_bodies, op_patterns);
+    const auto& op_patterns = GetOpPatternKindVector(ops);
+    CheckFusionInputValid(op_compute_bodies, op_patterns);
 
     std::unordered_map<::pir::Operation*, FusionNode*> op_to_node_map;
 
     for (int i=0; iisa)
-        continue;
       FusionNode* node = new FusionNode(op_compute_bodies[i], op_patterns[i]);
       op_to_node_map[ops[i]] = node;
       all_fusion_nodes_.emplace(node);
-      node->output_op.emplace_back(ops[i]);
+      node->expr_related_op = ops[i];
     }
 
-    for (const ::pir::Operation* op : ops){
-      if (op->isa)
-        continue;
-      FusionNode* node = op_to_node_map[op];
+    for (::pir::Operation* op : ops){
+      FusionNode* cur_node = op_to_node_map[op];
 
       // add upstream nodes
       for (int i = 0; i < op->num_operands(); ++i){
-        pir::Value input_value = op->operand_source(i);
-        const ::pir::Operation* input_op = input_value.defining_op();
+        ::pir::Value related_value = op->operand_source(i);
+        ::pir::Operation* input_op = related_value.defining_op();
         if (op_to_node_map.find(input_op) != op_to_node_map.end()){
-          node->upstream[op_to_node_map[input_op]] = input_value;
+          FusionNode* upstream_node = op_to_node_map[input_op];
+          cur_node->upstream[upstream_node] = related_value;
+          upstream_node->downstream[cur_node] = related_value;
        }
      }
 
      // add downstream nodes
      for (int i = 0; i < op->num_results(); ++i) {
-        pir::Value output_value = op->result(i);
-        for (auto consumer_it = output_value.use_begin(); consumer_it != output_value.use_end(); ++consumer_it) {
-          const auto* output_op = consumer_it->owner();
+        ::pir::Value related_value = op->result(i);
+        for (auto consumer_it = related_value.use_begin(); consumer_it != related_value.use_end(); ++consumer_it) {
+          ::pir::Operation* output_op = consumer_it->owner();
          if (op_to_node_map.find(output_op) != op_to_node_map.end()){
-            node->downstream[op_to_node_map[output_op]]= output_value;
+            FusionNode* downstream_node = op_to_node_map[output_op];
+            cur_node->downstream[downstream_node]= related_value;
+            downstream_node->upstream[cur_node] = related_value;
          }
        }
      }
 
-      if (node->upstream.size() == 0){
-        entrance_nodes_.emplace(node);
+      if (cur_node->upstream.size() == 0){
+        entrance_nodes_.emplace(cur_node);
      }
 
-      if (node->downstream.size() == 0){
-        exit_nodes_.emplace(node);
+      if (cur_node->downstream.size() == 0){
+        exit_nodes_.emplace(cur_node);
      }
    }
  }
@@ -379,34 +419,30 @@ struct FusionGraph {
   }
 
  private:
-  void trivial_op_fusion(){
-    std::queue candidates;
-    std::transform(
-      entrance_nodes_.begin(),
-      entrance_nodes_.end(),
-      std::inserter(bfs_candidates),
-      [](FusionNode* node){return node;}
-    );
-
-    while(!candidates.empty()){
-      FusionNode* upstream = bfs_candidates.front();
-      candidates.pop();
-
-      bool need_fusion = IsTrivialKind(upstream);
+  FusionNode* find_trivial_node(){
+    for (FusionNode* node: all_fusion_nodes_){
+      if (IsTrivialKind(node->op_pattern) && node->downstream.size() > 0){
+        CHECK(node->op_compute_body.size() == 1);
+        return node;
+      }
+    }
+    return nullptr;
+  }
 
-      for (const auto& pair_data : cur_node->downstream){
+  void trivial_op_fusion(){
+    FusionNode* upstream;
+    while((upstream = find_trivial_node()) != nullptr){
+      for (const auto& pair_data : upstream->downstream){
        FusionNode* downstream = pair_data.first;
-        if (need_fusion){
-          FusionNode* new_node = new FusionNode(
-            TrivialFusion(upstream_node.op_compute_body,downstream_node.op_compute_body),
-            downstream.op_pattern
-          );
-          new_node.init_topo_info(upstream, downstream);
-          candidates.push(new_node);
-          remove_fusion_node(downstream);
-        }else(
-          candidates.push(downstream);
-        )
+        CHECK(downstream->op_compute_body.size() == 1);
+
+        FusionNode* new_node = new FusionNode(
+          TrivialFusion(upstream->op_compute_body[0], downstream->op_compute_body[0]),
+          downstream->op_pattern
+        );
+        new_node->replace_topo_structure_of_fused_nodes(upstream, downstream);
+        append_fusion_node(new_node);
+        remove_fusion_node(downstream);
      }
      remove_fusion_node(upstream);
    }
  }
@@ -415,7 +451,7 @@ struct FusionGraph {
   std::vector get_expr_results() {
     std::vector output_exprs;
     for (const auto& node : all_fusion_nodes_) {
-      output_exprs.push_back(node->op_compute_body);
+      output_exprs.insert(output_exprs.end(), node->op_compute_body.begin(), node->op_compute_body.end());
     }
     return output_exprs;
   }
@@ -433,14 +469,24 @@ struct FusionGraph {
     delete node;
   }
 
+  void append_fusion_node(FusionNode* node){
+    all_fusion_nodes_.emplace(node);
+    if (node->upstream.size() == 0){
+      entrance_nodes_.emplace(node);
+    }
+
+    if (node->downstream.size() == 0){
+      exit_nodes_.emplace(node);
+    }
+  }
+
  private:
  std::unordered_set all_fusion_nodes_;
  std::unordered_set entrance_nodes_;
  std::unordered_set exit_nodes_;
 
-  std::unordered_map shardable_axes_;
-
-}
+  // std::unordered_map<::pir::Value, ShardableAxes> shardable_axes_;
+};
 
 std::vector ConstructFusionNodeElementwisely(
     const std::vector& op_compute_bodies,
@@ -457,8 +503,8 @@ bool IsAdjecentInjectiveBetween(const FusionNode& upstream_node,
   return upstream_node.op_compute_body != downstream_node.op_compute_body &&
          IsTrivialKind(upstream_node.op_pattern) &&
         IsTrivialKind(downstream_node.op_pattern) &&
-         IsAdjecent(upstream_node.op_compute_body,
-                    downstream_node.op_compute_body);
+         IsAdjecent(upstream_node.op_compute_body[0],
+                    downstream_node.op_compute_body[0]);
 }
 
 std::optional FindUpstreamNodeUsedByOthers(
@@ -483,8 +529,8 @@ std::vector FuseEachUpstreamUse(
       std::back_inserter(fused_nodes),
       [&](const FusionNode& downstream_node) {
        if (IsAdjecentInjectiveBetween(upstream_node, downstream_node)) {
-          return FusionNode(TrivialFusion(upstream_node.op_compute_body,
-                                          downstream_node.op_compute_body),
+          return FusionNode(TrivialFusion(upstream_node.op_compute_body[0],
+                                          downstream_node.op_compute_body[0]),
                            OpPatternKind::kInjective);
        }
        return downstream_node;
@@ -519,27 +565,11 @@ std::vector ExtractBodiesFromFusionNodes(
     const std::vector& fusion_nodes) {
   std::vector output_exprs;
   for (const auto& node : fusion_nodes) {
-    output_exprs.push_back(node.op_compute_body);
+    output_exprs.emplace_back(node.op_compute_body[0]);
   }
   return output_exprs;
 }
 
-void CheckFusionInputValid(const std::vector& op_compute_bodies,
-                           const std::vector& op_patterns) {
-  if (VLOG_IS_ON(4)) {
-    for (const auto& func : op_compute_bodies) {
-      VLOG(4) << "TrivialOpFusion: {FuncBody is} :" << func;
-    }
-    for (const auto& op_ptn : op_patterns) {
-      VLOG(4) << "OpPattern is :" << op_ptn;
-    }
-  }
-  VLOG(4) << " op_patterns.size() = " << op_compute_bodies.size();
-  VLOG(4) << "op_compute_bodies.size() = " << op_patterns.size();
-  PADDLE_ENFORCE_EQ(
-      op_patterns.size(), op_compute_bodies.size(), "ops and size not equal");
-}
-
 } // namespace trivial_fusion_detail
 
 std::vector TrivialOpFusion(
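The driver introduced here repeatedly picks any trivial node that still has consumers, fuses its body into each consumer, rewires the topology, and removes the producer. A compact stand-in for that worklist loop (plain structs, string concatenation standing in for the real fusion):

#include <cassert>
#include <set>
#include <string>

struct Node {
  bool trivial;
  std::string body;
  std::set<Node*> downstream;
};

// Fuse every trivial producer into all of its consumers until none is left.
// "Fusion" here just concatenates bodies to make the effect visible.
void FuseTrivial(std::set<Node*>* graph) {
  auto pick = [&]() -> Node* {
    for (Node* n : *graph)
      if (n->trivial && !n->downstream.empty()) return n;
    return nullptr;
  };
  while (Node* up = pick()) {
    for (Node* down : up->downstream) down->body = up->body + "+" + down->body;
    graph->erase(up);  // the producer is fully absorbed by its consumers
  }
}

int main() {
  Node a{true, "a", {}}, b{false, "b", {}};
  a.downstream = {&b};
  std::set<Node*> graph = {&a, &b};
  FuseTrivial(&graph);
  assert(graph.size() == 1 && b.body == "a+b");
}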
From 2f0c3845b01915cef931eb1741b524c3f54e8dd3 Mon Sep 17 00:00:00 2001
From: feifei-111 <2364819892@qq.com>
Date: Sun, 10 Mar 2024 17:31:42 +0000
Subject: [PATCH 100/114] update

---
 paddle/cinn/hlir/framework/pir/trivial_op.cc | 438 +++++++++++--------
 1 file changed, 264 insertions(+), 174 deletions(-)

diff --git a/paddle/cinn/hlir/framework/pir/trivial_op.cc b/paddle/cinn/hlir/framework/pir/trivial_op.cc
index 3d8a45f495c66..14e1ce86bd3c8 100644
--- a/paddle/cinn/hlir/framework/pir/trivial_op.cc
+++ b/paddle/cinn/hlir/framework/pir/trivial_op.cc
@@ -40,6 +40,87 @@ namespace framework {
 namespace pir {
 namespace trivial_fusion_detail {
 
+std::vector GetOpPatternKindVector(
+    const std::vector<::pir::Operation*>& ops) {
+  const auto& op_pattern_map =
+      Operator::GetAttrs("OpPattern");
+  std::vector op_patterns;
+  const auto ConvertToPattern = [&op_pattern_map](const ::pir::Operation* op) {
+    const std::string cinn_op_name = CompatibleInfo::OpName(*op);
+    const hlir::framework::Operator* cinn_op = Operator::Get(cinn_op_name);
+    return op_pattern_map[cinn_op];
+  };
+  std::transform(ops.begin(),
+                 ops.end(),
+                 std::back_inserter(op_patterns),
+                 ConvertToPattern);
+  return op_patterns;
+}
+
+template 
+void SequenceMutator(const std::vector& as, C* acc, const Func& mutator) {
+  VLOG(4) << "SequenceTransform Init: " << acc;
+  for (int i = 0; i < as.size(); ++i) {
+    mutator(as[i], acc);
+    VLOG(4) << "SequenceTransform Iter: " << acc;
+  }
+}
+
+static bool IsAdjecent(const ir::Expr& upstream, const ir::Expr& downstream) {
+  // 1. Get inputs / output from Expr, then we can tell whether they are
+  // adjecent.
+  std::set upstream_stores =
+      cinn::ir::ir_utils::CollectIRNodesWithoutTensor(
+          upstream, [](const Expr* expr) {
+            return expr->As() &&
+                   expr->As()->is_addr_tensor();
+          });
+  // don't support multi-output yet.
+  PADDLE_ENFORCE(upstream_stores.size() == 1,
+                 "The expr of injective should have only one store");
+
+  std::set downstream_loads =
+      cinn::ir::ir_utils::CollectIRNodesWithoutTensor(
+          downstream, [](const Expr* expr) {
+            return expr->As() &&
+                   expr->As()->is_addr_tensor();
+          });
+
+  for (const auto& upstream_store : upstream_stores) {
+    for (const auto& downstream_load : downstream_loads) {
+      if (upstream_store.As()->tensor.As()->name ==
+          downstream_load.As()->tensor.As()->name) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+inline bool IsTrivialKind(OpPatternKind kind) {
+  return kind == OpPatternKind::kElementWise ||
+         kind == OpPatternKind::kBroadcast || kind == OpPatternKind::kInjective;
+}
+
+
+void CheckFusionInputValid(const std::vector& op_compute_bodies,
+                           const std::vector& op_patterns) {
+  if (VLOG_IS_ON(4)) {
+    for (const auto& func : op_compute_bodies) {
+      VLOG(4) << "TrivialOpFusion: {FuncBody is} :" << func;
+    }
+    for (const auto& op_ptn : op_patterns) {
+      VLOG(4) << "OpPattern is :" << op_ptn;
+    }
+  }
+  VLOG(4) << " op_patterns.size() = " << op_compute_bodies.size();
+  VLOG(4) << "op_compute_bodies.size() = " << op_patterns.size();
+  PADDLE_ENFORCE_EQ(
+      op_patterns.size(), op_compute_bodies.size(), "ops and size not equal");
+}
+
+namespace ComposeUtils{
+
 struct MappingLoadStoreExprToDestExprMutator : public ir::IRMutator<> {
   explicit MappingLoadStoreExprToDestExprMutator(const ir::Expr& source,
                                                  const ir::Expr& dest)
@@ -70,48 +151,84 @@ struct MappingLoadStoreExprToDestExprMutator : public ir::IRMutator<> {
   ir::Expr dest_;
 };
 
-std::vector GetOpPatternKindVector(
-    const std::vector<::pir::Operation*>& ops) {
-  const auto& op_pattern_map =
-      Operator::GetAttrs("OpPattern");
-  std::vector op_patterns;
-  const auto ConvertToPattern = [&op_pattern_map](const ::pir::Operation* op) {
-    const std::string cinn_op_name = CompatibleInfo::OpName(*op);
-    const hlir::framework::Operator* cinn_op = Operator::Get(cinn_op_name);
-    return op_pattern_map[cinn_op];
-  };
-  std::transform(ops.begin(),
-                 ops.end(),
-                 std::back_inserter(op_patterns),
-                 ConvertToPattern);
-  return op_patterns;
+static Expr CopyedReplaceExpr(const Expr& source,
+                              const std::vector& replaced,
+                              const std::vector& candidates) {
+  CHECK_EQ(replaced.size(), candidates.size())
+      << "In ReplaceExpr, the size of Vars to be replaced must be equal to "
+         "the "
+         "size of cadidate Exprs! Please check.";
+  auto copyed_source = ir::ir_utils::IRCopy(source);
+  if (replaced.empty()) return copyed_source;
+  std::map replacing_map;
+  for (int i = 0; i < replaced.size(); ++i) {
+    // If the Var to be replaced is equal to the candidate, we skip it.
+    if (candidates[i].is_var() && candidates[i].as_var_ref() == replaced[i])
+      continue;
+    replacing_map[replaced[i]] = candidates[i];
+  }
+  ir::MappingVarToExprMutator mapper(replacing_map);
+  mapper(&copyed_source);
+  return copyed_source;
 }
 
-template 
-void SequenceMutator(const std::vector& as, C* acc, const Func& mutator) {
-  VLOG(4) << "SequenceTransform Init: " << acc;
-  for (int i = 0; i < as.size(); ++i) {
-    mutator(as[i], acc);
-    VLOG(4) << "SequenceTransform Iter: " << acc;
-  }
+static void SubstitudeTargetExprWithDestExpr(const ir::Expr& source,
+                                             const ir::Expr& dest,
+                                             ir::Expr* body) {
+  VLOG(4) << "Start SubstitudeTargetExprWithDestExpr";
+  MappingLoadStoreExprToDestExprMutator mapper(source, dest);
+  mapper(body);
+  VLOG(4) << "End SubstitudeTargetExprWithDestExpr";
 }
 
+static ir::Expr SubstitudeIndexVector(const Expr& source,
+                                      const std::vector& load_vars,
+                                      const std::vector& indices) {
+  return CopyedReplaceExpr(source, load_vars, indices);
+}
+
+template 
+static void ReplaceDownstreamLoadExprWithUpstreamComputeBody(
+    const FusionOp& upstream,
+    const ir::Expr& downstream_load_expr,
+    ir::Expr* downstream_body) {
+  ComposeUtils::SubstitudeTargetExprWithDestExpr(
+      downstream_load_expr,
+      ComposeUtils::SubstitudeIndexVector(upstream.GetStoreValue(),
+                                          upstream.GetOutputIters(), downstream_load_expr.As()->indices),
+      downstream_body);
+}
 
+std::set GetStoreFromBody(const ir::Expr& body) {
+  std::set store_tensor_exprs =
+      cinn::ir::ir_utils::CollectIRNodesWithoutTensor(
+          body, [](const Expr* expr) {
+            return expr->As() &&
+                   expr->As()->is_addr_tensor();
+          });
+
+  return store_tensor_exprs;
+}
+
+}
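CopyedReplaceExpr is the core primitive of the whole file: copy an expression, then rewrite the producer's output iterators into the consumer's load indices. The same idea on a toy expression tree (illustrative only; the CINN IR and its mutators are far richer):

#include <cassert>
#include <map>
#include <memory>
#include <string>

// A tiny expression: either a variable leaf or a binary node.
struct Expr {
  std::string var;                 // non-empty => leaf variable
  char op = 0;
  std::shared_ptr<Expr> lhs, rhs;  // set when op != 0
};

// Copy `e`, substituting variables according to `sub` (copy-then-mutate,
// like CopyedReplaceExpr: the original expression is left untouched).
std::shared_ptr<Expr> Replace(const std::shared_ptr<Expr>& e,
                              const std::map<std::string, std::string>& sub) {
  auto out = std::make_shared<Expr>(*e);
  if (!e->var.empty()) {
    auto it = sub.find(e->var);
    if (it != sub.end()) out->var = it->second;  // substitute the leaf
  } else {
    out->lhs = Replace(e->lhs, sub);
    out->rhs = Replace(e->rhs, sub);
  }
  return out;
}

std::string Print(const std::shared_ptr<Expr>& e) {
  if (!e->var.empty()) return e->var;
  return "(" + Print(e->lhs) + e->op + Print(e->rhs) + ")";
}

int main() {
  // body = i + j, then remap the producer iterators {i->x, j->y}
  auto i = std::make_shared<Expr>(Expr{"i"});
  auto j = std::make_shared<Expr>(Expr{"j"});
  auto body = std::make_shared<Expr>(Expr{"", '+', i, j});
  assert(Print(Replace(body, {{"i", "x"}, {"j", "y"}})) == "(x+y)");
}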
 
 struct TrivialOp {
- private:
-  ir::Expr func_body;
 
  public:
+  explicit TrivialOp(const ir::Expr& origin_func_body) {
+    func_body = ir::ir_utils::IRCopy(origin_func_body);
+  }
+
  ir::Expr GetStoreValue() const {
-    return GetStoreFromBody(func_body).As()->value;
+    return GetSingleStoreExpr(func_body).As()->value;
  }
 
  ir::Expr* GetStoreValuePointer() const {
-    return &GetStoreFromBody(func_body).As()->value;
+    return &GetSingleStoreExpr(func_body).As()->value;
  }
 
  std::vector GetOutputIters() const {
    std::vector vars;
-    const auto& indices = GetStoreFromBody(func_body).As()->indices;
+    const auto& indices = GetSingleStoreExpr(func_body).As()->indices;
    std::transform(indices.begin(),
                   indices.end(),
                   std::back_inserter(vars),
@@ -119,14 +236,10 @@ struct TrivialOp {
    return vars;
  }
 
-  ir::Expr GetFuncBody() { return func_body; }
+  ir::Expr GetFuncBody() const { return func_body; }
 
  ir::Tensor GetOutputTensor() const {
-    return GetStoreFromBody(func_body).As()->tensor.as_tensor_ref();
-  }
-
-  explicit TrivialOp(const ir::Expr& origin_func_body) {
-    func_body = ir::ir_utils::IRCopy(origin_func_body);
+    return GetSingleStoreExpr(func_body).As()->tensor.as_tensor_ref();
  }
 
  std::vector GetEachTensorLoadExpr(const ir::Tensor& tensor) const {
@@ -144,156 +257,122 @@ struct TrivialOp {
    return std::vector(load_exprs.begin(), load_exprs.end());
  }
 
-  static TrivialOp Compose(const TrivialOp& upstream,
-                           const ir::Tensor replaced_tensor,
-                           const TrivialOp& downstream) {
-    // ADT :
-    //   Compose :: TrivialOp -> tToReplace Tensor -> TrivialOp -> TrivialOp
-    VLOG(4) << "Compose start:";
-    VLOG(4) << "connected tensor is:" << replaced_tensor;
-    VLOG(4) << "store value is :" << downstream.GetStoreValue();
-    TrivialOp ret(ir::ir_utils::IRCopy(downstream.func_body));
-    SequenceMutator(
-        ret.GetEachTensorLoadExpr(replaced_tensor),
-        ret.GetStoreValuePointer(),
-        [&](const ir::Expr& downstream_load_expr, ir::Expr* downstream_body) {
-          ReplaceDownstreamLoadExprWithUpstreamComputeBody(
-              upstream, downstream_load_expr, downstream_body);
-        });
-    VLOG(4) << "After mutate, store_value is: " << ret.func_body;
-    return ret;
+ private:
+  ir::Expr func_body;
+
+  ir::Expr GetSingleStoreExpr(const ir::Expr& body) const{
+    const auto& store_tensor_exprs = ComposeUtils::GetStoreFromBody(body);
+    PADDLE_ENFORCE(store_tensor_exprs.size() == 1,
+                   "TrivialOp must store for output only once.");
+    return *(store_tensor_exprs.begin());
  }
 
-  static void SubstitudeTargetExprWithDestExpr(const ir::Expr& source,
-                                               const ir::Expr& dest,
-                                               ir::Expr* body) {
-    VLOG(4) << "Start SubstitudeTargetExprWithDestExpr";
-    MappingLoadStoreExprToDestExprMutator mapper(source, dest);
-    mapper(body);
-    VLOG(4) << "End SubstitudeTargetExprWithDestExpr";
+};
+
+struct ReduceOp {
+ public:
+  explicit ReduceOp(const ir::Expr& origin_func_body) {
+    func_body = ir::ir_utils::IRCopy(origin_func_body);
  }
 
-  static void ReplaceDownstreamLoadExprWithUpstreamComputeBody(
-      const TrivialOp& upstream,
-      const ir::Expr& downstream_load_expr,
-      ir::Expr* downstream_body) {
-    SubstitudeTargetExprWithDestExpr(
-        downstream_load_expr,
-        SubstitudeIndexVector(downstream_load_expr.As()->indices,
-                              upstream),
-        downstream_body);
+  ir::Expr GetStoreValue() const {
+    return GetSingleStoreExpr(func_body).As()->value;
  }
 
-  static ir::Expr SubstitudeIndexVector(const std::vector& indices,
-                                        const TrivialOp& op) {
-    // VLOG(4) << "SubstitudeIndexVector: " <<
-    // CopyedReplaceExpr(op.GetStoreValue(), op.GetOutputIters(), indices);
-    return CopyedReplaceExpr(op.GetStoreValue(), op.GetOutputIters(), indices);
+  ir::Expr* GetStoreValuePointer() const {
+    return &GetSingleStoreExpr(func_body).As()->value;
  }
 
- private:
-  static ir::Expr GetStoreFromBody(const ir::Expr& body) {
-    std::set store_tensor_exprs =
-        cinn::ir::ir_utils::CollectIRNodesWithoutTensor(
-            body, [](const Expr* expr) {
-              return expr->As() &&
-                     expr->As()->is_addr_tensor();
-            });
-    PADDLE_ENFORCE(store_tensor_exprs.size() == 1,
-                   "TrivialOp must store for output only once.");
-    return (*store_tensor_exprs.begin());
-  }
-  static Expr CopyedReplaceExpr(const Expr& source,
-                                const std::vector& replaced,
-                                const std::vector& candidates) {
-    CHECK_EQ(replaced.size(), candidates.size())
-        << "In ReplaceExpr, the size of Vars to be replaced must be equal to "
-           "the "
-           "size of cadidate Exprs! Please check.";
-    auto copyed_source = ir::ir_utils::IRCopy(source);
-    if (replaced.empty()) return copyed_source;
-    std::map replacing_map;
-    for (int i = 0; i < replaced.size(); ++i) {
-      // If the Var to be replaced is equal to the candidate, we skip it.
-      if (candidates[i].is_var() && candidates[i].as_var_ref() == replaced[i])
-        continue;
-      replacing_map[replaced[i]] = candidates[i];
-    }
-    ir::MappingVarToExprMutator mapper(replacing_map);
-    mapper(&copyed_source);
-    return copyed_source;
+  std::vector GetOutputIters() const {
+    std::vector vars;
+    const auto& indices = GetSingleStoreExpr(func_body).As()->indices;
+    std::transform(indices.begin(),
+                   indices.end(),
+                   std::back_inserter(vars),
+                   [](const ir::Expr& expr) { return expr.as_var_ref(); });
+    return vars;
  }
-};
 
-struct ReduceOp {
- private:
-  ir::Expr func_body;
+  ir::Expr GetFuncBody() const { return func_body; }
 
- public:
-};
+  ir::Tensor GetOutputTensor() const {
+    return GetSingleStoreExpr(func_body).As()->tensor.as_tensor_ref();
+  }
 
-static bool IsAdjecent(const ir::Expr& upstream, const ir::Expr& downstream) {
-  // 1. Get inputs / output from Expr, then we can tell whether they are
-  // adjecent.
-  std::set upstream_stores =
-      cinn::ir::ir_utils::CollectIRNodesWithoutTensor(
-          upstream, [](const Expr* expr) {
-            return expr->As() &&
-                   expr->As()->is_addr_tensor();
-          });
-  // don't support multi-output yet.
-  PADDLE_ENFORCE(upstream_stores.size() == 1,
-                 "The expr of injective should have only one store");
+  std::vector GetEachTensorLoadExpr(const ir::Tensor& tensor) const {
+    VLOG(4) << "Start GetEachTensorLoadExpr: " << tensor;
+    std::set load_exprs = cinn::ir::ir_utils::CollectIRNodesWithoutTensor(
+        GetStoreValue(), [&tensor](const Expr* expr) {
+          return expr->As() &&
+                 expr->As()->is_addr_tensor() &&
+                 expr->As()->tensor.as_tensor_ref()->name ==
+                     tensor->name;
+        });
+    for (auto& t : load_exprs) {
+      VLOG(4) << "GetEachTensorLoadExpr: " << t << " " << t.ptr();
+    }
+    return std::vector(load_exprs.begin(), load_exprs.end());
+  }
 
-  std::set downstream_loads =
-      cinn::ir::ir_utils::CollectIRNodesWithoutTensor(
-          downstream, [](const Expr* expr) {
-            return expr->As() &&
-                   expr->As()->is_addr_tensor();
-          });
+ private:
+  ir::Expr func_body;
 
-  for (const auto& upstream_store : upstream_stores) {
-    for (const auto& downstream_load : downstream_loads) {
-      if (upstream_store.As()->tensor.As()->name ==
-          downstream_load.As()->tensor.As()->name) {
-        return true;
-      }
+  ir::Expr GetSingleStoreExpr(const ir::Expr& body) const{
+    std::vector store_tensor_exprs;
+    for(const ir::Expr& store_expr: ComposeUtils::GetStoreFromBody(body)){
+      std::string store_name = store_expr.As()->tensor.As()->name;
+      if (store_name.find("reduce_init") != std::string::npos)
+        continue;
+      store_tensor_exprs.emplace_back(store_expr);
    }
+
+    PADDLE_ENFORCE(store_tensor_exprs.size() == 1,
+                   "ReduceOp must store for output only once.");
+    return *(store_tensor_exprs.begin());
  }
-  return false;
-}
+};
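ReduceOp's single-store lookup has one twist over TrivialOp's: reduction bodies carry a reduce_init store alongside the real output store, and the init store must be skipped before asserting uniqueness. The filter is just a substring test on the store tensor's name; a stand-alone rendition (assumed naming convention):

#include <cassert>
#include <string>
#include <vector>

// Pick the unique "real" output store, ignoring *reduce_init stores.
std::string SingleOutputStore(const std::vector<std::string>& store_names) {
  std::string found;
  for (const auto& name : store_names) {
    if (name.find("reduce_init") != std::string::npos) continue;  // skip init
    assert(found.empty() && "must store for output only once");
    found = name;
  }
  return found;
}

int main() {
  assert(SingleOutputStore({"var_1__reduce_init", "var_1"}) == "var_1");
}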
 
-bool IsTrivialKind(OpPatternKind kind) {
-  return kind == OpPatternKind::kElementWise ||
-         kind == OpPatternKind::kBroadcast || kind == OpPatternKind::kInjective;
-}
+ir::Expr TTFusion(ir::Expr upper, ir::Expr down) {
+  VLOG(4) << "TTFusion begin.";
+  TrivialOp upstream(upper);
+  TrivialOp downstream(down);
+  const auto& replaced_tensor = upstream.GetOutputTensor();
+  VLOG(4) << "connected tensor is:" << replaced_tensor;
+  VLOG(4) << "store value is :" << downstream.GetStoreValue();
+
+  TrivialOp fused(ir::ir_utils::IRCopy(downstream.GetFuncBody()));
+  SequenceMutator(
+      fused.GetEachTensorLoadExpr(replaced_tensor),
+      fused.GetStoreValuePointer(),
+      [&](const ir::Expr& downstream_load_expr, ir::Expr* downstream_body) {
+        ComposeUtils::ReplaceDownstreamLoadExprWithUpstreamComputeBody(
+            upstream, downstream_load_expr, downstream_body);
+      });
 
-ir::Expr TrivialFusion(ir::Expr upper, ir::Expr down) {
-  VLOG(4) << "TrivalFusion begin.";
-  TrivialOp upper_op(upper);
-  TrivialOp down_op(down);
-  VLOG(4) << "Compose begin.";
-  auto fused =
-      TrivialOp::Compose(upper_op, upper_op.GetOutputTensor(), down_op);
-  VLOG(4) << "TrivalFusion end:" << fused.GetFuncBody();
+  VLOG(4) << "After mutate, store_value is: " << fused.GetFuncBody();
+  VLOG(4) << "TTFusion end:" << fused.GetFuncBody();
  return fused.GetFuncBody();
 }
 
+ir::Expr TRFusion(ir::Expr upper, ir::Expr down) {
+  VLOG(4) << "TRFusion begin.";
+  TrivialOp upstream(upper);
+  ReduceOp downstream(down);
+  const auto& replaced_tensor = upstream.GetOutputTensor();
+  VLOG(4) << "connected tensor is:" << replaced_tensor;
+  VLOG(4) << "store value is :" << downstream.GetStoreValue();
+
+  ReduceOp fused(ir::ir_utils::IRCopy(downstream.GetFuncBody()));
+  SequenceMutator(
+      fused.GetEachTensorLoadExpr(replaced_tensor),
+      fused.GetStoreValuePointer(),
+      [&](const ir::Expr& downstream_load_expr, ir::Expr* downstream_body) {
+        ComposeUtils::ReplaceDownstreamLoadExprWithUpstreamComputeBody(
+            upstream, downstream_load_expr, downstream_body);
+      });
 
-void CheckFusionInputValid(const std::vector& op_compute_bodies,
-                           const std::vector& op_patterns) {
-  if (VLOG_IS_ON(4)) {
-    for (const auto& func : op_compute_bodies) {
-      VLOG(4) << "TrivialOpFusion: {FuncBody is} :" << func;
-    }
-    for (const auto& op_ptn : op_patterns) {
-      VLOG(4) << "OpPattern is :" << op_ptn;
-    }
-  }
-  VLOG(4) << " op_patterns.size() = " << op_compute_bodies.size();
-  VLOG(4) << "op_compute_bodies.size() = " << op_patterns.size();
-  PADDLE_ENFORCE_EQ(
-      op_patterns.size(), op_compute_bodies.size(), "ops and size not equal");
+  VLOG(4) << "TRFusion end:" << fused.GetFuncBody();
+  return fused.GetFuncBody();
 }
 
 struct FusionNode {
@@ -326,12 +405,11 @@ struct FusionNode {
      ::pir::Value related_value = pair_data.second;
      if (upstream_node->downstream.find(fused_up_node) != upstream_node->downstream.end()){
        upstream_node->downstream.erase(fused_up_node);
-        upstream_node->downstream[this] = related_value;
      }
      if (upstream_node->downstream.find(fused_down_node) != upstream_node->downstream.end()){
        upstream_node->downstream.erase(fused_down_node);
-        upstream_node->downstream[this] = related_value;
      }
+      upstream_node->downstream[this] = related_value;
    }
 
    for (const auto& pair_data: downstream){
@@ -339,12 +417,11 @@ struct FusionNode {
      ::pir::Value related_value = pair_data.second;
      if (downstream_node->upstream.find(fused_up_node) != downstream_node->upstream.end()){
        downstream_node->upstream.erase(fused_up_node);
-        downstream_node->upstream[this] = related_value;
      }
      if (downstream_node->upstream.find(fused_down_node) != downstream_node->upstream.end()){
        downstream_node->upstream.erase(fused_down_node);
-        downstream_node->upstream[this] = related_value;
      }
+      downstream_node->upstream[this] = related_value;
    }
  }
 
@@ -357,6 +434,7 @@ struct FusionGraph {
      const std::vector& op_compute_bodies){
 
    // shardable_axes_ = InferShardableAxes(ops);
+    VLOG(4) << "CreateFusionGraph";
 
    const auto& op_patterns = GetOpPatternKindVector(ops);
    CheckFusionInputValid(op_compute_bodies, op_patterns);
@@ -414,7 +492,7 @@ struct FusionGraph {
  }
 
  std::vector DoFusion(){
-    trivial_op_fusion();
+    fuse_trivial_node();
    return get_expr_results();
  }
 
 private:
@@ -429,17 +507,29 @@ struct FusionGraph {
    return nullptr;
  }
 
-  void trivial_op_fusion(){
+  void fuse_trivial_node(){
    FusionNode* upstream;
    while((upstream = find_trivial_node()) != nullptr){
-      for (const auto& pair_data : upstream->downstream){
+      while(!upstream->downstream.empty()){
+        const auto& pair_data = *(upstream->downstream.begin());
        FusionNode* downstream = pair_data.first;
+        upstream->downstream.erase(downstream);
+
        CHECK(downstream->op_compute_body.size() == 1);
 
-        FusionNode* new_node = new FusionNode(
-          TTFusion(upstream->op_compute_body[0], downstream->op_compute_body[0]),
-          downstream->op_pattern
-        );
+        FusionNode* new_node;
+        if (IsTrivialKind(downstream->op_pattern)){
+          new_node = new FusionNode(
+            TTFusion(upstream->op_compute_body[0], downstream->op_compute_body[0]),
+            downstream->op_pattern
+          );
+        }else{
+          new_node = new FusionNode(
+            TRFusion(upstream->op_compute_body[0], downstream->op_compute_body[0]),
+            downstream->op_pattern
+          );
+        }
+
        new_node->replace_topo_structure_of_fused_nodes(upstream, downstream);
        append_fusion_node(new_node);
        remove_fusion_node(downstream);
      }
@@ -529,7 +619,7 @@ std::vector FuseEachUpstreamUse(
      std::back_inserter(fused_nodes),
      [&](const FusionNode& downstream_node) {
        if (IsAdjecentInjectiveBetween(upstream_node, downstream_node)) {
-          return FusionNode(TrivialFusion(upstream_node.op_compute_body[0],
+          return FusionNode(TTFusion(upstream_node.op_compute_body[0],
                                          downstream_node.op_compute_body[0]),
                            OpPatternKind::kInjective);
        }

From cf96b675601d88e5548039b7a256707581dc6fd7 Mon Sep 17 00:00:00 2001
From: zyfncg
Date: Mon, 11 Mar 2024 10:07:30 +0800
Subject: [PATCH 101/114] fix bug of fuse shape ops to generate_shape (#62587)

---
 .../transforms/fuse_shape_ops_into_generate_shape_op_pass.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc
index 0b0d4b4de9ebc..2bcc35173f4b5 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/fuse_shape_ops_into_generate_shape_op_pass.cc
@@ -26,6 +26,7 @@
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
+#include "paddle/fluid/pir/transforms/transform_general_functions.h"
 #include "paddle/pir/include/core/builtin_dialect.h"
 #include "paddle/pir/include/dialect/shape/utils/dim_expr.h"
 #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h"
@@ -57,8 +58,8 @@ std::vector FindSourceDenseTensorOfDimTensor(
    // find input dimension tensor;
    pir::Operation* owner = value.defining_op();
    if (owner == nullptr) return;
-    for (int i = 0; i < owner->num_operands(); ++i) {
-      Visit(owner->operand_source(i));
+    for (auto input_value : pir::GetUsedExternalValue(*owner)) {
+      Visit(input_value);
    }
  };
  const auto& IsDimTensorOrListDimExpr = symbol::Overloaded{

From d45efa20ece507bbba3f0652c88ba01c24176c29 Mon Sep 17 00:00:00 2001
From: 6clc
Date: Mon, 11 Mar 2024 10:17:59 +0800
Subject: [PATCH 102/114] cinn(op): fix broadcast op (#62594)

---
 paddle/cinn/hlir/pe/broadcast.cc | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/paddle/cinn/hlir/pe/broadcast.cc b/paddle/cinn/hlir/pe/broadcast.cc
index 9ab00fc8ce5da..2348546149669 100644
--- a/paddle/cinn/hlir/pe/broadcast.cc
+++ b/paddle/cinn/hlir/pe/broadcast.cc
@@ -23,6 +23,7 @@
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
+#include "paddle/common/errors.h"
 
 PD_DECLARE_bool(cinn_bucket_compile);
 
 namespace cinn {
@@ -376,16 +377,20 @@ Tensor BroadcastTo(const Tensor& A,
                    const std::vector& out_shape,
                    const std::string& out_name) {
   auto A_shape = A->shape;
-  CHECK_EQ(A_shape.size(), out_shape.size())
-      << "broadcast_to's out_shape's size should be same with the input "
-         "shape's size";
+  PADDLE_ENFORCE_GE(
+      out_shape.size(),
+      A_shape.size(),
+      ::common::errors::InvalidArgument(
+          "broadcast_to's out_shape's size should be GreaterEqual "
+          "with the input shape's size"));
 
  return Compute(
      ToCinnExprs(out_shape),
      [=](const std::vector& indice) {
        std::vector broadcast_indice;
-        for (int idx = 0; idx < out_shape.size(); ++idx) {
-          ir::Expr a_shape_i = A_shape[idx];
+        int out_A_offset = out_shape.size() - A_shape.size();
+        for (int idx = out_A_offset; idx < out_shape.size(); ++idx) {
+          ir::Expr a_shape_i = A_shape[idx - out_A_offset];
          if (MathEqual(a_shape_i, ir::Expr(1))) {
            broadcast_indice.push_back(ir::Expr(0));
          } else if (MathEqual(a_shape_i, out_shape[idx])) {
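The broadcast fix relaxes the old same-rank requirement: out_shape may now have higher rank than the input, with input dimensions right-aligned against the output and size-1 dimensions reading index 0. A plain-integer model of the new index mapping (assumed semantics, numeric shapes only):

#include <cassert>
#include <vector>

// Map an output index to the input index for broadcast_to, right-aligning
// the input shape against the (possibly longer) output shape.
std::vector<int> BroadcastIndex(const std::vector<int>& a_shape,
                                const std::vector<int>& out_shape,
                                const std::vector<int>& out_idx) {
  const int offset = static_cast<int>(out_shape.size() - a_shape.size());
  std::vector<int> in_idx;
  for (int i = offset; i < static_cast<int>(out_shape.size()); ++i) {
    const int a_dim = a_shape[i - offset];
    in_idx.push_back(a_dim == 1 ? 0 : out_idx[i]);  // size-1 dims broadcast
  }
  return in_idx;
}

int main() {
  // [4, 1] broadcast to [2, 4, 3]: leading output dim dropped, dim of
  // size 1 pinned to index 0.
  auto idx = BroadcastIndex({4, 1}, {2, 4, 3}, {1, 2, 2});
  assert((idx == std::vector<int>{2, 0}));
}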
"paddle/common/errors.h" PD_DECLARE_bool(cinn_bucket_compile); namespace cinn { @@ -376,16 +377,20 @@ Tensor BroadcastTo(const Tensor& A, const std::vector& out_shape, const std::string& out_name) { auto A_shape = A->shape; - CHECK_EQ(A_shape.size(), out_shape.size()) - << "broadcast_to's out_shape's size should be same with the input " - "shape's size"; + PADDLE_ENFORCE_GE( + out_shape.size(), + A_shape.size(), + ::common::errors::InvalidArgument( + "broadcast_to's out_shape's size should be GreaterEqual " + "with the input shape's size")); return Compute( ToCinnExprs(out_shape), [=](const std::vector& indice) { std::vector broadcast_indice; - for (int idx = 0; idx < out_shape.size(); ++idx) { - ir::Expr a_shape_i = A_shape[idx]; + int out_A_offset = out_shape.size() - A_shape.size(); + for (int idx = out_A_offset; idx < out_shape.size(); ++idx) { + ir::Expr a_shape_i = A_shape[idx - out_A_offset]; if (MathEqual(a_shape_i, ir::Expr(1))) { broadcast_indice.push_back(ir::Expr(0)); } else if (MathEqual(a_shape_i, out_shape[idx])) { From 01f01c397a0c33d92a4506c49cd63efd6cf4983c Mon Sep 17 00:00:00 2001 From: lizexu123 <39205361+lizexu123@users.noreply.github.com> Date: Mon, 11 Mar 2024 10:24:44 +0800 Subject: [PATCH 103/114] add inference api:exp_specify_tensorrt_subgraph_precision (#62402) add inference api:exp_specify_tensorrt_subgraph_precision (#62402) --- paddle/fluid/inference/analysis/argument.h | 9 ++ .../inference/analysis/ir_pass_manager.cc | 9 ++ .../ir_passes/tensorrt_subgraph_pass.cc | 40 ++++- paddle/fluid/inference/api/analysis_config.cc | 24 +++ .../fluid/inference/api/analysis_predictor.cc | 3 + .../inference/api/paddle_analysis_config.h | 22 +++ paddle/fluid/pybind/inference_api.cc | 2 + .../test_trt_ops_fp16_mix_precision.py | 144 ++++++++++++++++++ 8 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 test/ir/inference/test_trt_ops_fp16_mix_precision.py diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 8c4fbceced1ab..aeaa305191974 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -256,6 +256,15 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_disabled_ops, TensorRtDisabledOPs, std::vector); + DECL_ARGUMENT_FIELD(trt_parameter_run_fp16, + TRTParameterRunFp16, + std::vector); + DECL_ARGUMENT_FIELD(trt_parameter_run_int8, + TRTParameterRunInt8, + std::vector); + DECL_ARGUMENT_FIELD(trt_parameter_run_bfp16, + TRTParameterRunBfp16, + std::vector); DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, int); DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index cc126e5fea612..57fd4fb7c311a 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -173,6 +173,15 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set( "trt_exclude_var_names", new std::vector(argument->trt_exclude_var_names())); + pass->Set( + "trt_parameter_run_fp16", + new std::vector(argument->trt_parameter_run_fp16())); + pass->Set( + "trt_parameter_run_int8", + new std::vector(argument->trt_parameter_run_int8())); + pass->Set( + "trt_parameter_run_bfp16", + new std::vector(argument->trt_parameter_run_bfp16())); pass->Set("forbid_dynamic_op", new bool(argument->trt_forbid_dynamic_op())); diff --git 
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index d6441cc6d4a56..db185b15c03d9 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -14,7 +14,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
-
 #include 
 #include 
 #include 
@@ -476,9 +475,47 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
   }
 
   auto precision_mode =
       static_cast(Get("trt_precision_mode"));
+  auto trt_params_run_fp16 =
+      Get>("trt_parameter_run_fp16");
+  auto trt_params_run_int8 =
+      Get>("trt_parameter_run_int8");
+  auto trt_params_run_bfp16 =
+      Get>("trt_parameter_run_bfp16");
+
+  for (const auto &para : parameters) {
+    if (std::find(trt_params_run_fp16.begin(),
+                  trt_params_run_fp16.end(),
+                  para) != trt_params_run_fp16.end()) {
+      precision_mode = phi::DataType::FLOAT16;
+      break;
+    }
+  }
+
   bool enable_fp16 = false;
   if (precision_mode == phi::DataType::FLOAT16) enable_fp16 = true;
   auto enable_int8 = Get("enable_int8");
+
+  for (const auto &para : parameters) {
+    if (std::find(trt_params_run_int8.begin(),
+                  trt_params_run_int8.end(),
+                  para) != trt_params_run_int8.end()) {
+      enable_int8 = true;
+      precision_mode = phi::DataType::INT8;
+      break;
+    }
+  }
+
+  for (const auto &para : parameters) {
+    if (std::find(trt_params_run_bfp16.begin(),
+                  trt_params_run_bfp16.end(),
+                  para) != trt_params_run_bfp16.end()) {
+      precision_mode = phi::DataType::BFLOAT16;
+      break;
+    }
+  }
+  bool enable_bfp16 = false;
+  if (precision_mode == phi::DataType::BFLOAT16) enable_bfp16 = true;
+
   auto use_calib_mode = Get("use_calib_mode");
   auto &subgraph_nodes = *framework::ir::Agent(node).subgraph();
   auto min_input_shape =
@@ -724,6 +761,7 @@ std::string TensorRtSubgraphPass::CreateTensorRTOp(
   op_desc->SetAttr("calibration_data", calibration_data);
   op_desc->SetAttr("enable_int8", enable_int8);
   op_desc->SetAttr("enable_fp16", enable_fp16);
+  op_desc->SetAttr("enbale_bfp16", enable_bfp16);
   op_desc->SetAttr("use_calib_mode", use_calib_mode);
   op_desc->SetAttr("engine_key", engine_key);
   op_desc->SetAttr("calibration_engine_key", calibration_engine_key);
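The subgraph pass decides the whole engine's precision from the grouped parameters: fp16 names are checked first, then int8, then bfp16, with later hits overriding earlier ones. A minimal re-creation of that lookup order (std-only; the enum is a stand-in for phi::DataType):

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

enum class Precision { kFloat32, kFloat16, kInt8, kBFloat16 };

// Scan the subgraph's parameters against the user-provided name lists;
// each successive list can override the previous choice, mirroring the
// three loops in CreateTensorRTOp.
Precision ChoosePrecision(const std::vector<std::string>& params,
                          const std::vector<std::string>& fp16,
                          const std::vector<std::string>& int8,
                          const std::vector<std::string>& bf16) {
  auto hit = [&](const std::vector<std::string>& list) {
    return std::any_of(params.begin(), params.end(), [&](const std::string& p) {
      return std::find(list.begin(), list.end(), p) != list.end();
    });
  };
  Precision mode = Precision::kFloat32;
  if (hit(fp16)) mode = Precision::kFloat16;
  if (hit(int8)) mode = Precision::kInt8;      // int8 takes over if matched
  if (hit(bf16)) mode = Precision::kBFloat16;  // bf16 is checked last, wins last
  return mode;
}

int main() {
  assert(ChoosePrecision({"conv2d_1.w_0"}, {"conv2d_1.w_0"}, {}, {}) ==
         Precision::kFloat16);
}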
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 5ab33c65208a3..d97e41f0b1e13 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -462,6 +462,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(tensorrt_min_subgraph_size_);
   CP_MEMBER(tensorrt_precision_mode_);
   CP_MEMBER(trt_mark_output_);
+  CP_MEMBER(trt_parameters_run_fp16_);
+  CP_MEMBER(trt_parameters_run_int8_);
+  CP_MEMBER(trt_parameters_run_bfp16_);
   CP_MEMBER(trt_forbid_dynamic_op_)
   CP_MEMBER(trt_output_tensor_names_);
   CP_MEMBER(trt_disabled_ops_);
@@ -880,6 +883,21 @@ void AnalysisConfig::Exp_DisableTensorRtSubgraph(
                            var_name_not_trt.end());
 }
 
+void AnalysisConfig::Exp_SpecifyTensorRTSubgraphPrecision(
+    const std::vector &trt_parameters_run_fp16,
+    const std::vector &trt_parameters_run_int8,
+    const std::vector &trt_parameters_run_bfp16) {
+  trt_parameters_run_fp16_.insert(trt_parameters_run_fp16_.end(),
+                                  trt_parameters_run_fp16.begin(),
+                                  trt_parameters_run_fp16.end());
+  trt_parameters_run_int8_.insert(trt_parameters_run_int8_.end(),
+                                  trt_parameters_run_int8.begin(),
+                                  trt_parameters_run_int8.end());
+  trt_parameters_run_bfp16_.insert(trt_parameters_run_bfp16_.end(),
+                                   trt_parameters_run_bfp16.begin(),
+                                   trt_parameters_run_bfp16.end());
+}
+
 void AnalysisConfig::EnableVarseqlen() { trt_use_varseqlen_ = true; }
 
 void AnalysisConfig::SetTensorRtOptimizationLevel(int level) {
@@ -1135,6 +1153,12 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << tensorrt_max_batchsize_;
   ss << tensorrt_min_subgraph_size_;
   ss << trt_mark_output_;
+  for (auto &name : trt_parameters_run_fp16_) ss << name.c_str();
+  ss << ";";
+  for (auto &name : trt_parameters_run_int8_) ss << name.c_str();
+  ss << ";";
+  for (auto &name : trt_parameters_run_bfp16_) ss << name.c_str();
+  ss << ";";
   ss << trt_forbid_dynamic_op_;
 
   ss << use_dlnne_;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 961c0e350be38..8be9fa420318c 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1759,6 +1759,9 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
     argument_->SetTRTMarkOutput(config_.trt_mark_output_);
     argument_->SetTRTOutputTensorNames(config_.trt_output_tensor_names_);
+    argument_->SetTRTParameterRunFp16(config_.trt_parameters_run_fp16_);
+    argument_->SetTRTParameterRunInt8(config_.trt_parameters_run_int8_);
+    argument_->SetTRTParameterRunBfp16(config_.trt_parameters_run_bfp16_);
     argument_->SetTensorRtDisabledOPs(config_.trt_disabled_ops_);
     argument_->SetTRTExcludeVarNames(config_.trt_exclude_var_names_);
     argument_->SetTRTForbidDynamicOp(config_.trt_forbid_dynamic_op_);
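The new lists must also be folded into SerializeInfoCache: otherwise two configs differing only in these precision lists would wrongly share a serialized optimization cache. A sketch of the ';'-terminated name-list serialization pattern used above (illustrative, std-only):

#include <cassert>
#include <sstream>
#include <string>
#include <vector>

// Append each name list to the cache key, ';'-terminated so that even an
// empty list leaves a separator and keeps the key unambiguous.
std::string SerializeNameLists(
    const std::vector<std::vector<std::string>>& lists) {
  std::ostringstream ss;
  for (const auto& list : lists) {
    for (const auto& name : list) ss << name;
    ss << ";";
  }
  return ss.str();
}

int main() {
  assert(SerializeNameLists({{"w0", "w1"}, {}, {"w2"}}) == "w0w1;;w2;");
}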
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2c5b254ea1c14..251f390b9afda 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -810,9 +810,27 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   void Exp_DisableTensorRtOPs(const std::vector& ops);
 
+  ///
+  /// \brief Prevent TensorRtSubgraph running in Paddle-TRT
+  /// NOTE: just experimental, not an official stable API, easy to be broken.
+  ///
   void Exp_DisableTensorRtSubgraph(
       const std::vector& var_name_not_trt);
 
+  ///
+  /// \brief Specify TensorRT subgraph precision,fp16, int8 or bfp16(TensorRT
+  /// Version>=9.0) NOTE: just experimental, not an official stable API, easy to
+  /// be broken.
+  ///
+  void Exp_SpecifyTensorRTSubgraphPrecision(
+      const std::vector& trt_parameters_fp16,
+      const std::vector& trt_parameters_int8,
+      const std::vector& trt_parameters_bfp16);
+
+  ///
+  /// \brief Prevent DynamicShape OPs running in Paddle-TRT
+  /// NOTE: just experimental, not an official stable API, easy to be broken.
+  ///
   void Exp_DisableTensorRTDynamicShapeOPs(bool trt_forbid_dynamic_op);
 
   ///
@@ -1289,6 +1307,10 @@ struct PD_INFER_DECL AnalysisConfig {
   std::vector trt_output_tensor_names_{};
   std::vector trt_exclude_var_names_{};
 
+  std::vector trt_parameters_run_fp16_{};
+  std::vector trt_parameters_run_int8_{};
+  std::vector trt_parameters_run_bfp16_{};
+
   std::string tensorrt_transformer_posid_{""};
   std::string tensorrt_transformer_maskid_{""};
   bool trt_use_dla_{false};
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 69cb7303ea4e8..e5c3ffd15bb72 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -937,6 +937,8 @@ void BindAnalysisConfig(py::module *m) {
       .def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs)
       .def("exp_disable_tensorrt_subgraph",
            &AnalysisConfig::Exp_DisableTensorRtSubgraph)
+      .def("exp_specify_tensorrt_subgraph_precision",
+           &AnalysisConfig::Exp_SpecifyTensorRTSubgraphPrecision)
      .def("exp_disable_tensorrt_dynamic_shape_ops",
           &AnalysisConfig::Exp_DisableTensorRTDynamicShapeOPs)
      .def("enable_tensorrt_dla",
diff --git a/test/ir/inference/test_trt_ops_fp16_mix_precision.py b/test/ir/inference/test_trt_ops_fp16_mix_precision.py
new file mode 100644
index 0000000000000..f950f3bca8bf4
--- /dev/null
+++ b/test/ir/inference/test_trt_ops_fp16_mix_precision.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import nn, static
+from paddle.inference import Config, PrecisionType, create_predictor
+
+paddle.enable_static()
+
+
+class SimpleNet(nn.Layer):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = nn.Conv2D(
+            in_channels=4,
+            out_channels=4,
+            kernel_size=3,
+            stride=2,
+            padding=0,
+        )
+        self.relu1 = nn.ReLU()
+        self.conv2 = nn.Conv2D(
+            in_channels=4,
+            out_channels=2,
+            kernel_size=3,
+            stride=2,
+            padding=0,
+        )
+        self.relu2 = nn.ReLU()
+        self.conv3 = nn.Conv2D(
+            in_channels=2,
+            out_channels=1,
+            kernel_size=3,
+            stride=2,
+            padding=0,
+        )
+        self.relu3 = nn.ReLU()
+        self.flatten = nn.Flatten()
+        self.fc = nn.Linear(729, 10)
+        self.softmax = nn.Softmax()
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.relu1(x)
+        x = self.conv2(x)
+        x = self.relu2(x)
+        x = self.conv3(x)
+        x = self.relu3(x)
+        x = self.flatten(x)
+        x = self.fc(x)
+        x = self.softmax(x)
+        return x
+
+
+class TestTRTOptimizationLevel(unittest.TestCase):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0)
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.path = os.path.join(self.temp_dir.name, 'optimization_level', '')
+        self.model_prefix = self.path + 'infer_model'
+
+    def tearDown(self):
+        shutil.rmtree(self.path)
+
+    def build_model(self):
+        image = static.data(
+            name='img', shape=[None, 4, 224, 224], dtype='float32'
+        )
+        predict = SimpleNet()(image)
+        exe = paddle.static.Executor(self.place)
+        exe.run(paddle.static.default_startup_program())
+        paddle.static.save_inference_model(
+            self.model_prefix, [image], [predict], exe
+        )
+
+    def init_predictor(self):
+        config = Config(
+            self.model_prefix + '.pdmodel', self.model_prefix + '.pdiparams'
+        )
+        config.enable_use_gpu(256, 0, PrecisionType.Float32)
+        config.exp_disable_tensorrt_ops(["relu_1.tmp_0"])
+        config.enable_tensorrt_engine(
+            workspace_size=1 << 30,
+            max_batch_size=1,
+            min_subgraph_size=3,
+            precision_mode=PrecisionType.Float32,
+            use_static=False,
+            use_calib_mode=False,
+        )
+
+        config.exp_specify_tensorrt_subgraph_precision(
+            ["conv2d_1.w_0"], [""], ["conv2d_2.w_0"]
+        )
+
+        config.enable_memory_optim()
+        # config.disable_glog_info()
+        config.set_tensorrt_optimization_level(0)
+        self.assertEqual(config.tensorrt_optimization_level(), 0)
+        predictor = create_predictor(config)
+        return predictor
+
+    def infer(self, predictor, img):
+        input_names = predictor.get_input_names()
+        for i, name in enumerate(input_names):
+            input_tensor = predictor.get_input_handle(name)
+            input_tensor.reshape(img[i].shape)
+            input_tensor.copy_from_cpu(img[i].copy())
+
+        predictor.run()
+        results = []
+        output_names = predictor.get_output_names()
+        for i, name in enumerate(output_names):
+            output_tensor = predictor.get_output_handle(name)
+            output_data = output_tensor.copy_to_cpu()
+            results.append(output_data)
+        return results
+
+    def test_optimization_level(self):
+        self.build_model()
+        predictor = self.init_predictor()
+        img = np.ones((1, 4, 224, 224), dtype=np.float32)
+        results = self.infer(predictor, img=[img])
+
+
+if __name__ == '__main__':
+    unittest.main()
a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index 4d3f0222de40c..ee4f2d406b3a2 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -844,6 +844,25 @@ bool MatmulOpInferSymbolicShape( shape_analysis->SetShapeOrDataForValue(op->result(0), ShapeOrData{TensorExprs(out_dims)}); + if ((ndims_x == ndims_y) && ndims_x >= 2) { + if (transpose_x_attr == false && transpose_y_attr == false) { + shape_analysis->CreateDimExprBuilder().CstrEq(x_dims[ndims_x - 1], + y_dims[ndims_x - 2]); + } else if (transpose_x_attr == false && transpose_y_attr == true) { + shape_analysis->CreateDimExprBuilder().CstrEq(x_dims[ndims_x - 1], + y_dims[ndims_x - 1]); + } else if (transpose_x_attr == true && transpose_y_attr == false) { + shape_analysis->CreateDimExprBuilder().CstrEq(x_dims[ndims_x - 2], + y_dims[ndims_x - 2]); + } else { + shape_analysis->CreateDimExprBuilder().CstrEq(x_dims[ndims_x - 2], + y_dims[ndims_x - 1]); + } + + for (size_t i = 0; i < ndims_x - 2; ++i) { + shape_analysis->CreateDimExprBuilder().CstrEq(x_dims[i], y_dims[i]); + } + } return true; } From e819334426113cbdccec68c340379bd2718a23e1 Mon Sep 17 00:00:00 2001 From: Tianyu Feng <45195157+fty1777@users.noreply.github.com> Date: Mon, 11 Mar 2024 10:51:45 +0800 Subject: [PATCH 105/114] Symbolic shape inference support for pd_op.split and builtin.split (#62394) * WIP: builtin.split op infer sym shape * bug fix * Update paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> * Update paddle/fluid/pir/dialect/operator/ir/op_dialect.cc Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> * Update paddle/fluid/pir/dialect/operator/ir/op_dialect.cc Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> * pd_op.split followed by builtin.split * pd_op.split infer sym shape bugfix and unittest; fix op infer sym error outputs * recover SplitWithNumOpInferSymbolicShape Unimplemented exception raising * code refinement * Rewrite PADDLE_ENFORCE * remove incorrect comments * Rewrite PADDLE_ENFORCE * Rewrite PADDLE_ENFORCE --------- Co-authored-by: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> --- .../paddle_op_infer_sym.cc | 94 ++++++++++++++++++- .../pir/dialect/operator/ir/op_dialect.cc | 31 ++++++ paddle/phi/api/yaml/legacy_ops.yaml | 1 + .../cinn/symbolic/test_op_infer_sym_shape.py | 81 +++++++++++++++- .../symbolic/test_unary_op_infer_sym_shape.py | 2 +- 5 files changed, 202 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index ee4f2d406b3a2..0d9f6ce5a036c 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -958,8 +958,98 @@ bool ExpandAsOpInferSymbolicShape( bool SplitOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); + // input + const auto &x_shape_or_data = + 
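+  /* An illustrative example of what follows: for an input with symbolic
+     shape [S0, 6], sections = [2, -1] and axis = 1, the two outputs receive
+     shapes [S0, 2] and [S0, 4]; when no section is -1, a CstrEq constraint
+     additionally ties the sum of the sections to x_dims_sym[axis]. */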
shape_analysis->GetShapeOrDataForValue(op->operand_source(0));
+  PADDLE_ENFORCE_EQ(x_shape_or_data.data().has_value(),
+                    false,
+                    phi::errors::InvalidArgument(
+                        "InferSymbolicShape of SplitOp only supports input "
+                        "without value now."));
+  const auto &x_dims_sym = x_shape_or_data.shape();
+
+  // axis
+  CHECK(op->operand_source(2).defining_op()->isa<paddle::dialect::FullOp>());
+
+  int64_t axis = op->operand_source(2)
+                     .defining_op<paddle::dialect::FullOp>()
+                     .attributes()
+                     .at("value")
+                     .dyn_cast<paddle::dialect::ScalarAttribute>()
+                     .data()
+                     .to<int64_t>();
+
+  // sections
+  const std::vector<symbol::DimExpr> &sections_sym = [&] {
+    const auto &sections_shape_or_data =
+        shape_analysis->GetShapeOrDataForValue(op->operand_source(1));
+    std::vector<symbol::DimExpr> sections_sym;
+    if (sections_shape_or_data.data().has_value()) {
+      sections_sym = sections_shape_or_data.data().value();
+    } else {
+      sections_sym = sections_shape_or_data.shape();
+    }
+    return sections_sym;
+  }();
+
+  // output
+  const symbol::TensorListShapeOrDataDimExprs &output_shape_data_list = [&] {
+    const auto &GetSum = [&](const auto &dim_exprs, const auto &Filter) {
+      symbol::DimExpr sum{0};
+      for (const auto &dim_expr : dim_exprs) {
+        if (Filter(dim_expr)) {
+          sum = sum + dim_expr;
+        }
+      }
+      return sum;
+    };
+    const auto &All = [&](const auto &dim_exprs, const auto &Cond) {
+      for (const auto &dim_expr : dim_exprs) {
+        if (!Cond(dim_expr)) {
+          return false;
+        }
+      }
+      return true;
+    };
+    const auto &IsNotMinusOne = [&](const symbol::DimExpr &dim_expr) {
+      if (dim_expr.isa<std::int64_t>()) {
+        return dim_expr.dyn_cast<std::int64_t>() !=
+               static_cast<std::int64_t>(-1);
+      }
+      return true;
+    };
+    const auto &sum_exclude_minus_one = GetSum(sections_sym, IsNotMinusOne);
+
+    const bool &all_sections_sym_not_minus_one =
+        All(sections_sym, IsNotMinusOne);
+    if (all_sections_sym_not_minus_one) {
+      shape_analysis->CreateDimExprBuilder().CstrEq(x_dims_sym[axis],
+                                                    sum_exclude_minus_one);
+    }
+
+    symbol::TensorListShapeOrDataDimExprs shape_data_list;
+    std::vector<symbol::DimExpr> output_dims_sym = x_dims_sym;
+    if (!all_sections_sym_not_minus_one && sections_sym.size() == 1) {
+      VLOG(3) << "[SplitOp]-1 is the only split section. The output shape is "
+                 "identical to the input shape.";
+      shape_data_list.push_back(
+          symbol::TensorShapeOrDataDimExprs(output_dims_sym));
+      return shape_data_list;
+    }
+    for (uint32_t idx = 0; idx < sections_sym.size(); idx++) {
+      const auto &section_sym = sections_sym[idx];
+      output_dims_sym[axis] = IsNotMinusOne(section_sym)
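+          /* The lone -1 section receives the remainder: e.g. with sections
+             [3, -1, 2] on an axis of extent S1, the middle output gets
+             S1 - 5. */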
+                                  ? section_sym
+                                  : x_dims_sym[axis] - sum_exclude_minus_one;
+
+      shape_data_list.push_back(
+          symbol::TensorShapeOrDataDimExprs(output_dims_sym));
+    }
+    return shape_data_list;
+  }();
+
+  shape_analysis->SetShapeOrDataForValue(
+      op->result(0), symbol::ShapeOrDataDimExprs{output_shape_data_list});
+
+  return true;
 }

diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc
index 7262589c7ad3a..1364c1e1e0c77 100644
--- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc
@@ -159,6 +159,32 @@ struct ShadowOutputOpInferSymbolicShapeInterfaceModel
       : InferSymbolicShapeInterface::Concept(InferSymbolicShape) {}
 };

+struct SplitOpInferSymbolicShapeInterfaceModel
+    : public InferSymbolicShapeInterface::Concept {
+  static inline bool InferSymbolicShape(
+      pir::Operation* op, pir::ShapeConstraintIRAnalysis* shape_analysis) {
+    const auto& shape_data_list =
+        shape_analysis->GetShapeOrDataForValue(op->operand_source(0))
+            .dyn_cast<symbol::TensorListShapeOrDataDimExprs>();
+
+    for (uint32_t rst_idx = 0; rst_idx < op->num_results(); rst_idx++) {
+      PADDLE_ENFORCE_EQ(
+          shape_data_list[rst_idx].data().has_value(),
+          false,
+          paddle::platform::errors::InvalidArgument(
+              "Currently InferSymbolicShape of SplitOp only supports "
+              "input without value."));
+      shape_analysis->SetShapeOrDataForValue(
+          op->result(rst_idx),
+          symbol::ShapeOrDataDimExprs{shape_data_list[rst_idx]});
+    }
+    return true;
+  }
+
+  SplitOpInferSymbolicShapeInterfaceModel()
+      : InferSymbolicShapeInterface::Concept(InferSymbolicShape) {}
+};
+
 struct YieldOpInferSymbolicShapeInterfaceModel
     : public InferSymbolicShapeInterface::Concept {
   static inline bool InferSymbolicShape(
@@ -196,6 +222,11 @@ OperatorDialect::OperatorDialect(pir::IrContext* ctx)
                       InferSymbolicShapeInterface,
                       ShadowOutputOpInferSymbolicShapeInterfaceModel>()));

+  info = ctx->GetRegisteredOpInfo(pir::SplitOp::name());
+  info.AttachInterface(std::move(
+      pir::InterfaceValue::Get<InferSymbolicShapeInterface,
+                               SplitOpInferSymbolicShapeInterfaceModel>()));
+
   info = ctx->GetRegisteredOpInfo(pir::YieldOp::name());
   info.AttachInterface(std::move(
       pir::InterfaceValue::Get<InferSymbolicShapeInterface,
                                YieldOpInferSymbolicShapeInterfaceModel>()));

Date: Mon, 11 Mar 2024 11:10:00 +0800
Subject: [PATCH 106/114] [PIR] add paddle fatal mechanism. (#62571)

---
 paddle/common/enforce.cc | 11 ++++--
 paddle/common/enforce.h | 28 +++++++++++++++
 paddle/pir/include/core/op_info.h | 2 +-
 paddle/pir/include/core/value.h | 2 +-
 paddle/pir/src/core/block.cc | 5 ++-
 paddle/pir/src/core/block_argument.cc | 12 ++++++-
 paddle/pir/src/core/op_result_impl.cc | 27 +++++++++++----
 paddle/pir/src/core/op_result_impl.h | 9 ++---
 paddle/pir/src/core/operation.cc | 20 +++++++----
 paddle/pir/src/core/value_impl.cc | 11 +++---
 test/cpp/pir/core/CMakeLists.txt | 1 +
 test/cpp/pir/core/block_argument_test.cc | 19 +++++++++++
 test/cpp/pir/core/ir_value_test.cc | 27 ++++++++++++---
 test/cpp/pir/core/paddle_fatal_test.cc | 43 ++++++++++++++++++++++++
 14 files changed, 183 insertions(+), 34 deletions(-)
 create mode 100644 test/cpp/pir/core/paddle_fatal_test.cc

diff --git a/paddle/common/enforce.cc b/paddle/common/enforce.cc
index c2ef8308e8cd9..62df5e2f2dd7d 100644
--- a/paddle/common/enforce.cc
+++ b/paddle/common/enforce.cc
@@ -14,6 +14,7 @@ limitations under the License.
*/ #include "paddle/common/enforce.h" #include +#include #include #include #include @@ -48,13 +49,19 @@ std::string SimplifyDemangleStr(std::string str) { } return str; } + +std::atomic_bool paddle_fatal_skip{false}; + } // namespace namespace common { namespace enforce { -TEST_API int GetCallStackLevel() { return FLAGS_call_stack_level; } +void SkipPaddleFatal(bool skip) { paddle_fatal_skip.store(skip); } +bool IsPaddleFatalSkip() { return paddle_fatal_skip.load(); } + +int GetCallStackLevel() { return FLAGS_call_stack_level; } -TEST_API std::string SimplifyErrorTypeFormat(const std::string& str) { +std::string SimplifyErrorTypeFormat(const std::string& str) { std::ostringstream sout; size_t type_end_pos = str.find(':', 0); if (type_end_pos == std::string::npos) { diff --git a/paddle/common/enforce.h b/paddle/common/enforce.h index 856cf28d0221a..c02ec50aa0ba0 100644 --- a/paddle/common/enforce.h +++ b/paddle/common/enforce.h @@ -66,7 +66,24 @@ class CommonNotMetException : public std::exception { }; namespace enforce { + +TEST_API void SkipPaddleFatal(bool skip = true); +TEST_API bool IsPaddleFatalSkip(); + namespace details { + +class PaddleFatalGuard { + public: + PaddleFatalGuard() : skip_paddle_fatal_(IsPaddleFatalSkip()) { + if (!skip_paddle_fatal_) SkipPaddleFatal(true); + } + ~PaddleFatalGuard() { + if (!skip_paddle_fatal_) SkipPaddleFatal(false); + } + + private: + bool skip_paddle_fatal_; +}; template struct CanToString { private: @@ -204,6 +221,8 @@ struct EnforceNotMet : public std::exception { // Simple error message used when no C++ stack and python compile stack // e.g. (InvalidArgument) *** std::string simple_err_str_; + + details::PaddleFatalGuard paddle_fatal_guard_; }; /** HELPER MACROS AND FUNCTIONS **/ #ifndef PADDLE_MAY_THROW @@ -266,6 +285,14 @@ using CommonType2 = typename std::add_lvalue_reference< END_HANDLE_THE_ERROR \ } while (0) +#define PADDLE_FATAL(...) \ + if (!::common::enforce::IsPaddleFatalSkip()) { \ + auto info = ::common::enforce::EnforceNotMet( \ + paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \ + std::cerr << info.what() << std::endl; \ + std::abort(); \ + } + #define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) \ do { \ auto __val1 = (__VAL1); \ @@ -357,6 +384,7 @@ class IrNotMetException : public std::exception { private: std::string err_str_; + ::common::enforce::details::PaddleFatalGuard paddle_fatal_guard_; }; #define IR_THROW(...) 
\ diff --git a/paddle/pir/include/core/op_info.h b/paddle/pir/include/core/op_info.h index 124ed660db0f4..994aed189fc6f 100644 --- a/paddle/pir/include/core/op_info.h +++ b/paddle/pir/include/core/op_info.h @@ -32,7 +32,7 @@ typedef void (*VerifyPtr)(Operation *op); class IR_API OpInfo { public: - OpInfo() = default; + OpInfo(std::nullptr_t ptr = nullptr){}; // NOLINT OpInfo(const OpInfo &other) = default; diff --git a/paddle/pir/include/core/value.h b/paddle/pir/include/core/value.h index 0e1a2989e8f37..3a42cd539dfd2 100644 --- a/paddle/pir/include/core/value.h +++ b/paddle/pir/include/core/value.h @@ -32,7 +32,7 @@ class ValueImpl; /// class IR_API Value { public: - Value() = default; + Value(std::nullptr_t ptr = nullptr){}; // NOLINT Value(detail::ValueImpl *impl) : impl_(impl) {} // NOLINT diff --git a/paddle/pir/src/core/block.cc b/paddle/pir/src/core/block.cc index 39b347dfe81b4..1d9021a47b47b 100644 --- a/paddle/pir/src/core/block.cc +++ b/paddle/pir/src/core/block.cc @@ -24,7 +24,10 @@ namespace pir { Block::~Block() { if (!use_empty()) { - LOG(FATAL) << "Destroyed a block that is still in use."; + auto parent_op = GetParentOp(); + PADDLE_FATAL( + "Destroyed a block that is still in use.. The parent op is : %s", + parent_op ? parent_op->name() : std::string("nullptr")); } ClearOps(); ClearKwargs(); diff --git a/paddle/pir/src/core/block_argument.cc b/paddle/pir/src/core/block_argument.cc index 1966aa191476a..85ed7e2fa6b77 100644 --- a/paddle/pir/src/core/block_argument.cc +++ b/paddle/pir/src/core/block_argument.cc @@ -75,7 +75,17 @@ class BlockArgumentImpl : public ValueImpl { BlockArgumentImpl::~BlockArgumentImpl() { if (!use_empty()) { - LOG(FATAL) << "Destroyed a block argument that is still in use."; + if (is_kwarg_) { + PADDLE_FATAL( + "Destroyed a keyword block argument that is still in use. The key is " + ": %s", + keyword_); + } else { + PADDLE_FATAL( + "Destroyed a position block argument that is still in use. The index " + "is : %u", + index_); + } } } diff --git a/paddle/pir/src/core/op_result_impl.cc b/paddle/pir/src/core/op_result_impl.cc index dd895cc04d10d..242bd4836efb4 100644 --- a/paddle/pir/src/core/op_result_impl.cc +++ b/paddle/pir/src/core/op_result_impl.cc @@ -14,6 +14,7 @@ #include +#include "paddle/common/enforce.h" #include "paddle/pir/include/core/builtin_attribute.h" #include "paddle/pir/include/core/operation.h" #include "paddle/pir/src/core/op_result_impl.h" @@ -30,8 +31,9 @@ uint32_t OpResultImpl::index() const { OpResultImpl::~OpResultImpl() { if (!use_empty()) { - LOG(FATAL) << "Destroyed a op_result that is still in use. \n" - << "The owner op type is:" << owner()->name(); + PADDLE_FATAL( + "Destroyed a op_result that is still in use. The owner op type is : %s", + owner()->name()); } } @@ -73,11 +75,12 @@ Attribute OpResultImpl::attribute(const std::string &key) const { void OpResultImpl::set_attribute(const std::string &key, Attribute value) { auto owner = this->owner(); auto attr = owner->attribute(key); - if (attr && !attr.isa()) { - IR_THROW( - "The %s attribute has existed as operation attribute. Can't set it as " - "value attribute. "); - } + PADDLE_ENFORCE_EQ(attr && !attr.isa(), + false, + common::errors::PreconditionNotMet( + "The %s attribute has existed as operation attribute. " + "Can't set it as value attribute. 
", + key)); auto array_attr = attr.dyn_cast(); auto index = this->index(); std::vector vec; @@ -87,5 +90,15 @@ void OpResultImpl::set_attribute(const std::string &key, Attribute value) { owner->set_attribute(key, ArrayAttribute::get(owner->ir_context(), vec)); } +OpInlineResultImpl::OpInlineResultImpl(Type type, uint32_t result_index) + : OpResultImpl(type, result_index) { + PADDLE_ENFORCE_LE( + result_index, + MAX_INLINE_RESULT_IDX, + common::errors::PreconditionNotMet( + "Inline result index [%u] should not exceed MaxInlineResultIndex(5)", + result_index)); +} + } // namespace detail } // namespace pir diff --git a/paddle/pir/src/core/op_result_impl.h b/paddle/pir/src/core/op_result_impl.h index b50b2dd94a258..3671feef03fa9 100644 --- a/paddle/pir/src/core/op_result_impl.h +++ b/paddle/pir/src/core/op_result_impl.h @@ -42,7 +42,7 @@ class OpResultImpl : public ValueImpl { /// uint32_t index() const; - ~OpResultImpl(); + TEST_API ~OpResultImpl(); /// /// \brief attribute related public interfaces @@ -60,12 +60,7 @@ class OpResultImpl : public ValueImpl { /// class OpInlineResultImpl : public OpResultImpl { public: - OpInlineResultImpl(Type type, uint32_t result_index) - : OpResultImpl(type, result_index) { - if (result_index > MAX_INLINE_RESULT_IDX) { - throw("Inline result index should not exceed MaxInlineResultIndex(5)"); - } - } + TEST_API OpInlineResultImpl(Type type, uint32_t result_index); static bool classof(const ValueImpl &value) { return value.kind() < OUTLINE_RESULT_IDX; diff --git a/paddle/pir/src/core/operation.cc b/paddle/pir/src/core/operation.cc index 923316c765245..d4bf453bef162 100644 --- a/paddle/pir/src/core/operation.cc +++ b/paddle/pir/src/core/operation.cc @@ -372,9 +372,13 @@ void Operation::Verify() { } int32_t Operation::ComputeOpResultOffset(uint32_t index) const { - if (index >= num_results_) { - LOG(FATAL) << "index exceeds OP op result range."; - } + PADDLE_ENFORCE_LT( + index, + num_results_, + common::errors::PreconditionNotMet( + "The op result index [%u] must less than results size[%u].", + index, + num_results_)); if (index < OUTLINE_RESULT_IDX) { return -static_cast((index + 1u) * sizeof(OpInlineResultImpl)); } @@ -384,9 +388,13 @@ int32_t Operation::ComputeOpResultOffset(uint32_t index) const { } int32_t Operation::ComputeOpOperandOffset(uint32_t index) const { - if (index >= num_operands_) { - LOG(FATAL) << "index exceeds OP op operand range."; - } + PADDLE_ENFORCE_LT( + index, + num_operands_, + common::errors::PreconditionNotMet( + "The op operand index [%u] must less than operands size[%u].", + index, + num_operands_)); return static_cast(index * sizeof(OpOperandImpl) + sizeof(Operation)); } diff --git a/paddle/pir/src/core/value_impl.cc b/paddle/pir/src/core/value_impl.cc index 5b37e24e8240d..b5b41374497cc 100644 --- a/paddle/pir/src/core/value_impl.cc +++ b/paddle/pir/src/core/value_impl.cc @@ -14,6 +14,7 @@ #include +#include "paddle/common/enforce.h" #include "paddle/pir/src/core/value_impl.h" namespace { @@ -50,10 +51,12 @@ std::string ValueImpl::PrintUdChain() { return result.str(); } ValueImpl::ValueImpl(Type type, uint32_t kind) : id_(GenerateId()) { - if (kind > BLOCK_ARG_IDX) { - LOG(FATAL) << "The kind of value_impl(" << kind - << "), is bigger than BLOCK_ARG_IDX(7)"; - } + PADDLE_ENFORCE_LE( + kind, + BLOCK_ARG_IDX, + common::errors::PreconditionNotMet( + "The kind of value_impl[%u] must not bigger than BLOCK_ARG_IDX(7)", + kind)); type_ = type; first_use_offseted_by_kind_ = reinterpret_cast( reinterpret_cast(nullptr) + kind); diff 
--git a/test/cpp/pir/core/CMakeLists.txt b/test/cpp/pir/core/CMakeLists.txt index 8aeea39d6e6e2..0bb1c1b708ae0 100644 --- a/test/cpp/pir/core/CMakeLists.txt +++ b/test/cpp/pir/core/CMakeLists.txt @@ -8,6 +8,7 @@ paddle_test(ir_program_test SRCS ir_program_test.cc) paddle_test(ir_infershape_test SRCS ir_infershape_test.cc) paddle_test(scalar_attribute_test SRCS scalar_attribute_test.cc) paddle_test(ir_printer_test SRCS ir_printer_test.cc DEPS test_dialect) +paddle_test(paddle_fatal_test SRCS paddle_fatal_test.cc) file( DOWNLOAD https://paddle-ci.gz.bcebos.com/ir_translator_test/resnet50_main.prog diff --git a/test/cpp/pir/core/block_argument_test.cc b/test/cpp/pir/core/block_argument_test.cc index c9fb0ca9e8cc4..32f57e8f5fd1b 100644 --- a/test/cpp/pir/core/block_argument_test.cc +++ b/test/cpp/pir/core/block_argument_test.cc @@ -103,3 +103,22 @@ TEST(block_argument_test, kwargs) { EXPECT_EQ(block->kwargs_size(), 4u); EXPECT_EQ(value.type(), builder.bool_type()); } + +TEST(block_argument_test, fatal) { + auto block = new pir::Block(); + auto arg = block->AddArg(nullptr); + auto op = pir::Operation::Create({arg}, {}, {}, nullptr); + EXPECT_DEATH(delete block, + "Destroyed a position block argument that is still in use.*"); + auto kwarg = block->AddKwarg("a", nullptr); + arg.ReplaceAllUsesWith(kwarg); + block->ClearArgs(); + EXPECT_DEATH(delete block, + "Destroyed a keyword block argument that is still in use.*"); + + op->Destroy(); + op = pir::Operation::Create({}, {}, {}, nullptr, 0, {block}); + EXPECT_DEATH(delete block, "Destroyed a block that is still in use.*"); + op->Destroy(); + delete block; +} diff --git a/test/cpp/pir/core/ir_value_test.cc b/test/cpp/pir/core/ir_value_test.cc index d377d9c701fec..e8e1f3a26c851 100644 --- a/test/cpp/pir/core/ir_value_test.cc +++ b/test/cpp/pir/core/ir_value_test.cc @@ -21,6 +21,7 @@ #include "paddle/pir/include/core/ir_context.h" #include "paddle/pir/include/core/operation.h" #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" +#include "paddle/pir/src/core/op_result_impl.h" // This unittest is used to test the construction interfaces of value class and // operation. 
The constructed test scenario is: a = OP1(); b = OP2(); c = OP3(a,
@@ -50,7 +51,7 @@ TEST(value_test, value_test) {
       op1_inputs,
       test::CreateAttributeMap({"op1_name"}, {"op1_attr"}),
       op1_output_types,
-      pir::OpInfo());
+      nullptr);
   op1->Print(std::cout);
   pir::Value a = op1->result(0);
   EXPECT_TRUE(a.use_empty());
@@ -61,7 +62,7 @@ TEST(value_test, value_test) {
       op2_inputs,
       test::CreateAttributeMap({"op2_name"}, {"op2_attr"}),
       op2_output_types,
-      pir::OpInfo());
+      nullptr);
   op2->Print(std::cout);
   pir::Value b = op2->result(0);
   EXPECT_TRUE(b.use_empty());
@@ -72,7 +73,7 @@ TEST(value_test, value_test) {
       op3_inputs,
       test::CreateAttributeMap({"op3_name"}, {"op3_attr"}),
       op3_output_types,
-      pir::OpInfo());
+      nullptr);

   EXPECT_TRUE(op1->result(0).HasOneUse());
   EXPECT_TRUE(op2->result(0).HasOneUse());
@@ -88,7 +89,7 @@ TEST(value_test, value_test) {
       op4_inputs,
       test::CreateAttributeMap({"op4_name"}, {"op4_attr"}),
       op4_output_types,
-      pir::OpInfo());
+      nullptr);
   op4->Print(std::cout);

   // Test 1:
@@ -135,3 +136,21 @@ TEST(value_test, value_test) {
   VLOG(0) << op1->result(0).PrintUdChain() << std::endl;
   op1->Destroy();
 }
+
+TEST(op_result_test, exception) {
+  EXPECT_THROW(
+      pir::detail::OpInlineResultImpl(nullptr, MAX_INLINE_RESULT_IDX + 1),
+      common::enforce::EnforceNotMet);
+  pir::IrContext *ctx = pir::IrContext::Instance();
+  auto op = pir::Operation::Create(
+      {}, {{"test", pir::Int32Attribute::get(ctx, 1)}}, {nullptr}, nullptr);
+  auto result = op->result(0);
+  auto op2 = pir::Operation::Create({result}, {}, {}, nullptr);
+  EXPECT_DEATH(op->Destroy(), "Destroyed a op_result that is still in use.*");
+  EXPECT_THROW(result.set_attribute("test", nullptr),
+               common::enforce::EnforceNotMet);
+  EXPECT_THROW(op->result(1), common::enforce::EnforceNotMet);
+  EXPECT_THROW(op->operand(1), common::enforce::EnforceNotMet);
+  op2->Destroy();
+  op->Destroy();
+}
diff --git a/test/cpp/pir/core/paddle_fatal_test.cc b/test/cpp/pir/core/paddle_fatal_test.cc
new file mode 100644
index 0000000000000..f31981e18dc50
--- /dev/null
+++ b/test/cpp/pir/core/paddle_fatal_test.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include "paddle/common/enforce.h"
+#include "paddle/phi/core/enforce.h"
+
+class FatalClass {
+ public:
+  FatalClass() {}
+  ~FatalClass() { PADDLE_FATAL("fatal occurred in destructor!"); }
+};
+
+void throw_exception_in_func() {
+  FatalClass test_case;
+  PADDLE_THROW(::common::errors::External("throw exception in func"));
+}
+
+void terminate_in_func() { FatalClass test_case; }
+
+TEST(paddle_fatal_test, base) {
+  EXPECT_FALSE(::common::enforce::IsPaddleFatalSkip());
+  EXPECT_DEATH(terminate_in_func(), "fatal occurred in destructor!.*");
+  EXPECT_THROW(throw_exception_in_func(), common::enforce::EnforceNotMet);
+  EXPECT_FALSE(::common::enforce::IsPaddleFatalSkip());
+  ::common::enforce::SkipPaddleFatal(true);
+  // skip fatal.
+  terminate_in_func();
+  // unskip paddle fatal.
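+  /* Sketch of why throw_exception_in_func() above does not abort: the
+     PaddleFatalGuard member added to EnforceNotMet in this patch sets the
+     skip flag while the exception object is alive, so the PADDLE_FATAL fired
+     from ~FatalClass() during stack unwinding is ignored:
+
+       try {
+         throw_exception_in_func();  // ~FatalClass() runs with skip == true
+       } catch (const common::enforce::EnforceNotMet&) {
+       }  // guard restores skip == false when the exception is destroyed
+  */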
+ ::common::enforce::SkipPaddleFatal(false); +} From 0417a595d12fa037418f934cca9085581c0a65d7 Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 11 Mar 2024 11:22:37 +0800 Subject: [PATCH 107/114] Fix DEFIN_NOT definite_not (#62548) * Fix * Fix --- paddle/fluid/framework/op_compatible_info.cc | 62 ++++++++++--------- paddle/fluid/framework/op_compatible_info.h | 2 +- .../framework/op_compatible_info_test.cc | 6 +- 3 files changed, 38 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc index ba71043771ff2..4ac6080730d09 100644 --- a/paddle/fluid/framework/op_compatible_info.cc +++ b/paddle/fluid/framework/op_compatible_info.cc @@ -68,42 +68,48 @@ inline bool CompareVersion(const std::string& str_first, } void OpCompatibleMap::InitOpCompatibleMap() { - op_compatible_map_["sequence_pad"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["sequence_unpad"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; + op_compatible_map_["sequence_pad"] = {"1.6.0", + OpCompatibleType::definite_not}; + op_compatible_map_["sequence_unpad"] = {"1.6.0", + OpCompatibleType::definite_not}; op_compatible_map_["coalesce_tensor"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["crop_tensor"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; + OpCompatibleType::definite_not}; + op_compatible_map_["crop_tensor"] = {"1.6.0", OpCompatibleType::definite_not}; op_compatible_map_["deformable_conv"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; + OpCompatibleType::definite_not}; op_compatible_map_["deformable_conv_v1"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["dpsgd"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["eye"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["fill_any_like"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["hard_swish"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["gather_nd"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["instance_norm"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; + OpCompatibleType::definite_not}; + op_compatible_map_["dpsgd"] = {"1.6.0", OpCompatibleType::definite_not}; + op_compatible_map_["eye"] = {"1.6.0", OpCompatibleType::definite_not}; + op_compatible_map_["fill_any_like"] = {"1.6.0", + OpCompatibleType::definite_not}; + op_compatible_map_["hard_swish"] = {"1.6.0", OpCompatibleType::definite_not}; + op_compatible_map_["gather_nd"] = {"1.6.0", OpCompatibleType::definite_not}; + op_compatible_map_["instance_norm"] = {"1.6.0", + OpCompatibleType::definite_not}; op_compatible_map_["lookup_table_v2"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; + OpCompatibleType::definite_not}; op_compatible_map_["match_matrix_tensor"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; + OpCompatibleType::definite_not}; op_compatible_map_["multiclass_nms2"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["one_hot_v2"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; + OpCompatibleType::definite_not}; + op_compatible_map_["one_hot_v2"] = {"1.6.0", OpCompatibleType::definite_not}; op_compatible_map_["pull_box_sparse"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["scatter_nd_add"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["shard_index"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["size"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["strided_slice"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; + OpCompatibleType::definite_not}; 
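+  /* Reading these entries: {"1.6.0", definite_not} records that the op was
+     added in 1.6.0, so a program saved by an older release definitely
+     cannot run it; IsRequireMiniVersion("sequence_pad", "1.5.0") therefore
+     yields definite_not, while "1.6.0" and newer yield compatible (see the
+     updated unit test below). */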
+ op_compatible_map_["scatter_nd_add"] = {"1.6.0", + OpCompatibleType::definite_not}; + op_compatible_map_["shard_index"] = {"1.6.0", OpCompatibleType::definite_not}; + op_compatible_map_["size"] = {"1.6.0", OpCompatibleType::definite_not}; + op_compatible_map_["strided_slice"] = {"1.6.0", + OpCompatibleType::definite_not}; op_compatible_map_["trilinear_interp"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["unfold"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["unique"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; + OpCompatibleType::definite_not}; + op_compatible_map_["unfold"] = {"1.6.0", OpCompatibleType::definite_not}; + op_compatible_map_["unique"] = {"1.6.0", OpCompatibleType::definite_not}; op_compatible_map_["unique_with_counts"] = {"1.6.0", - OpCompatibleType::DEFIN_NOT}; - op_compatible_map_["var_conv_2d"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; + OpCompatibleType::definite_not}; + op_compatible_map_["var_conv_2d"] = {"1.6.0", OpCompatibleType::definite_not}; op_compatible_map_["reshape2"] = {"1.6.0", OpCompatibleType::possible}; op_compatible_map_["slice"] = {"1.6.0", OpCompatibleType::possible}; @@ -156,7 +162,7 @@ CompatibleInfo OpCompatibleMap::GetOpCompatibleInfo(std::string op_name) const { if (it != op_compatible_map_.end()) { return it->second; } else { - return {default_required_version_, OpCompatibleType::DEFIN_NOT}; + return {default_required_version_, OpCompatibleType::definite_not}; } } @@ -174,7 +180,7 @@ OpCompatibleType OpCompatibleMap::IsRequireMiniVersion( if (CompareVersion(str_current_version, default_required_version_)) { return OpCompatibleType::compatible; } else { - return OpCompatibleType::DEFIN_NOT; + return OpCompatibleType::definite_not; } } } diff --git a/paddle/fluid/framework/op_compatible_info.h b/paddle/fluid/framework/op_compatible_info.h index 6f86b8b64ed21..7256a92b5b457 100644 --- a/paddle/fluid/framework/op_compatible_info.h +++ b/paddle/fluid/framework/op_compatible_info.h @@ -28,7 +28,7 @@ class OpCompatibleMap; enum class OpCompatibleType { compatible = 0, // support previous version - DEFIN_NOT = 1, // definitely can't support previous version + definite_not = 1, // definitely can't support previous version possible = 2, // possible can support previous version, not sure bug_fix = 3, // bug fix, can't support previous version precision_change = 4 // precision change, may cause difference diff --git a/test/cpp/fluid/framework/op_compatible_info_test.cc b/test/cpp/fluid/framework/op_compatible_info_test.cc index a75b2c0ee9423..63bad5c25f73d 100644 --- a/test/cpp/fluid/framework/op_compatible_info_test.cc +++ b/test/cpp/fluid/framework/op_compatible_info_test.cc @@ -37,7 +37,7 @@ TEST(test_op_compatible_info, test_op_compatible) { std::string()); auto comp_1 = comp_map.IsRequireMiniVersion("sequence_pad", "1.5.0"); - ASSERT_EQ(comp_1, OpCompatibleType::DEFIN_NOT); + ASSERT_EQ(comp_1, OpCompatibleType::definite_not); auto comp_2 = comp_map.IsRequireMiniVersion("sequence_pad", "1.6.0"); ASSERT_EQ(comp_2, OpCompatibleType::compatible); auto comp_3 = comp_map.IsRequireMiniVersion("sequence_pad", "1.6.1"); @@ -45,14 +45,14 @@ TEST(test_op_compatible_info, test_op_compatible) { auto comp_6 = comp_map.IsRequireMiniVersion("sequence_pad", "1.7.0"); ASSERT_EQ(comp_6, OpCompatibleType::compatible); auto comp_7 = comp_map.IsRequireMiniVersion("sequence_pad", "0.7.0"); - ASSERT_EQ(comp_7, OpCompatibleType::DEFIN_NOT); + ASSERT_EQ(comp_7, OpCompatibleType::definite_not); auto comp_8 = 
comp_map.IsRequireMiniVersion("sequence_pad", "2.0.0"); ASSERT_EQ(comp_8, OpCompatibleType::compatible); ASSERT_EQ(comp_map.IsRequireMiniVersion("unkop", "2.0.0"), OpCompatibleType::compatible); ASSERT_EQ(comp_map.IsRequireMiniVersion("unkop", "0.7.0"), - OpCompatibleType::DEFIN_NOT); + OpCompatibleType::definite_not); ASSERT_EQ(comp_map.IsRequireMiniVersion("slice", "0.7.0"), OpCompatibleType::possible); From c00cd0cedb2d055f4b28f9662aefb9ef2a0ce874 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 11 Mar 2024 11:24:02 +0800 Subject: [PATCH 108/114] [PIR]Fix Bugs and adapt Custom op unittest (#62506) * fix custom op * fix compile bugs * fix inplace infershape bugs --- .../fluid/framework/custom_operator_utils.h | 191 ++++++++++++--- .../instruction/custom_kernel_instruction.cc | 1 - .../pir/dialect/operator/ir/op_dialect.cc | 148 ++++++++---- .../fluid/pir/dialect/operator/utils/utils.cc | 218 +++++++++--------- .../fluid/pybind/manual_static_op_function.h | 57 +++-- test/custom_op/test_custom_cast_op_jit.py | 15 +- test/custom_op/test_custom_concat.py | 14 +- test/custom_op/test_custom_conj.py | 10 +- test/custom_op/test_custom_inplace.py | 156 ++++++++++--- test/custom_op/test_custom_linear.py | 33 ++- test/custom_op/test_custom_optional.py | 128 +++++++--- test/custom_op/test_custom_tensor_operator.py | 48 ++-- test/custom_op/test_multi_out_jit.py | 34 ++- 13 files changed, 754 insertions(+), 299 deletions(-) diff --git a/paddle/fluid/framework/custom_operator_utils.h b/paddle/fluid/framework/custom_operator_utils.h index 31b0793c8fb6a..a9fed3ccca2eb 100644 --- a/paddle/fluid/framework/custom_operator_utils.h +++ b/paddle/fluid/framework/custom_operator_utils.h @@ -24,6 +24,9 @@ limitations under the License. */ namespace paddle { namespace framework { constexpr char kCustomDialectPrefix[] = "custom_op."; // NOLINT +constexpr char kGradSuffix[] = "_grad"; // NOLINT +constexpr char kDoubleGradSuffix[] = "_grad_grad"; // NOLINT + namespace detail { // dynamic lib load func @@ -93,10 +96,10 @@ inline static const OpMetaInfo* GetGradOpInfoByFwdPirName( } pos = custom_name.length(); - if (custom_name.find("_grad_grad") != custom_name.npos) { - pos = custom_name.find("_grad_grad"); - } else if (custom_name.find("_grad") != custom_name.npos) { - pos = custom_name.find("_grad"); + if (custom_name.find(kDoubleGradSuffix) != custom_name.npos) { + pos = custom_name.find(kDoubleGradSuffix); + } else if (custom_name.find(kGradSuffix) != custom_name.npos) { + pos = custom_name.find(kGradSuffix); } auto custom_name_prefix = custom_name.substr(0, pos); auto map_iter = @@ -106,10 +109,10 @@ inline static const OpMetaInfo* GetGradOpInfoByFwdPirName( } const auto& vec_op_meta = map_iter->second; const OpMetaInfo* ret = nullptr; - if (custom_name.find("_grad_grad") != custom_name.npos) { + if (custom_name.find(kDoubleGradSuffix) != custom_name.npos) { PADDLE_THROW("Custom op : " + custom_name_prefix + " doesn't support triple grad."); - } else if (custom_name.find("_grad") != custom_name.npos) { + } else if (custom_name.find(kGradSuffix) != custom_name.npos) { bool has_double_grad = vec_op_meta.size() >= 3; ret = has_double_grad ? 
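           /* Illustrative name resolution: for the forward pir name
              "custom_op.custom_relu_grad" the prefix is "custom_relu";
              vec_op_meta[0] holds the forward op, vec_op_meta[1] its grad op
              and vec_op_meta[2] the double-grad op, so the grad of a "_grad"
              op is vec_op_meta[2] when registered and nullptr otherwise: */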
&(vec_op_meta[2]) : nullptr; } else { @@ -130,10 +133,10 @@ inline static const OpMetaInfo& GetOpInfoByPirName( } pos = custom_name.length(); - if (custom_name.find("_grad_grad") != custom_name.npos) { - pos = custom_name.find("_grad_grad"); - } else if (custom_name.find("_grad") != custom_name.npos) { - pos = custom_name.find("_grad"); + if (custom_name.find(kDoubleGradSuffix) != custom_name.npos) { + pos = custom_name.find(kDoubleGradSuffix); + } else if (custom_name.find(kGradSuffix) != custom_name.npos) { + pos = custom_name.find(kGradSuffix); } auto custom_name_prefix = custom_name.substr(0, pos); auto map_iter = @@ -142,9 +145,9 @@ inline static const OpMetaInfo& GetOpInfoByPirName( PADDLE_THROW("The info of custom op : " + custom_name + " is not exists!"); } const auto& vec_op_meta = map_iter->second; - if (custom_name.find("_grad_grad") != custom_name.npos) { + if (custom_name.find(kDoubleGradSuffix) != custom_name.npos) { return vec_op_meta[2]; - } else if (custom_name.find("_grad") != custom_name.npos) { + } else if (custom_name.find(kGradSuffix) != custom_name.npos) { return vec_op_meta[1]; } else { return vec_op_meta[0]; @@ -161,10 +164,10 @@ inline static bool HasGradOp(const std::string& fwd_pir_op_name) { } pos = custom_name.length(); - if (custom_name.find("_grad_grad") != custom_name.npos) { - pos = custom_name.find("_grad_grad"); - } else if (custom_name.find("_grad") != custom_name.npos) { - pos = custom_name.find("_grad"); + if (custom_name.find(kDoubleGradSuffix) != custom_name.npos) { + pos = custom_name.find(kDoubleGradSuffix); + } else if (custom_name.find(kGradSuffix) != custom_name.npos) { + pos = custom_name.find(kGradSuffix); } auto custom_name_prefix = custom_name.substr(0, pos); auto map_iter = @@ -174,10 +177,10 @@ inline static bool HasGradOp(const std::string& fwd_pir_op_name) { " is not exists!"); } const auto& vec_op_meta = map_iter->second; - if (custom_name.find("_grad_grad") != custom_name.npos) { + if (custom_name.find(kDoubleGradSuffix) != custom_name.npos) { // custom op only support double grad, there will not have triple grad op return false; - } else if (custom_name.find("_grad") != custom_name.npos) { + } else if (custom_name.find(kGradSuffix) != custom_name.npos) { // vec_op_meta.size() == 3 means the op has double grad op return vec_op_meta.size() > 2UL; } else { @@ -247,7 +250,8 @@ static std::vector> RunDefaultInferShape( const std::vector>>& vec_input_shapes, const std::unordered_map& vec_input_name2id_map) { std::vector> output_shapes; - auto& inplace_map = OpMetaInfoHelper::GetInplaceMap(custom_op_meta); + auto& inplace_reverse_map = + OpMetaInfoHelper::GetInplaceReverseMap(custom_op_meta); // Op is grad op if (custom_op_meta.IsGradOp() || custom_op_meta.IsDoubleGradOp()) { bool is_double_grad = custom_op_meta.IsDoubleGradOp(); @@ -278,6 +282,10 @@ static std::vector> RunDefaultInferShape( bwd_input_name) != bwd_inputs_name.end()) { int input_index = input_name2id_map.at(bwd_input_name); auto input_shape = input_shapes[input_index]; + if (input_shape.size() == 0) { + // if optional tensor is None, we don't need to infer shape + continue; + } output_shapes.push_back(input_shape); } else { PADDLE_ENFORCE_EQ( @@ -299,7 +307,8 @@ static std::vector> RunDefaultInferShape( } // Op is forward op - if (inplace_map.empty()) { // general case, assure single input and output + if (inplace_reverse_map + .empty()) { // general case, assure single input and output VLOG(3) << "Custom Operator: Default InferShape - share ddim."; if (input_shapes.size() 
== 1) { output_shapes = input_shapes; @@ -311,15 +320,21 @@ static std::vector> RunDefaultInferShape( "and only one output without setting the InferShapeFn. ")); } } else { // inplace case - for (auto const& pair : inplace_map) { - if (paddle::framework::detail::IsDuplicableVar(pair.second)) { - int input_index = vec_input_name2id_map.at(pair.first); + const auto& outputs = paddle::OpMetaInfoHelper::GetOutputs(custom_op_meta); + for (auto& output : outputs) { + auto input_name = inplace_reverse_map.at(output); + if (paddle::framework::detail::IsDuplicableVar(output)) { + int input_index = vec_input_name2id_map.at(input_name); auto input_shape = vec_input_shapes[input_index]; output_shapes.insert( output_shapes.end(), input_shape.begin(), input_shape.end()); } else { - int input_index = input_name2id_map.at(pair.first); + int input_index = input_name2id_map.at(input_name); auto input_shape = input_shapes[input_index]; + if (input_shape.size() == 0) { + // if optional tensor is None, we don't need to infer shape + continue; + } output_shapes.push_back(input_shape); } } @@ -334,7 +349,8 @@ static std::vector RunDefaultInferDtype( const std::vector>& vec_input_dtypes, const std::unordered_map& vec_input_name2id_map) { std::vector output_dtypes; - auto& inplace_map = OpMetaInfoHelper::GetInplaceMap(custom_op_meta); + auto& inplace_reverse_map = + OpMetaInfoHelper::GetInplaceReverseMap(custom_op_meta); // Op is grad op if (custom_op_meta.IsGradOp() || custom_op_meta.IsDoubleGradOp()) { bool is_double_grad = custom_op_meta.IsDoubleGradOp(); @@ -357,6 +373,10 @@ static std::vector RunDefaultInferDtype( bwd_input_name) != bwd_inputs_name.end()) { int input_index = input_name2id_map.at(bwd_input_name); auto input_dtype = input_dtypes[input_index]; + if (input_dtype == DataType::UNDEFINED) { + // if optional tensor is None, we don't need to infer dtype + continue; + } output_dtypes.push_back(input_dtype); } else { // If there is no corresponding input for the output, set float as @@ -368,7 +388,8 @@ static std::vector RunDefaultInferDtype( return output_dtypes; } - if (inplace_map.empty()) { // general case, assure single input and output + if (inplace_reverse_map + .empty()) { // general case, assure single input and output VLOG(3) << "Custom Operator: Default InferDtype - share ddim."; if (input_dtypes.size() == 1) { output_dtypes = input_dtypes; @@ -380,15 +401,21 @@ static std::vector RunDefaultInferDtype( "and only one output without setting the InferDtypeFn. 
")); } } else { // inplace case - for (auto const& pair : inplace_map) { - if (paddle::framework::detail::IsDuplicableVar(pair.second)) { - int input_index = vec_input_name2id_map.at(pair.first); + const auto& outputs = paddle::OpMetaInfoHelper::GetOutputs(custom_op_meta); + for (auto& output : outputs) { + auto input_name = inplace_reverse_map.at(output); + if (paddle::framework::detail::IsDuplicableVar(output)) { + int input_index = vec_input_name2id_map.at(input_name); auto input_dtype = vec_input_dtypes[input_index]; output_dtypes.insert( output_dtypes.end(), input_dtype.begin(), input_dtype.end()); } else { - int input_index = input_name2id_map.at(pair.first); + int input_index = input_name2id_map.at(input_name); auto input_dtype = input_dtypes[input_index]; + if (input_dtype == DataType::UNDEFINED) { + // if optional tensor is None, we don't need to infer dtype + continue; + } output_dtypes.push_back(input_dtype); } } @@ -405,7 +432,57 @@ static std::vector> RunInferShape( const std::unordered_map& vec_input_name2id_map, const std::vector& custom_attrs) { if (infershape_func) { - return infershape_func(input_shapes, vec_input_shapes, custom_attrs); + std::vector> infershape_result = + infershape_func(input_shapes, vec_input_shapes, custom_attrs); + std::vector> complete_result; + const auto& outputs = paddle::OpMetaInfoHelper::GetOutputs(custom_op_meta); + const auto& inplace_reverse_map = + paddle::OpMetaInfoHelper::GetInplaceReverseMap(custom_op_meta); + + // The real output shape result is ( infershape func result + inplace output + // result), because the infershape doesn't create output shape that belongs + // to inplace output. + size_t infershape_result_index = 0; + for (auto& out_name : outputs) { + if (paddle::framework::detail::IsDuplicableVar(out_name)) { + PADDLE_ENFORCE( + inplace_reverse_map.find(out_name) != inplace_reverse_map.end(), + phi::errors::InvalidArgument( + "Custom operator only supports `paddle::Vec(...)` inputs and " + "cannot support `paddle::Vec(...)` output without setting " + "InplaceMap. If you have to use `paddle::Vec(...)` output, " + "please indicate it by setting InplaceMap manually.")); + auto in_name = inplace_reverse_map.at(out_name); + if (custom_op_meta.IsGradOp() || custom_op_meta.IsDoubleGradOp()) { + const auto& bwd_op_name = + paddle::OpMetaInfoHelper::GetOpName(custom_op_meta); + bool is_double_grad_op = + (bwd_op_name.find(kDoubleGradSuffix) != bwd_op_name.npos) ? 
true + : false; + in_name = + paddle::framework::detail::NoGrad(out_name, is_double_grad_op); + } + auto index = vec_input_name2id_map.at(in_name); + const auto& vec_input_shape = vec_input_shapes[index]; + complete_result.insert(complete_result.end(), + vec_input_shape.begin(), + vec_input_shape.end()); + } else { + if (inplace_reverse_map.find(out_name) != inplace_reverse_map.end()) { + auto in_name = inplace_reverse_map.at(out_name); + auto index = input_name2id_map.at(in_name); + if (input_shapes[index].size() == 0) { + // if optional tensor is None, we don't need to infer shape, + continue; + } + complete_result.push_back(input_shapes[index]); + } else { + complete_result.push_back(infershape_result[infershape_result_index]); + infershape_result_index++; + } + } + } + return complete_result; } else { return RunDefaultInferShape(custom_op_meta, input_shapes, @@ -424,7 +501,57 @@ static std::vector RunInferDtype( const std::unordered_map& vec_input_name2id_map, const std::vector& custom_attrs) { if (inferdtype_func) { - return inferdtype_func(input_dtypes, vec_input_dtypes, custom_attrs); + std::vector complete_result; + const auto& outputs = paddle::OpMetaInfoHelper::GetOutputs(custom_op_meta); + const auto& inplace_reverse_map = + paddle::OpMetaInfoHelper::GetInplaceReverseMap(custom_op_meta); + std::vector inferdtype_result = + inferdtype_func(input_dtypes, vec_input_dtypes, custom_attrs); + + // The real output dtype result is ( infershape func dtype + inplace output + // dtype), because the inferdtype doesn't create output dtype that belongs + // to inplace output. + size_t inferdtype_result_index = 0; + for (auto& out_name : outputs) { + if (paddle::framework::detail::IsDuplicableVar(out_name)) { + PADDLE_ENFORCE( + inplace_reverse_map.find(out_name) != inplace_reverse_map.end(), + phi::errors::InvalidArgument( + "Custom operator only supports `paddle::Vec(...)` inputs and " + "cannot support `paddle::Vec(...)` output without setting " + "InplaceMap. If you have to use `paddle::Vec(...)` output, " + "please indicate it by setting InplaceMap manually.")); + auto in_name = inplace_reverse_map.at(out_name); + if (custom_op_meta.IsGradOp() || custom_op_meta.IsDoubleGradOp()) { + const auto& bwd_op_name = + paddle::OpMetaInfoHelper::GetOpName(custom_op_meta); + bool is_double_grad_op = + (bwd_op_name.find(kDoubleGradSuffix) != bwd_op_name.npos) ? 
true + : false; + in_name = + paddle::framework::detail::NoGrad(out_name, is_double_grad_op); + } + auto index = vec_input_name2id_map.at(in_name); + const auto& vec_input_dtype = vec_input_dtypes[index]; + complete_result.insert(complete_result.end(), + vec_input_dtype.begin(), + vec_input_dtype.end()); + } else { + if (inplace_reverse_map.find(out_name) != inplace_reverse_map.end()) { + auto in_name = inplace_reverse_map.at(out_name); + auto index = input_name2id_map.at(in_name); + if (input_dtypes[index] == DataType::UNDEFINED) { + // if optional tensor is None, we don't need to infer dtype + continue; + } + complete_result.push_back(input_dtypes[index]); + } else { + complete_result.push_back(inferdtype_result[inferdtype_result_index]); + inferdtype_result_index++; + } + } + } + return complete_result; } else { return RunDefaultInferDtype(custom_op_meta, input_dtypes, diff --git a/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc index 683d1bd95dcb8..b8a2b676e8ed5 100644 --- a/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/custom_kernel_instruction.cc @@ -280,7 +280,6 @@ void CustomKernelInstruction::BuildCustomContext( out_name)); VLOG(3) << "Custom Operator: BuildContext - inplace optional outputs : " << out_name << " is None."; - cache_out_ptrs_.emplace_back(nullptr); custom_kernel_ctx_.EmplaceBackOutput(std::move(paddle::Tensor())); VLOG(8) << "ctx->EmplaceBackOutput : an optional output"; diff --git a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc index 1364c1e1e0c77..4a3da52f953c0 100644 --- a/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc @@ -466,8 +466,10 @@ struct CustomOpInfoInterfaceModel : public OpYamlInfoInterface::Concept { auto& grad_op_output_names = OpMetaInfoHelper::GetOutputs(*grad_op_meta_ptr); bool is_double_grad_op = - (grad_op_name.find("_grad_grad") != grad_op_name.npos) ? true - : false; + (grad_op_name.find(paddle::framework::kDoubleGradSuffix) != + grad_op_name.npos) + ? 
true + : false; for (auto& grad_op_output_name : grad_op_output_names) { auto fwd_input_name = paddle::framework::detail::NoGrad( grad_op_output_name, is_double_grad_op); @@ -549,7 +551,7 @@ struct CustomOpInfoInterfaceModel : public OpYamlInfoInterface::Concept { struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { static std::vector> CustomOpVjp( pir::Operation* op, - const std::vector>& inputs_, + const std::vector>& inputs, const std::vector>& outputs, const std::vector>& out_grads, const std::vector>& stop_gradients) { @@ -586,13 +588,13 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { auto infershape_func = OpMetaInfoHelper::GetInferShapeFn(bwd_op_meta_info); auto inferdtype_func = OpMetaInfoHelper::GetInferDtypeFn(bwd_op_meta_info); PADDLE_ENFORCE_EQ( - inputs_.size(), + inputs.size(), fwd_inputs_name.size(), paddle::platform::errors::InvalidArgument( "Custom op: %s inputs size should be %d, but now is %d.", pir_op_name, fwd_inputs_name.size(), - inputs_.size())); + inputs.size())); PADDLE_ENFORCE_EQ( outputs.size(), fwd_outputs_name.size(), @@ -610,9 +612,11 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { pir_op_name, fwd_outputs_name.size(), out_grads.size())); - bool is_double_grad_op = - (bwd_pir_op_name.find("_grad_grad") != pir_op_name.npos) ? true : false; + (bwd_pir_op_name.find(paddle::framework::kDoubleGradSuffix) != + bwd_pir_op_name.npos) + ? true + : false; pir::IrContext* ctx = pir::IrContext::Instance(); pir::OpInfo pir_info = ctx->GetRegisteredOpInfo(bwd_pir_op_name); pir::OperationArgument argument(pir_info); @@ -664,7 +668,6 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { grad_op_input_name)); } }; - // Construct custom grad op inputs int input_index = 0; int vec_input_index = 0; @@ -673,8 +676,8 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { const auto input_location = GetInputLocation(bwd_input_name); std::vector input_values; if (input_location.first == 0) { - // grad op input is in inputs_ - input_values = inputs_[input_location.second]; + // grad op input is in inputs + input_values = inputs[input_location.second]; } else if (input_location.first == 1) { // grad op input is in outputs input_values = outputs[input_location.second]; @@ -682,32 +685,43 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { // grad op input is in out_grads input_values = out_grads[input_location.second]; } - - if (input_values.size() > 1) { + if (paddle::framework::detail::IsDuplicableVar(bwd_input_name)) { std::vector> tmp_input_shapes; std::vector tmp_input_dtypes; + pir::Value input_value; vec_input_name2id_map[bwd_input_name] = vec_input_index; vec_input_index++; - for (auto& input_value : input_values) { - paddle::dialect::DenseTensorType input_tensor = - input_value.type().dyn_cast(); - tmp_input_shapes.push_back(phi::vectorize(input_tensor.dims())); - tmp_input_dtypes.push_back( - paddle::dialect::TransToPhiDataType(input_tensor.dtype())); + bool is_optional = + (input_values.size() == 1 && input_values[0].impl() == nullptr); + if (!is_optional) { + for (auto& input_value : input_values) { + paddle::dialect::DenseTensorType input_tensor = + input_value.type().dyn_cast(); + tmp_input_shapes.push_back(phi::vectorize(input_tensor.dims())); + tmp_input_dtypes.push_back( + paddle::dialect::TransToPhiDataType(input_tensor.dtype())); + } + input_value = paddle::dialect::builtin_combine(input_values); } vec_input_shapes.push_back(tmp_input_shapes); 
vec_input_dtypes.push_back(tmp_input_dtypes); - auto input_value = paddle::dialect::builtin_combine(input_values); argument_inputs.push_back(input_value); } else { + std::vector tmp_input_shape; + phi::DataType tmp_input_dtype = DataType::UNDEFINED; input_name2id_map[bwd_input_name] = input_index; input_index++; pir::Value input_value = input_values[0]; // NOLINT - paddle::dialect::DenseTensorType input_tensor = - input_value.type().dyn_cast(); - input_shapes.push_back(phi::vectorize(input_tensor.dims())); - input_dtypes.push_back( - paddle::dialect::TransToPhiDataType(input_tensor.dtype())); + if (input_value.impl() != nullptr) { + paddle::dialect::DenseTensorType input_tensor = + input_value.type().dyn_cast(); + tmp_input_shape = phi::vectorize(input_tensor.dims()); + tmp_input_dtype = + paddle::dialect::TransToPhiDataType(input_tensor.dtype()); + } + input_shapes.push_back(tmp_input_shape); + input_dtypes.push_back(tmp_input_dtype); + argument_inputs.push_back(input_value); } } @@ -722,7 +736,6 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { custom_attrs.push_back(paddle::dialect::TransAttrToAny(fwd_op_attr)); argument.AddAttribute(fwd_attr_name, fwd_op_attr); } - // Run Compile InferMeta std::vector> output_shapes = paddle::framework::RunInferShape(infershape_func, @@ -745,18 +758,23 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { std::unordered_map output_name2value_num; for (size_t i = 0; i < bwd_outputs_name.size(); ++i) { const auto& bwd_output_name = bwd_outputs_name.at(i); + const auto& bwd_input = + paddle::framework::detail::NoGrad(bwd_output_name, is_double_grad_op); + if (paddle::framework::detail::IsDuplicableVar(bwd_output_name)) { - const auto& bwd_input = paddle::framework::detail::NoGrad( - bwd_output_name, is_double_grad_op); auto index = vec_input_name2id_map[bwd_input]; - auto& input_shapes = vec_input_shapes[index]; - output_name2value_num[bwd_output_name] = input_shapes.size(); - all_values_num += input_shapes.size(); + auto& vec_input_shape = vec_input_shapes[index]; + output_name2value_num[bwd_output_name] = vec_input_shape.size(); } else { - output_name2value_num[bwd_output_name] = 1; - all_values_num++; + auto index = input_name2id_map[bwd_input]; + // input_shapes[index] is dim of tensor, if the dim doesn't have + // element, it must be a optional tensor that is None in custom operator + output_name2value_num[bwd_output_name] = + input_shapes[index].size() == 0 ? 
0 : 1; } + all_values_num += output_name2value_num[bwd_output_name]; } + PADDLE_ENFORCE_EQ( output_shapes.size(), all_values_num, @@ -778,13 +796,18 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { "Tensors' dtype", all_values_num, output_dtypes.size())); - // Construct custom grad op outputs size_t value_index = 0; for (size_t i = 0; i < bwd_outputs_name.size(); ++i) { const auto& bwd_output_name = bwd_outputs_name.at(i); + auto value_num = output_name2value_num[bwd_output_name]; + if (value_num == 0) { + // Optional value condition + pir::Type out_type; + argument_outputs.push_back(out_type); + continue; + } if (paddle::framework::detail::IsDuplicableVar(bwd_output_name)) { - auto value_num = output_name2value_num[bwd_output_name]; std::vector out_types; for (size_t j = 0; j < value_num; ++j) { auto ddims = phi::make_ddim(output_shapes[value_index]); @@ -820,6 +843,7 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { } } argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); + // Build Operation std::vector op_results; pir::Operation* bwd_op = @@ -832,6 +856,42 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { for (size_t i = 0; i < stop_gradients.size(); ++i) { res[i].resize(stop_gradients[i].size()); } + + auto GetInputGradientIndex = [&](const std::string& bwd_output_name, + bool is_double_grad_op) -> size_t { + /* + This function is used to get the index of input that need calculate + gradient in forward op. For example: forward inputs : TensorA, TensorB, + TensorC, TensorD backward outputs: TensorC@Grad, TensorA@Grad So, we + only need to calculate gradient of TensorA and TensorC and store them in + res; In this example, the res size is 2, and the first element of res + should store TensorA@Grad, and the second element of res should store + TensorC@Grad. + + So, This function will return 1 if we pass TensorC@Grad and return 0 if + we pass TensorA@Grad. 
+ */ + size_t gradient_vec_index = 0; + const auto& fwd_input = + paddle::framework::detail::NoGrad(bwd_output_name, is_double_grad_op); + auto fwd_inputs_name_iter = + std::find(fwd_inputs_name.begin(), fwd_inputs_name.end(), fwd_input); + size_t input_index = + std::distance(fwd_inputs_name.begin(), fwd_inputs_name_iter); + for (size_t i = 0; i < input_index; ++i) { + for (size_t j = 0; j < bwd_outputs_name.size(); j++) { + const auto& fwd_input_name_tmp = paddle::framework::detail::NoGrad( + bwd_outputs_name[j], is_double_grad_op); + if (fwd_input_name_tmp == fwd_inputs_name[i]) { + // find forward input that need calculate gradient + gradient_vec_index++; + break; + } + } + } + return gradient_vec_index; + }; + // Build result and apply stop gradients for (size_t i = 0; i < bwd_outputs_name.size(); ++i) { const auto& bwd_output_name = bwd_outputs_name.at(i); @@ -848,16 +908,20 @@ struct CustomOpVjpInterfaceModel : public VjpInterface::Concept { "forward input that need calculate gradients.", pir_op_name, bwd_output_name)); - int index = - std::distance(fwd_inputs_name.begin(), fwd_inputs_name_iter); - auto split_op = - ApiBuilder::Instance().GetBuilder()->Build( - bwd_op->result(i)); - res[index] = split_op.outputs(); + int index = GetInputGradientIndex(bwd_output_name, is_double_grad_op); + if (bwd_op->result(i).type().dyn_cast()) { + auto split_op = + ApiBuilder::Instance().GetBuilder()->Build( + bwd_op->result(i)); + res[index] = split_op.outputs(); + } else { + // optional output condition + pir::Value empty_value; + res[index][0] = empty_value; + } } else { if (fwd_inputs_name_iter != fwd_inputs_name.end()) { - int index = - std::distance(fwd_inputs_name.begin(), fwd_inputs_name_iter); + int index = GetInputGradientIndex(bwd_output_name, is_double_grad_op); res[index][0] = bwd_op->result(i); } else { // Situation that has only one input and only one output. 
If not meet diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc index f7bdfabcbf75b..32020dc874cf3 100644 --- a/paddle/fluid/pir/dialect/operator/utils/utils.cc +++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc @@ -147,123 +147,124 @@ static inline AttrType GetAttributeType(const pir::Attribute& attr) { } } -static std::unordered_map< - AttrType, - std::function> - kAttrCastMap = { - {AttrType::BOOL, - [](const pir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; - }}, - {AttrType::FLOAT, - [](const pir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; - }}, - {AttrType::DOUBLE, - [](const pir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; - }}, - {AttrType::INT32, - [](const pir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; - }}, - {AttrType::INT64, - [](const pir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; - }}, - {AttrType::INT_ARRAY, - [](const pir::Attribute& attr) { - return VariantType{ - attr.dyn_cast() - .data() - .GetData()}; - }}, - {AttrType::STRING, - [](const pir::Attribute& attr) { - return VariantType{attr.dyn_cast().AsString()}; - }}, - {AttrType::DATA_TYPE, - [](const pir::Attribute& attr) { - return VariantType{ - attr.dyn_cast().data()}; - }}, - {AttrType::PLACE, - [](const pir::Attribute& attr) { - return VariantType{ - attr.dyn_cast().data()}; - }}, - {AttrType::ARRAY, - [](const pir::Attribute& attr) { - auto attr_vec = attr.dyn_cast().AsVector(); - if (attr_vec.empty()) { - return VariantType{std::vector()}; - } - AttrType element_type = GetAttributeType(attr_vec[0]); - - if (element_type == AttrType::BOOL) { - std::vector vec_bools; - vec_bools.reserve(attr_vec.size()); - for (auto vec_element : attr_vec) { - vec_bools.push_back( - vec_element.dyn_cast().data()); +template +static std::function GetAttrCast( + AttrType attr_type) { + std::unordered_map> + kAttrCastMap = { + {AttrType::BOOL, + [](const pir::Attribute& attr) { + return T{attr.dyn_cast().data()}; + }}, + {AttrType::FLOAT, + [](const pir::Attribute& attr) { + return T{attr.dyn_cast().data()}; + }}, + {AttrType::DOUBLE, + [](const pir::Attribute& attr) { + return T{attr.dyn_cast().data()}; + }}, + {AttrType::INT32, + [](const pir::Attribute& attr) { + return T{attr.dyn_cast().data()}; + }}, + {AttrType::INT64, + [](const pir::Attribute& attr) { + return T{attr.dyn_cast().data()}; + }}, + {AttrType::INT_ARRAY, + [](const pir::Attribute& attr) { + return T{attr.dyn_cast() + .data() + .GetData()}; + }}, + {AttrType::STRING, + [](const pir::Attribute& attr) { + return T{attr.dyn_cast().AsString()}; + }}, + {AttrType::DATA_TYPE, + [](const pir::Attribute& attr) { + return T{ + attr.dyn_cast().data()}; + }}, + {AttrType::PLACE, + [](const pir::Attribute& attr) { + return T{attr.dyn_cast().data()}; + }}, + {AttrType::ARRAY, + [](const pir::Attribute& attr) { + auto attr_vec = attr.dyn_cast().AsVector(); + if (attr_vec.empty()) { + return T{std::vector()}; } - return VariantType{vec_bools}; - } else if (element_type == AttrType::INT32) { - std::vector vec_int32; - vec_int32.reserve(attr_vec.size()); - for (auto vec_element : attr_vec) { - vec_int32.push_back( - vec_element.dyn_cast().data()); + AttrType element_type = GetAttributeType(attr_vec[0]); + + if (element_type == AttrType::BOOL) { + std::vector vec_bools; + vec_bools.reserve(attr_vec.size()); + for (auto vec_element : attr_vec) { + vec_bools.push_back( + vec_element.dyn_cast().data()); + } + 
return T{vec_bools}; + } else if (element_type == AttrType::INT32) { + std::vector vec_int32; + vec_int32.reserve(attr_vec.size()); + for (auto vec_element : attr_vec) { + vec_int32.push_back( + vec_element.dyn_cast().data()); + } + return T{vec_int32}; + } else if (element_type == AttrType::INT64) { + std::vector vec_int64; + vec_int64.reserve(attr_vec.size()); + for (auto vec_element : attr_vec) { + vec_int64.push_back( + vec_element.dyn_cast().data()); + } + return T{vec_int64}; + } else if (element_type == AttrType::FLOAT) { + std::vector vec_float; + vec_float.reserve(attr_vec.size()); + for (auto vec_element : attr_vec) { + vec_float.push_back( + vec_element.dyn_cast().data()); + } + return T{vec_float}; + } else if (element_type == AttrType::DOUBLE) { + std::vector vec_double; + vec_double.reserve(attr_vec.size()); + for (auto vec_element : attr_vec) { + vec_double.push_back( + vec_element.dyn_cast().data()); + } + return T{vec_double}; + } else if (element_type == AttrType::STRING) { + std::vector vec_string; + vec_string.reserve(attr_vec.size()); + for (auto vec_element : attr_vec) { + vec_string.push_back( + vec_element.dyn_cast().AsString()); + } + return T{vec_string}; + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported ir Attribute type when casting it into " + "vector.")); } - return VariantType{vec_int32}; - } else if (element_type == AttrType::INT64) { - std::vector vec_int64; - vec_int64.reserve(attr_vec.size()); - for (auto vec_element : attr_vec) { - vec_int64.push_back( - vec_element.dyn_cast().data()); - } - return VariantType{vec_int64}; - } else if (element_type == AttrType::FLOAT) { - std::vector vec_float; - vec_float.reserve(attr_vec.size()); - for (auto vec_element : attr_vec) { - vec_float.push_back( - vec_element.dyn_cast().data()); - } - return VariantType{vec_float}; - } else if (element_type == AttrType::DOUBLE) { - std::vector vec_double; - vec_double.reserve(attr_vec.size()); - for (auto vec_element : attr_vec) { - vec_double.push_back( - vec_element.dyn_cast().data()); - } - return VariantType{vec_double}; - } else if (element_type == AttrType::STRING) { - std::vector vec_string; - vec_string.reserve(attr_vec.size()); - for (auto vec_element : attr_vec) { - vec_string.push_back( - vec_element.dyn_cast().AsString()); - } - return VariantType{vec_string}; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported ir Attribute type when casting it into " - "vector.")); - } - }}, -}; + }}, + }; + return kAttrCastMap[attr_type]; +} VariantType GetAttributeData(const pir::Attribute& attr) { AttrType attr_type = GetAttributeType(attr); - return kAttrCastMap[attr_type](attr); + return GetAttrCast(attr_type)(attr); } paddle::any TransAttrToAny(const pir::Attribute& attr) { AttrType attr_type = GetAttributeType(attr); - return kAttrCastMap[attr_type](attr); + return GetAttrCast(attr_type)(attr); } bool IsLegacyOp(const std::string& name) { return LegacyOpList.count(name); } @@ -481,6 +482,5 @@ std::vector ParseValueShape(const pir::Value& shape, } return vec_shape; } - } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h index ccb527aeecdcb..5980e061b5fb9 100644 --- a/paddle/fluid/pybind/manual_static_op_function.h +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -536,13 +536,17 @@ static PyObject *static_api_run_custom_op(PyObject *self, VLOG(7) << "Add un-initialized tensor " "because the optional input is None"; if 
(paddle::framework::detail::IsDuplicableVar(input)) {
-        vec_input_shapes.emplace_back();
-        vec_input_dtypes.emplace_back();
+        std::vector> vec_input_shape;
+        std::vector vec_input_dtype;
+        vec_input_shapes.emplace_back(vec_input_shape);
+        vec_input_dtypes.emplace_back(vec_input_dtype);
         vec_input_name2id_map[inputs[i]] = vec_input_index;
         vec_input_index++;
       } else {
-        input_shapes.emplace_back();
-        input_dtypes.emplace_back();
+        std::vector input_shape;
+        DataType input_dtype = DataType::UNDEFINED;
+        input_shapes.emplace_back(input_shape);
+        input_dtypes.emplace_back(input_dtype);
         input_name2id_map[inputs[i]] = input_index;
         input_index++;
       }
@@ -565,8 +569,10 @@ static PyObject *static_api_run_custom_op(PyObject *self,
       }
       vec_input_shapes.push_back(tmp_input_shapes);
       vec_input_dtypes.push_back(tmp_input_dtypes);
-      auto input_value = paddle::dialect::stack(input_values, /*axis*/ 0);
-      argument_inputs.push_back(input_value);
+      auto combine_op = paddle::dialect::ApiBuilder::Instance()
+                            .GetBuilder()
+                            ->Build(input_values);
+      argument_inputs.push_back(combine_op.out());
     } else {
       input_name2id_map[inputs[i]] = input_index;
       input_index++;
@@ -717,13 +723,20 @@ static PyObject *static_api_run_custom_op(PyObject *self,
             "`SetInplaceMap` in your output when registry custom operator."));
       const auto &input = inplace_reverse_map.at(output);
       auto index = vec_input_name2id_map[input];
-      auto &input_shapes = vec_input_shapes[index];
-      output_name2value_num[output] = input_shapes.size();
-      all_values_num += input_shapes.size();
+      auto &vec_input_shape = vec_input_shapes[index];
+      output_name2value_num[output] = vec_input_shape.size();
     } else {
-      output_name2value_num[output] = 1;
-      all_values_num++;
+      if (inplace_reverse_map.find(output) != inplace_reverse_map.end()) {
+        const auto &input = inplace_reverse_map.at(output);
+        auto index = input_name2id_map[input];
+        // input_shapes[index] holds the tensor's dims; if it has no
+        // elements, the input must be an optional tensor that was passed
+        // as None to the custom operator
+        output_name2value_num[output] = input_shapes[index].size() == 0 ?
0 : 1; + } else { + output_name2value_num[output]++; + } } + all_values_num += output_name2value_num[output]; } PADDLE_ENFORCE_EQ( @@ -751,8 +764,14 @@ static PyObject *static_api_run_custom_op(PyObject *self, size_t value_index = 0; for (size_t i = 0; i < outputs.size(); ++i) { const auto &output = outputs.at(i); + auto value_num = output_name2value_num[output]; + if (value_num == 0) { + // Optional value condition + pir::Type out_type; + argument_outputs.push_back(out_type); + continue; + } if (paddle::framework::detail::IsDuplicableVar(output)) { - auto value_num = output_name2value_num[output]; std::vector out_types; for (size_t j = 0; j < value_num; ++j) { auto ddims = phi::make_ddim(output_shapes[value_index]); @@ -799,12 +818,14 @@ static PyObject *static_api_run_custom_op(PyObject *self, for (size_t i = 0; i < outputs.size(); ++i) { const auto &output = outputs.at(i); if (paddle::framework::detail::IsDuplicableVar(output)) { - auto split_op = paddle::dialect::ApiBuilder::Instance() - .GetBuilder() - ->Build(op->result(i)); - auto split_outputs = split_op.outputs(); - op_results.insert( - op_results.end(), split_outputs.begin(), split_outputs.end()); + if (op->result(i).type().dyn_cast()) { + auto split_op = paddle::dialect::ApiBuilder::Instance() + .GetBuilder() + ->Build(op->result(i)); + auto split_outputs = split_op.outputs(); + op_results.insert( + op_results.end(), split_outputs.begin(), split_outputs.end()); + } } else { op_results.push_back(op->result(i)); } diff --git a/test/custom_op/test_custom_cast_op_jit.py b/test/custom_op/test_custom_cast_op_jit.py index 8e8fe12203044..25da81129deff 100644 --- a/test/custom_op/test_custom_cast_op_jit.py +++ b/test/custom_op/test_custom_cast_op_jit.py @@ -25,6 +25,7 @@ import paddle from paddle import static +from paddle.pir_utils import test_with_pir_api from paddle.utils.cpp_extension import get_build_directory, load from paddle.utils.cpp_extension.extension_utils import run_cmd @@ -71,14 +72,23 @@ def custom_cast_static(device, dtype, np_x): x.stop_gradient = False out = custom_module.custom_cast(x, dtype) static.append_backward(out) - + if paddle.framework.in_pir_mode(): + fetch_list = [ + out, + static.default_main_program() + .global_block() + .ops[-1] + .result(0), + ] + else: + fetch_list = [out, x.name + "@GRAD"] exe = static.Executor() exe.run(static.default_startup_program()) # in static graph mode, x data has been covered by out out_v, x_grad_v = exe.run( static.default_main_program(), feed={'X': np_x}, - fetch_list=[out.name, x.name + "@GRAD"], + fetch_list=fetch_list, ) assert x_grad_v[0].dtype == dtype @@ -92,6 +102,7 @@ class TestCustomCastOp(unittest.TestCase): def setUp(self): self.dtypes = ['float32', 'float64'] + @test_with_pir_api def test_static(self): for dtype in self.dtypes: x = np.random.uniform(-1, 1, [4, 8]).astype("float32") diff --git a/test/custom_op/test_custom_concat.py b/test/custom_op/test_custom_concat.py index 153ca92a46def..ea6496647972e 100644 --- a/test/custom_op/test_custom_concat.py +++ b/test/custom_op/test_custom_concat.py @@ -20,6 +20,7 @@ import paddle from paddle import static +from paddle.pir_utils import test_with_pir_api from paddle.utils.cpp_extension import get_build_directory, load from paddle.utils.cpp_extension.extension_utils import run_cmd @@ -94,10 +95,19 @@ def concat_static(func, dtype, np_inputs, axis_v, with_attr=False): "x2": np_inputs[1].astype(dtype), "axis": axis, } + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + fetch_list 
= [ + out, + ops[-1].result(0), # x1_grad + ops[-1].result(1), + ] # x2_grad + else: + fetch_list = [out.name, x1.name + "@GRAD", x2.name + "@GRAD"] out_v, x1_grad_v, x2_grad_v = exe.run( static.default_main_program(), feed=feed_dict, - fetch_list=[out.name, x1.name + "@GRAD", x2.name + "@GRAD"], + fetch_list=fetch_list, ) paddle.disable_static() return out_v, x1_grad_v, x2_grad_v @@ -133,6 +143,7 @@ def test_dynamic(self): for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): self.check_output(x_grad, pd_x_grad, "x_grad") + @test_with_pir_api def test_static(self): for dtype in self.dtypes: for axis in self.axises: @@ -165,6 +176,7 @@ def test_dynamic_with_attr(self): for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): self.check_output(x_grad, pd_x_grad, "x_grad") + @test_with_pir_api def test_static_with_attr(self): for dtype in self.dtypes: for axis in self.axises: diff --git a/test/custom_op/test_custom_conj.py b/test/custom_op/test_custom_conj.py index 846fafe4092c6..73760421c8018 100644 --- a/test/custom_op/test_custom_conj.py +++ b/test/custom_op/test_custom_conj.py @@ -20,6 +20,7 @@ import paddle from paddle import static +from paddle.pir_utils import test_with_pir_api from paddle.utils.cpp_extension import get_build_directory, load from paddle.utils.cpp_extension.extension_utils import run_cmd @@ -83,10 +84,16 @@ def conj_static(func, shape, dtype, np_input): exe = static.Executor() exe.run(static.default_startup_program()) + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + fetch_list = [out, ops[-1].result(0)] + else: + fetch_list = [out.name, x.name + "@GRAD"] + out_v, x_grad_v = exe.run( static.default_main_program(), feed={"x": np_input}, - fetch_list=[out.name, x.name + "@GRAD"], + fetch_list=fetch_list, ) paddle.disable_static() return out_v, x_grad_v @@ -106,6 +113,7 @@ def test_dynamic(self): check_output(out, pd_out, "out") check_output(x_grad, pd_x_grad, "x's grad") + @test_with_pir_api def test_static(self): for dtype in self.dtypes: np_input = np.random.random(self.shape).astype(dtype) diff --git a/test/custom_op/test_custom_inplace.py b/test/custom_op/test_custom_inplace.py index f5eed712cdcf9..105bbf65ae29d 100644 --- a/test/custom_op/test_custom_inplace.py +++ b/test/custom_op/test_custom_inplace.py @@ -26,6 +26,7 @@ import paddle from paddle import static +from paddle.pir_utils import test_with_pir_api from paddle.utils.cpp_extension import get_build_directory, load from paddle.utils.cpp_extension.extension_utils import run_cmd @@ -76,19 +77,31 @@ def inplace_static_add(func, device, dtype, np_x, np_y): exe = static.Executor() exe.run(static.default_startup_program()) + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + fetch_list = [ + x, + out, + ops[-1].result(0), + ops[-1].result(1), + ops[-2].result(0), + ] + else: + fetch_list = [ + x.name, + out.name, + x.name + "@GRAD", + y.name + "@GRAD", + out.name + "@GRAD", + ] + x_v, out_v, x_grad_v, y_grad_v, out_grad_v = exe.run( static.default_main_program(), feed={ "x": np_x.astype(dtype), "y": np_y.astype(dtype), }, - fetch_list=[ - x.name, - out.name, - x.name + "@GRAD", - y.name + "@GRAD", - out.name + "@GRAD", - ], + fetch_list=fetch_list, ) paddle.disable_static() return x_v, out_v, x_grad_v, y_grad_v, out_grad_v @@ -142,6 +155,39 @@ def inplace_static_add_vector(custom_func, device, dtype, np_inputs, np_y): exe = static.Executor() exe.run(static.default_startup_program()) + if paddle.framework.in_pir_mode(): + 
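+        # [Editor's note] An assumption these tests rely on: under PIR,
+        # static.append_backward() appends the grad ops at the tail of the
+        # block, so ops[-1] is the last grad op and ops[-1].result(k) is its
+        # k-th output. The exact negative offsets (ops[-1], ops[-2], ...)
+        # are specific to each generated program, not a general rule.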
ops = static.default_main_program().global_block().ops + if custom_func: + fetch_list = [ + out[0], + out[1], + ops[-1].result(0), # x1_grad + ops[-1].result(1), # x2_grad + ops[-2].result(1), # y_grad + ops[-5].result(0), # out0_grad + ops[-5].result(1), + ] # out1_grad + else: + fetch_list = [ + out[0], + out[1], + ops[-4].result(0), # x1_grad + ops[-3].result(0), # x2_grad + ops[-1].result(0), # y_grad + ops[-5].result(0), # out0_grad + ops[-5].result(1), + ] # out1_grad + else: + fetch_list = [ + out[0].name, + out[1].name, + x1.name + "@GRAD", + x2.name + "@GRAD", + y.name + "@GRAD", + out[0].name + "@GRAD", + out[1].name + "@GRAD", + ] + ( out0_v, out1_v, @@ -157,15 +203,7 @@ def inplace_static_add_vector(custom_func, device, dtype, np_inputs, np_y): "x2": np_inputs[1].astype(dtype), "y": np_y.astype(dtype), }, - fetch_list=[ - out[0].name, - out[1].name, - x1.name + "@GRAD", - x2.name + "@GRAD", - y.name + "@GRAD", - out[0].name + "@GRAD", - out[1].name + "@GRAD", - ], + fetch_list=fetch_list, ) paddle.disable_static() return ( @@ -216,6 +254,24 @@ def inplace_static_relu_net(func, device, dtype, np_x, np_y, np_z): exe = static.Executor() exe.run(static.default_startup_program()) + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + fetch_list = [ + x, + y, + out, + ops[-1].result(0), # x_grad + ops[-1].result(1), + ] # y_grad + else: + fetch_list = [ + x.name, + y.name, + out.name, + x.name + "@GRAD", + y.name + "@GRAD", + ] + x_v, y_v, out_v, x_grad_v, y_grad_v = exe.run( static.default_main_program(), feed={ @@ -223,13 +279,7 @@ def inplace_static_relu_net(func, device, dtype, np_x, np_y, np_z): "y": np_y.astype(dtype), "z": np_z.astype(dtype), }, - fetch_list=[ - x.name, - y.name, - out.name, - x.name + "@GRAD", - y.name + "@GRAD", - ], + fetch_list=fetch_list, ) paddle.disable_static() return x_v, y_v, out_v, x_grad_v, y_grad_v @@ -284,6 +334,49 @@ def static_multi_inplace(custom_func, device, dtype, np_x, np_y, np_a, np_b): mean_out = paddle.mean(paddle.add(out_xy, out_ab)) static.append_backward(mean_out) + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + if custom_func: + fetch_list = [ + x, + out_xy, + ops[-1].result(0), # x_grad + ops[-1].result(1), # y_grad + ops[-2].result(0), # out_xy_grad + a, + out_ab, + ops[-1].result(2), # a_grad + ops[-1].result(3), # b_grad + ops[-2].result(1), + ] # out_ab_grad + else: + fetch_list = [ + x, + out_xy, + ops[-2].result(0), # x_grad + ops[-2].result(1), # y_grad + ops[-3].result(0), # out_xy_grad + a, + out_ab, + ops[-1].result(0), # a_grad + ops[-1].result(1), # b_grad + ops[-3].result(1), + ] # out_ab_grad + + else: + fetch_list = [ + x.name, + out_xy.name, + x.name + "@GRAD", + y.name + "@GRAD", + out_xy.name + "@GRAD", + a.name, + out_ab.name, + a.name + "@GRAD", + b.name + "@GRAD", + out_ab.name + "@GRAD", + ] + exe = static.Executor() exe.run(static.default_startup_program()) @@ -306,18 +399,7 @@ def static_multi_inplace(custom_func, device, dtype, np_x, np_y, np_a, np_b): "a": np_a.astype(dtype), "b": np_b.astype(dtype), }, - fetch_list=[ - x.name, - out_xy.name, - x.name + "@GRAD", - y.name + "@GRAD", - out_xy.name + "@GRAD", - a.name, - out_ab.name, - a.name + "@GRAD", - b.name + "@GRAD", - out_ab.name + "@GRAD", - ], + fetch_list=fetch_list, ) paddle.disable_static() return ( @@ -348,6 +430,7 @@ def setUp(self): np.random.random((3, 2)).astype("float32"), ] + @test_with_pir_api def test_static_add(self): for device in self.devices: for 
dtype in self.dtypes: @@ -426,6 +509,7 @@ def test_dynamic_add(self): check_output(custom_x_grad, pd_x_grad, "x_grad") check_output(custom_y_grad, pd_y_grad, "y_grad") + @test_with_pir_api def test_static_add_vector(self): for device in self.devices: for dtype in self.dtypes: @@ -498,6 +582,7 @@ def test_dynamic_add_vector(self): check_output(custom_x_grad, pd_x_grad, "x_grad") check_output(custom_y_grad, pd_y_grad, "y_grad") + @test_with_pir_api def test_static_relu_net(self): for device in self.devices: for dtype in self.dtypes: @@ -573,6 +658,7 @@ def test_dynamic_relu_net(self): check_output(custom_x_grad, pd_x_grad, "x_grad") check_output(custom_y_grad, pd_y_grad, "y_grad") + @test_with_pir_api def test_static_multi_inplace(self): for device in self.devices: for dtype in self.dtypes: diff --git a/test/custom_op/test_custom_linear.py b/test/custom_op/test_custom_linear.py index 60a881bdb6a0c..9ec08138ab544 100644 --- a/test/custom_op/test_custom_linear.py +++ b/test/custom_op/test_custom_linear.py @@ -21,6 +21,7 @@ import paddle import paddle.nn.functional as F from paddle import static +from paddle.pir_utils import test_with_pir_api from paddle.utils.cpp_extension import get_build_directory, load from paddle.utils.cpp_extension.extension_utils import run_cmd @@ -71,6 +72,30 @@ def linear_static(func, device, dtype, np_x, np_weight, np_bias): exe = static.Executor() exe.run(static.default_startup_program()) + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + if func.__name__ == "custom_linear": + fetch_list = [ + out, + ops[-1].result(0), # x_grad + ops[-1].result(1), # weight_grad + ops[-1].result(2), + ] # bias_grad + else: + fetch_list = [ + out, + ops[-1].result(0), # x_grad + ops[-1].result(1), # weight_grad + ops[-2].result(1), + ] # bias_grad + else: + fetch_list = [ + out.name, + x.name + "@GRAD", + weight.name + "@GRAD", + bias.name + "@GRAD", + ] + out_v, x_grad_v, weight_grad_v, bias_grad_v = exe.run( static.default_main_program(), feed={ @@ -78,12 +103,7 @@ def linear_static(func, device, dtype, np_x, np_weight, np_bias): "weight": np_weight.astype(dtype), "bias": np_bias.astype(dtype), }, - fetch_list=[ - out.name, - x.name + "@GRAD", - weight.name + "@GRAD", - bias.name + "@GRAD", - ], + fetch_list=fetch_list, ) paddle.disable_static() return out_v, x_grad_v, weight_grad_v, bias_grad_v @@ -99,6 +119,7 @@ def setUp(self): self.np_weight = np.full([2, 4], fill_value=0.5, dtype="float32") self.np_bias = np.ones([4], dtype="float32") + @test_with_pir_api def test_static(self): for device in self.devices: for dtype in self.dtypes: diff --git a/test/custom_op/test_custom_optional.py b/test/custom_op/test_custom_optional.py index 7eee74ca0066c..69ed387b06b9c 100644 --- a/test/custom_op/test_custom_optional.py +++ b/test/custom_op/test_custom_optional.py @@ -20,6 +20,7 @@ import paddle from paddle import static +from paddle.pir_utils import test_with_pir_api from paddle.utils.cpp_extension import get_build_directory, load from paddle.utils.cpp_extension.extension_utils import run_cmd @@ -92,14 +93,20 @@ def optional_static_add(custom_func, device, dtype, np_x, np_y): exe = static.Executor() exe.run(static.default_startup_program()) - x_v, out_v, x_grad_v = exe.run( - static.default_main_program(), - feed=feed_dict, - fetch_list=[ + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + fetch_list = [x, out, ops[-1].result(0)] + else: + fetch_list = [ x.name, out.name, x.name + "@GRAD", - ], + ] + + 
x_v, out_v, x_grad_v = exe.run( + static.default_main_program(), + feed=feed_dict, + fetch_list=fetch_list, ) paddle.disable_static() return x_v, out_v, x_grad_v @@ -195,29 +202,52 @@ def optional_inplace_static_add(custom_func, device, dtype, np_x, np_y): exe = static.Executor() exe.run(static.default_startup_program()) - if np_y is not None: - x_v, out_v, x_grad_v, y_grad_v = exe.run( - static.default_main_program(), - feed=feed_dict, - fetch_list=[ + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + if custom_func: + fetch_list = [ + x, + out, + ops[-1].result(0), # x_grad + ops[-1].result(1), + ] # y_grad + else: + fetch_list = [ + x, + out, + ops[-1].result(0), # x_grad + ops[-3].result(0), + ] # y_grad + else: + fetch_list = [ x.name, out.name, x.name + "@GRAD", y.name + "@GRAD", - ], + ] + x_v, out_v, x_grad_v, y_grad_v = exe.run( + static.default_main_program(), + feed=feed_dict, + fetch_list=fetch_list, ) paddle.disable_static() return [x_v, out_v, x_grad_v, y_grad_v] else: - x_v, out_v, x_grad_v = exe.run( - static.default_main_program(), - feed=feed_dict, - fetch_list=[ + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + fetch_list = [x, out, ops[-1].result(0)] + + else: + fetch_list = [ x.name, out.name, x.name + "@GRAD", - ], + ] + x_v, out_v, x_grad_v = exe.run( + static.default_main_program(), + feed=feed_dict, + fetch_list=fetch_list, ) paddle.disable_static() return [x_v, out_v, x_grad_v] @@ -288,14 +318,21 @@ def optional_vector_static_add(custom_func, device, dtype, np_x, np_inputs): exe = static.Executor() exe.run(static.default_startup_program()) - x_v, out_v, x_grad_v = exe.run( - static.default_main_program(), - feed=feed_dict, - fetch_list=[ + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + fetch_list = [x, out, ops[-1].result(0)] + + else: + fetch_list = [ x.name, out.name, x.name + "@GRAD", - ], + ] + + x_v, out_v, x_grad_v = exe.run( + static.default_main_program(), + feed=feed_dict, + fetch_list=fetch_list, ) paddle.disable_static() return x_v, out_v, x_grad_v @@ -427,28 +464,53 @@ def optional_inplace_vector_static_add( exe.run(static.default_startup_program()) if np_inputs is not None: - x_v, out_v, x_grad_v, y1_grad_v, y2_grad_v = exe.run( - static.default_main_program(), - feed=feed_dict, - fetch_list=[ + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + if custom_func: + fetch_list = [ + x, + out, + ops[-2].result(0), # x_grad + ops[-1].result(0), # y1_grad + ops[-1].result(1), + ] # y2_grad + else: + fetch_list = [ + x, + out, + ops[-1].result(0), # x_grad + ops[-3].result(0), # y1_grad + ops[-6].result(0), + ] # y2_grad + else: + fetch_list = [ x.name, out.name, x.name + "@GRAD", y1.name + "@GRAD", y2.name + "@GRAD", - ], + ] + x_v, out_v, x_grad_v, y1_grad_v, y2_grad_v = exe.run( + static.default_main_program(), + feed=feed_dict, + fetch_list=fetch_list, ) paddle.disable_static() return [x_v, out_v, x_grad_v, y1_grad_v, y2_grad_v] else: - x_v, out_v, x_grad_v = exe.run( - static.default_main_program(), - feed=feed_dict, - fetch_list=[ + if paddle.framework.in_pir_mode(): + ops = static.default_main_program().global_block().ops + fetch_list = [x, out, ops[-1].result(0)] # y_grad + else: + fetch_list = [ x.name, out.name, x.name + "@GRAD", - ], + ] + x_v, out_v, x_grad_v = exe.run( + static.default_main_program(), + feed=feed_dict, + fetch_list=fetch_list, ) paddle.disable_static() 
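        # [Editor's note] The same fetch pattern recurs throughout these
        # tests; a minimal sketch of it for a hypothetical single-input net
        # (the grad op's position is an assumption, it varies per program):
        #
        #     if paddle.framework.in_pir_mode():
        #         ops = static.default_main_program().global_block().ops
        #         fetch_list = [out, ops[-1].result(0)]      # fetch by Value
        #     else:
        #         fetch_list = [out.name, x.name + "@GRAD"]  # fetch by name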
return [x_v, out_v, x_grad_v] @@ -465,6 +527,7 @@ def setUp(self): np.random.random((3, 2)).astype("float32"), ] + @test_with_pir_api def test_optional_static_add(self): for device in self.devices: for dtype in self.dtypes: @@ -527,6 +590,7 @@ def test_optional_dynamic_add(self): check_output(custom_out, pd_out, "out") check_output(custom_x_grad, pd_x_grad, "x_grad") + @test_with_pir_api def test_optional_inplace_static_add(self): for device in self.devices: for dtype in self.dtypes: @@ -598,6 +662,7 @@ def test_optional_inplace_dynamic_add(self): check_output(custom_x_grad, pd_x_grad, "x_grad") check_output(custom_y_grad, pd_y_grad, "y_grad") + @test_with_pir_api def test_optional_vector_static_add(self): for device in self.devices: for dtype in self.dtypes: @@ -660,6 +725,7 @@ def test_optional_vector_dynamic_add(self): check_output(custom_out, pd_out, "out") check_output(custom_x_grad, pd_x_grad, "x_grad") + @test_with_pir_api def test_optional_inplace_vector_static_add(self): for device in self.devices: for dtype in self.dtypes: diff --git a/test/custom_op/test_custom_tensor_operator.py b/test/custom_op/test_custom_tensor_operator.py index 8460bd2dba95a..b78b71a055c13 100644 --- a/test/custom_op/test_custom_tensor_operator.py +++ b/test/custom_op/test_custom_tensor_operator.py @@ -25,6 +25,7 @@ import paddle from paddle import static +from paddle.pir_utils import test_with_pir_api from paddle.utils.cpp_extension import get_build_directory, load from paddle.utils.cpp_extension.extension_utils import run_cmd @@ -35,6 +36,14 @@ cmd = f'del {file}' run_cmd(cmd, True) +custom_module = load( + name='custom_tensor_operator', + sources=['custom_tensor_operator.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cxx_cflags=extra_cc_args, # test for cc flags + verbose=True, +) + def test_custom_add_dynamic(func, device, dtype, np_x, use_func=True): paddle.set_device(device) @@ -74,7 +83,7 @@ def test_custom_add_static(func, device, dtype, np_x, use_func=True): out_v = exe.run( static.default_main_program(), feed={'X': np_x}, - fetch_list=[out.name], + fetch_list=[out], ) paddle.disable_static() @@ -119,7 +128,7 @@ def test_custom_subtract_static(func, device, dtype, np_x, use_func=True): out_v = exe.run( static.default_main_program(), feed={'X': np_x}, - fetch_list=[out.name], + fetch_list=[out], ) paddle.disable_static() @@ -164,7 +173,7 @@ def test_custom_multiply_static(func, device, dtype, np_x, use_func=True): out_v = exe.run( static.default_main_program(), feed={'X': np_x}, - fetch_list=[out.name], + fetch_list=[out], ) paddle.disable_static() @@ -208,7 +217,7 @@ def test_custom_divide_static(func, device, dtype, np_x, use_func=True): out_v = exe.run( static.default_main_program(), feed={'X': np_x}, - fetch_list=[out.name], + fetch_list=[out], ) paddle.disable_static() @@ -217,41 +226,50 @@ def test_custom_divide_static(func, device, dtype, np_x, use_func=True): class TestJITLoad(unittest.TestCase): def setUp(self): - self.custom_module = load( - name='custom_tensor_operator', - sources=['custom_tensor_operator.cc'], - extra_include_paths=paddle_includes, # add for Coverage CI - extra_cxx_cflags=extra_cc_args, # test for cc flags - verbose=True, - ) + self.custom_module = custom_module self.devices = ['cpu'] self.dtypes = ['float32', 'float64'] if paddle.is_compiled_with_cuda(): self.devices.append('gpu') self.dtypes.append('float16') - def test_all(self): + def test_dynamic(self): self.add = self.custom_module.custom_add self.subtract = 
self.custom_module.custom_subtract
         self.multiply = self.custom_module.custom_multiply
         self.divide = self.custom_module.custom_divide
-        self._test_static()
         self._test_dynamic()
         self.add = self.custom_module.custom_scalar_add
         self.subtract = self.custom_module.custom_scalar_subtract
         self.multiply = self.custom_module.custom_scalar_multiply
         self.divide = self.custom_module.custom_scalar_divide
-        self._test_static()
         self._test_dynamic()
         self.add = self.custom_module.custom_left_scalar_add
         self.subtract = self.custom_module.custom_left_scalar_subtract
         self.multiply = self.custom_module.custom_left_scalar_multiply
         self.divide = self.custom_module.custom_left_scalar_divide
-        self._test_static()
         self._test_dynamic()
         self._test_logical_operants()
         self._test_compare_operants()

+    @test_with_pir_api
+    def test_static(self):
+        self.add = self.custom_module.custom_add
+        self.subtract = self.custom_module.custom_subtract
+        self.multiply = self.custom_module.custom_multiply
+        self.divide = self.custom_module.custom_divide
+        self._test_static()
+        self.add = self.custom_module.custom_scalar_add
+        self.subtract = self.custom_module.custom_scalar_subtract
+        self.multiply = self.custom_module.custom_scalar_multiply
+        self.divide = self.custom_module.custom_scalar_divide
+        self._test_static()
+        self.add = self.custom_module.custom_left_scalar_add
+        self.subtract = self.custom_module.custom_left_scalar_subtract
+        self.multiply = self.custom_module.custom_left_scalar_multiply
+        self.divide = self.custom_module.custom_left_scalar_divide
+        self._test_static()
+
     def _test_static(self):
         for device in self.devices:
             for dtype in self.dtypes:
diff --git a/test/custom_op/test_multi_out_jit.py b/test/custom_op/test_multi_out_jit.py
index c64c424e393b0..3721a40f3f05b 100644
--- a/test/custom_op/test_multi_out_jit.py
+++ b/test/custom_op/test_multi_out_jit.py
@@ -20,6 +20,7 @@
 import paddle
 from paddle import static
+from paddle.pir_utils import test_with_pir_api
 from paddle.utils.cpp_extension import get_build_directory, load
 from paddle.utils.cpp_extension.extension_utils import run_cmd
@@ -69,14 +70,34 @@ def discrete_out_static(use_custom, device, dtype, np_w, np_x, np_y, np_z):
     y.stop_gradient = False
     z.stop_gradient = False
     if use_custom:
         out = multi_out_module.discrete_out(w, x, y, z)
     else:
         out = w * 1 + x * 2 + y * 3 + z * 4
     static.append_backward(out)
-
     exe = static.Executor()
     exe.run(static.default_startup_program())
+    if paddle.framework.in_pir_mode():
+        ops = static.default_main_program().global_block().ops
+        if use_custom:
+            fetch_list = [
+                out,
+                ops[-1].result(0),  # w_grad
+                ops[-1].result(1),
+            ]  # y_grad
+        else:
+            fetch_list = [
+                out,
+                ops[-2].result(0),  # w_grad
+                ops[-3].result(0),
+            ]  # y_grad
+    else:
+        fetch_list = [
+            out.name,
+            w.name + "@GRAD",
+            y.name + "@GRAD",
+        ]
+
     out_v, w_grad_v, y_grad_v = exe.run(
         static.default_main_program(),
         feed={
@@ -85,11 +109,7 @@ def discrete_out_static(use_custom, device, dtype, np_w, np_x, np_y, np_z):
             "y": np_y.astype(dtype),
             "z": np_z.astype(dtype),
         },
-        fetch_list=[
-            out.name,
-            w.name + "@GRAD",
-            y.name + "@GRAD",
-        ],
+        fetch_list=fetch_list,
     )
     paddle.disable_static()
     return out_v, w_grad_v, y_grad_v
@@ -138,6 +158,7 @@ def check_multi_outputs(self, outs, is_dynamic=False):
             self.assertTrue('int32' in str(one_int32.dtype))
         check_output(one_int32, np.ones([4, 8]).astype('int32'), "one_int32")

+    @test_with_pir_api
     def
test_multi_out_static(self): paddle.enable_static() for device in self.devices: @@ -157,6 +178,7 @@ def test_multi_out_dynamic(self): self.assertTrue(len(outs) == 3) self.check_multi_outputs(outs, True) + @test_with_pir_api def test_discrete_out_static(self): for device in self.devices: for dtype in self.dtypes: From f8fbbb50fab0ab34c0d2835a762f6419f7f1c881 Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 11 Mar 2024 11:31:49 +0800 Subject: [PATCH 109/114] Fix precedding_nodes preceding_nodes (#62544) --- paddle/fluid/eager/backward.cc | 4 +- paddle/fluid/eager/general_grad.h | 56 +++++++++---------- .../fluid/framework/details/op_handle_base.h | 4 +- 3 files changed, 33 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 027ebba18be96..33d945d29a4a3 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -286,8 +286,8 @@ std::vector RunBackward( node_input_buffer->Buffers(), create_graph, is_general_grad); if (!inputs.empty() && is_general_grad) { - GeneralGrad::Instance().SetResultForEnddingNodes(grad_output_tensors, - node); + GeneralGrad::Instance().SetResultForEndingNodes(grad_output_tensors, + node); } // retain_grad or not diff --git a/paddle/fluid/eager/general_grad.h b/paddle/fluid/eager/general_grad.h index 443455619cae6..180e73ca81cfa 100644 --- a/paddle/fluid/eager/general_grad.h +++ b/paddle/fluid/eager/general_grad.h @@ -124,15 +124,15 @@ class GeneralGrad { } visited.insert(target_node); if (!(depending_nodes_)[target_node].empty()) { - auto precedding_nodes = (depending_nodes_)[target_node]; - for (auto pre_nodes : precedding_nodes) { + auto preceding_nodes = (depending_nodes_)[target_node]; + for (auto pre_nodes : preceding_nodes) { queue.push_back(pre_nodes); needed_nodes_.emplace(pre_nodes); if (IsInputTargetNodes(pre_nodes)) { input_target_nodes_on_path.emplace(pre_nodes); } } - } else { // startup_ops have no precedding nodes + } else { // startup_ops have no preceding nodes VLOG(6) << "Emplace startup_ops"; startup_ops.emplace(target_node); needed_nodes_.emplace(target_node); @@ -143,7 +143,7 @@ class GeneralGrad { input_target_nodes_inputmeta_map_) { if (!input_target_nodes_on_path.count( target_nodes_inputmeta_pair.first)) { - endding_nodes_.emplace(target_nodes_inputmeta_pair.first); + ending_nodes_.emplace(target_nodes_inputmeta_pair.first); } } @@ -236,12 +236,12 @@ class GeneralGrad { } // TODO(jiabin): Some check here. } - void SetResultForEnddingNodes( + void SetResultForEndingNodes( paddle::small_vector, kSlotSmallVectorSize> grad_output, GradNodeBase* node) { - if (IsEnddingNodes(node)) { - VLOG(6) << "Set result for endding_nodes_ with grad_output_tensors"; + if (IsEndingNodes(node)) { + VLOG(6) << "Set result for ending_nodes_ with grad_output_tensors"; results_map_[node] = std::make_shared(grad_output[0][0]); } } @@ -275,9 +275,9 @@ class GeneralGrad { } // Register Hook to fetch input's gradients, when input's grad node is not an - // endding node in backward graph. If input's grad node is an endding node in + // ending node in backward graph. If input's grad node is an ending node in // backward graph, use grad node's output as inputs' gradients and no need to - // register Hook. Please note that endding node must be GradNodeAccumulation + // register Hook. Please note that ending node must be GradNodeAccumulation // after ModifyBackwardGraph function. 
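  // [Editor's note] Concretely: each requested input's grad node is first
  // mapped through orig_to_copied_node_map_; if the copied node was
  // rewritten into an accumulation "ending node", its gradient is taken
  // directly from that node's result (stored via SetResultForEndingNodes),
  // otherwise FetchGradForTensor() installs a hook that captures the
  // gradient as it flows through the node on the backward path.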
void RegisterFetchGradHook(const std::vector& inputs) { VLOG(6) << "Running in RegisterFetchGradHook."; @@ -296,8 +296,8 @@ class GeneralGrad { if (orig_to_copied_node_map_.count(target_node)) { target_node = orig_to_copied_node_map_[target_node].get(); - if (copied_node_to_endding_node_map_.count(target_node)) { - VLOG(6) << "No need to call FetchGradForTensor for endding_nodes"; + if (copied_node_to_ending_node_map_.count(target_node)) { + VLOG(6) << "No need to call FetchGradForTensor for ending_nodes"; continue; } } @@ -309,7 +309,7 @@ class GeneralGrad { "stop_gradient=True.", i)); - if (!IsEnddingNodes(target_node)) { + if (!IsEndingNodes(target_node)) { // Fetch grad for tensor in target_node on path. auto fetched_grad = FetchGradForTensor(inputs[i], target_node); results_map_[target_node] = fetched_grad; @@ -321,9 +321,9 @@ class GeneralGrad { void SetNodeToAccumulationNode(GradNodeBase* node) { if (dynamic_cast(node)) return; if (!(depending_nodes_)[node].empty()) { - // Find precedding_nodes of current node. - auto precedding_nodes = (depending_nodes_)[node]; - for (auto pre_nodes : precedding_nodes) { + // Find preceding_nodes of current node. + auto preceding_nodes = (depending_nodes_)[node]; + for (auto pre_nodes : preceding_nodes) { paddle::small_vector, kSlotSmallVectorSize>& pre_nodes_edges = pre_nodes->MutableOutputMeta(); for (size_t i = 0; i < pre_nodes_edges.size(); i++) { @@ -332,21 +332,21 @@ class GeneralGrad { if (edge_.GetGradNode() == node) { Edge& pre_node_edge = pre_nodes_edges[i][j].GetMutableEdge(); - if (copied_node_to_endding_node_map_.count(node)) { + if (copied_node_to_ending_node_map_.count(node)) { pre_node_edge.SetGradNode( - copied_node_to_endding_node_map_[node]); + copied_node_to_ending_node_map_[node]); } else { auto autograd_meta = egr::AutogradMeta(edge_); std::shared_ptr shared_grad_node_accumulation = std::make_shared(&autograd_meta); pre_node_edge.SetGradNode(shared_grad_node_accumulation); - copied_node_to_endding_node_map_[node] = + copied_node_to_ending_node_map_[node] = shared_grad_node_accumulation; } auto* grad_node = pre_node_edge.GetGradNode(); needed_nodes_.emplace(grad_node); - endding_nodes_.emplace(grad_node); + ending_nodes_.emplace(grad_node); input_target_nodes_inputmeta_map_[grad_node] = input_target_nodes_inputmeta_map_[node]; @@ -384,7 +384,7 @@ class GeneralGrad { } visited.insert(node); - if (IsInputTargetNodes(node) && IsEnddingNodes(node)) { + if (IsInputTargetNodes(node) && IsEndingNodes(node)) { SetNodeToAccumulationNode(node); continue; } @@ -413,7 +413,7 @@ class GeneralGrad { } if (meta.size() != 1 && IsNeededNodes(node) && - !IsNeededNodes(next_node.get()) && !IsEnddingNodes(node)) { + !IsNeededNodes(next_node.get()) && !IsEndingNodes(node)) { VLOG(3) << "Get stop edge from grad_node: " << node->name() << " : " << node << " to:" << next_node->name() << ", " << next_node.get() << " with output rank info: " << i @@ -448,8 +448,8 @@ class GeneralGrad { auto* target_node = auto_grad_meta->GetMutableGradNode().get(); if (orig_to_copied_node_map_.count(target_node)) { target_node = orig_to_copied_node_map_[target_node].get(); - if (copied_node_to_endding_node_map_.count(target_node)) { - target_node = copied_node_to_endding_node_map_[target_node].get(); + if (copied_node_to_ending_node_map_.count(target_node)) { + target_node = copied_node_to_ending_node_map_[target_node].get(); } } else { VLOG(6) << "Unable to find target node in " @@ -480,7 +480,7 @@ class GeneralGrad { bool IsNeededNodes(GradNodeBase* node) { return 
needed_nodes_.count(node); } - bool IsEnddingNodes(GradNodeBase* node) { return endding_nodes_.count(node); } + bool IsEndingNodes(GradNodeBase* node) { return ending_nodes_.count(node); } bool IsInputTargetNodes(GradNodeBase* node) { auto iter = input_target_nodes_inputmeta_map_.find(node); @@ -621,9 +621,9 @@ class GeneralGrad { results_map_.clear(); copied_grad_nodes_.clear(); orig_to_copied_node_map_.clear(); - copied_node_to_endding_node_map_.clear(); + copied_node_to_ending_node_map_.clear(); needed_nodes_.clear(); - endding_nodes_.clear(); + ending_nodes_.clear(); } private: @@ -649,8 +649,8 @@ class GeneralGrad { std::unordered_set needed_nodes_; // Record which grad_node has been transformed to AccumulationNode std::unordered_map> - copied_node_to_endding_node_map_; - std::unordered_set endding_nodes_; + copied_node_to_ending_node_map_; + std::unordered_set ending_nodes_; DISABLE_COPY_AND_ASSIGN(GeneralGrad); }; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 6da7f9f8c2041..7a137b050bed7 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -64,7 +64,9 @@ class OpHandleBase { virtual bool GetSkipRunning() const { return skip_running_; } - virtual void SetSkipRunning(bool skip_runing) { skip_running_ = skip_runing; } + virtual void SetSkipRunning(bool skip_running) { + skip_running_ = skip_running; + } virtual std::string Name() const = 0; From ce5a3a85866e27606651c763c382cd7d60fc79f9 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Mon, 11 Mar 2024 12:52:33 +0800 Subject: [PATCH 110/114] support sharding stage 2 (#62486) --- python/paddle/distributed/__init__.py | 2 + .../paddle/distributed/auto_parallel/api.py | 132 ++++++++++++++++-- .../semi_auto_parallel_sharding_stage_2.py | 114 +++++++++++++++ ..._auto_parallel_hybrid_sharding_strategy.py | 10 ++ .../semi_auto_parallel_sharding_stage_2.py | 100 +++++++++++++ ...st_semi_auto_parallel_sharding_strategy.py | 10 ++ 6 files changed, 353 insertions(+), 15 deletions(-) create mode 100644 test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_2.py create mode 100644 test/auto_parallel/semi_auto_parallel_sharding_stage_2.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index feae03521c84b..58f8af1e37af8 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -87,6 +87,7 @@ shard_optimizer, shard_scaler, ShardingStage1, + ShardingStage2, ShardingStage3, to_static, Strategy, @@ -174,6 +175,7 @@ "shard_optimizer", "shard_scaler", "ShardingStage1", + "ShardingStage2", "ShardingStage3", "to_static", "Strategy", diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py index ada2958cdc57c..a12dd36849440 100644 --- a/python/paddle/distributed/auto_parallel/api.py +++ b/python/paddle/distributed/auto_parallel/api.py @@ -584,13 +584,14 @@ def get_placement_with_sharding(param, sharding_mesh_axis): # for example, [Shard(0), Shard(1)], assert here in case assert ( shard_axis == -1 - ), "The parameter can't be shard twice even in different mesh now." + ), "The parameter can't be shard twice with sharding strategy even in different mesh now." 
shard_axis = placement.get_dim() placement_with_sharding = None for dim in range(param.ndim): if dim != shard_axis: placement_with_sharding = dist.Shard(dim) + break new_placements = param.placements if placement_with_sharding is not None: @@ -626,10 +627,17 @@ def __init__(self, optimizer, shard_fn=None): self._sharding_mesh_axis = None self._sharding_degree = None - if isinstance(self._shard_fn, (ShardingStage1, ShardingStage3)): + if isinstance( + self._shard_fn, (ShardingStage1, ShardingStage2, ShardingStage3) + ): self._set_and_check_sharding_prop_from_param() self._shard_fn._set_sharding_mesh_axis(self._sharding_mesh_axis) + # Invoke register hook for sharding stage 2 strategy + if isinstance(self._shard_fn, ShardingStage2): + for param in self._inner_opt._parameter_list: + self._shard_fn._register_hook_for_param_grad(param) + # Invoke shard_parameter in sharding stage 3 strategy if isinstance(self._shard_fn, ShardingStage3): for param in self._inner_opt._parameter_list: @@ -835,10 +843,22 @@ def __getattr__(self, item): return getattr(self._inner_opt, item) -class ShardingStage1: +class _ShardingStageBase: + def __init__(self, mesh): + self._mesh = mesh + self._sharding_mesh_axis = None + + def _set_sharding_mesh_axis(self, sharding_mesh_axis): + self._sharding_mesh_axis = sharding_mesh_axis + + +class ShardingStage1(_ShardingStageBase): """ A builtin shard_fn for shard_optimizer interface, users can pass it to shard_optimizer to implement sharding optimization with stage 1. + Args: + mesh(paddle.distributed.ProcessMesh): The `ProcessMesh` object describes the Cartesian topology of the used processes. + Examples: .. code-block:: python @@ -860,7 +880,7 @@ class ShardingStage1: >>> layer = MLP() >>> batch = paddle.rand(shape=[8, 8]) >>> opt = paddle.optimizer.AdamW(parameters=layer.parameters()) - >>> opt = dist.shard_optimizer(opt, dist.ShardingStage1()) + >>> opt = dist.shard_optimizer(opt, dist.ShardingStage1(mesh)) >>> for _ in range(5): >>> loss = layer(batch) >>> loss.backward() @@ -871,8 +891,7 @@ class ShardingStage1: """ def __init__(self, mesh): - self._mesh = mesh - self._sharding_mesh_axis = None + super().__init__(mesh) def __call__(self, key, param, accumulator): if param.is_dist(): @@ -893,11 +912,94 @@ def __call__(self, key, param, accumulator): ) return accumulator - def _set_sharding_mesh_axis(self, sharding_mesh_axis): - self._sharding_mesh_axis = sharding_mesh_axis +class ShardingStage2(_ShardingStageBase): + """ + A builtin shard_fn for shard_optimizer interface, users can pass it to shard_optimizer to implement sharding optimization with stage 2. + + Args: + mesh(paddle.distributed.ProcessMesh): The `ProcessMesh` object describes the Cartesian topology of the used processes. + + Examples: + .. code-block:: python + + >>> import paddle + >>> import paddle.distributed as dist + + >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) -class ShardingStage3: + >>> class MLP(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.fc1 = paddle.nn.Linear(8, 8) + ... self.fc2 = paddle.nn.Linear(8, 8) + ... + ... def forward(self, input): + ... 
return self.fc2(self.fc1(input)) + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> layer = MLP() + >>> batch = paddle.rand(shape=[8, 8]) + >>> opt = paddle.optimizer.AdamW(parameters=layer.parameters()) + >>> opt = dist.shard_optimizer(opt, dist.ShardingStage2(mesh)) + >>> for _ in range(5): + >>> loss = layer(batch) + >>> loss.backward() + >>> opt.step() + >>> opt.clear_grad() + >>> # This case need to be executed in multi-card environment + >>> # python -m paddle.distributed.launch --gpus=0,1 {test_case}.py + """ + + def __init__(self, mesh): + super().__init__(mesh) + + def __call__(self, key, param, accumulator): + if param.is_dist(): + # Only deal with momentum in optimizer, beta should be replicated cross param's mesh + if 'beta' not in key: + placements = get_placement_with_sharding( + param, self._sharding_mesh_axis + ) + else: + placements = [ + dist.Replicate() + for _ in range(len(param.process_mesh.shape)) + ] + return shard_tensor( + accumulator, + mesh=param.process_mesh, + placements=placements, + ) + return accumulator + + @staticmethod + def _grad_hook(grad): + # do reshard only if the grad is dist tensor and in partial status + if grad.is_dist(): + partial_mesh_axis = None + for mesh_axis, placement in enumerate(grad.placements): + if isinstance(placement, dist.Partial): + partial_mesh_axis = mesh_axis + if partial_mesh_axis is not None: + new_placements = get_placement_with_sharding( + grad, partial_mesh_axis + ) + return reshard(grad, grad.process_mesh, new_placements) + + return grad + + def _register_hook_for_param_grad(self, param): + if param.is_dense(): + placements = [] + for _ in range(len(self._mesh.shape)): + placements.append(dist.Replicate()) + param._to_dist_(placements, self._mesh) + + param.register_hook(ShardingStage2._grad_hook) + + +class ShardingStage3(_ShardingStageBase): """ A builtin shard_fn for shard_optimizer interface, users can pass it to shard_optimizer to implement sharding optimization with stage 3. @@ -936,11 +1038,7 @@ class ShardingStage3: """ def __init__(self, mesh): - self._mesh = mesh - self._sharding_mesh_axis = None - - def _set_sharding_mesh_axis(self, sharding_mesh_axis): - self._sharding_mesh_axis = sharding_mesh_axis + super().__init__(mesh) def _shard_parameter(self, param): if param.is_dense(): @@ -2000,6 +2098,10 @@ def to_static( strategy.sharding.enable = True strategy.sharding.stage = 1 strategy.sharding.degree = sharding_degree + elif isinstance(shard_fn, ShardingStage2): + strategy.sharding.enable = True + strategy.sharding.stage = 2 + strategy.sharding.degree = sharding_degree elif isinstance(shard_fn, ShardingStage3): strategy.sharding.enable = True strategy.sharding.stage = 3 @@ -2008,7 +2110,7 @@ def to_static( shard_fn._unshard_parameter(param) else: raise NotImplementedError( - "Only sharding stage 1 and 3 can to_static for now. User-defined shard_fn and sharding stage 2 will be supported later." + "Only sharding stage 1, 2 and 3 can to_static for now. User-defined shard_fn will be supported later." ) dist_model = DistModel(layer, loader, loss, optimizer, strategy) diff --git a/test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_2.py b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_2.py new file mode 100644 index 0000000000000..a597e68ec4629 --- /dev/null +++ b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_2.py @@ -0,0 +1,114 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+from auto_parallel.semi_auto_parallel_dist_to_static_api import (
+    DemoNet,
+    create_data_loader,
+)
+
+import paddle
+import paddle.distributed as dist
+from paddle import nn
+
+
+class TestSemiAutoParallelShardingStage2:
+    def __init__(self):
+        self._backend = os.getenv("backend")
+        self._seed = eval(os.getenv("seed"))
+        self._mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"])
+
+    def check_tensor_eq(self, a, b, rtol=1e-05, atol=0, verbose=True):
+        np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, verbose=verbose)
+
+    def shard_layer_fn(self, layer_name, layer, process_mesh):
+        layer.weight = dist.shard_tensor(
+            layer.weight, process_mesh, [dist.Shard(1)]
+        )
+        layer.bias = dist.shard_tensor(
+            layer.bias, process_mesh, [dist.Shard(0)]
+        )
+
+    def get_single_card_rst(self):
+        paddle.seed(self._seed)
+        linear = paddle.nn.Linear(10, 10)
+        batch = paddle.rand(shape=[10, 10])
+        opt = paddle.optimizer.AdamW(parameters=linear.parameters())
+        for _ in range(5):
+            loss = linear(batch)
+            loss.backward()
+            opt.step()
+            opt.clear_grad()
+        self.weight = linear.weight.numpy()
+        self.bias = linear.bias.numpy()
+
+    def test_sharding_stage_2_with_mp(self):
+        paddle.seed(self._seed)
+        linear = paddle.nn.Linear(10, 10)
+        linear = dist.shard_layer(linear, self._mesh, self.shard_layer_fn)
+        batch = paddle.rand(shape=[10, 10])
+        # shard the input by sharding degree
+        batch = dist.shard_tensor(batch, self._mesh, [dist.Shard(0)])
+        # shard optimizer with stage 2 fn
+        opt = paddle.optimizer.AdamW(parameters=linear.parameters())
+        opt = dist.shard_optimizer(opt, dist.ShardingStage2(self._mesh))
+        for _ in range(5):
+            loss = linear(batch)
+            loss.backward()
+            opt.step()
+            opt.clear_grad()
+        self.check_tensor_eq(self.weight, linear.weight.numpy())
+        self.check_tensor_eq(self.bias, linear.bias.numpy())
+
+    def test_sharding_stage_2_with_mp_to_static(self):
+        data_loader = create_data_loader()
+        layer = DemoNet(
+            self._mesh, "sharding_with_mp_demonet", shard_weight=True
+        )
+        opt = paddle.optimizer.SGD(
+            learning_rate=0.1, parameters=layer.parameters()
+        )
+        opt = dist.shard_optimizer(opt, dist.ShardingStage2(self._mesh))
+        loss_fn = nn.MSELoss()
+
+        dist_loader = dist.shard_dataloader(
+            dataloader=data_loader,
+            meshes=[self._mesh],
+            shard_dims=0,
+        )
+
+        dist_model = dist.to_static(layer, dist_loader, loss_fn, opt)
+
+        dist_model.train()
+        for epoch in range(2):
+            for batch_id, (image, label) in enumerate(dist_loader()):
+                loss = dist_model(image, label)
+
+    def run_test_case(self):
+        if self._backend == "cpu":
+            paddle.set_device("cpu")
+        elif self._backend == "gpu":
+            paddle.set_device("gpu:" + str(dist.get_rank()))
+        else:
+            raise ValueError("Only support cpu or gpu backend.")
+
+        self.get_single_card_rst()
+        self.test_sharding_stage_2_with_mp()
+        self.test_sharding_stage_2_with_mp_to_static()
+
+
+if __name__ == '__main__':
+    TestSemiAutoParallelShardingStage2().run_test_case()
diff --git
a/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_sharding_strategy.py b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_sharding_strategy.py index e358c18ba2a21..3ba3e83bdd81a 100644 --- a/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_sharding_strategy.py +++ b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_sharding_strategy.py @@ -41,6 +41,16 @@ def test_sharding_stage_1_strategy(self): user_defined_envs=envs, ) + def test_sharding_stage_2_strategy(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_sharding_stage_2.py", + user_defined_envs=envs, + ) + def test_sharding_stage_3_strategy(self): envs_list = test_base.gen_product_envs_list( self._default_envs, self._changeable_envs diff --git a/test/auto_parallel/semi_auto_parallel_sharding_stage_2.py b/test/auto_parallel/semi_auto_parallel_sharding_stage_2.py new file mode 100644 index 0000000000000..29cfea8e0ab59 --- /dev/null +++ b/test/auto_parallel/semi_auto_parallel_sharding_stage_2.py @@ -0,0 +1,100 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
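+# [Editor's note] Background for this test, summarizing the API change
+# above: ShardingStage2 shards the optimizer states across the sharding
+# mesh axis like stage 1 and, in addition, registers a hook on each
+# parameter's gradient that reshards a Partial gradient to Shard along
+# that axis, so every rank keeps only its own gradient slice. A minimal
+# usage sketch (names as introduced in this PR):
+#
+#     mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+#     opt = paddle.optimizer.AdamW(parameters=layer.parameters())
+#     opt = dist.shard_optimizer(opt, dist.ShardingStage2(mesh))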
+ +import os + +import numpy as np +from semi_auto_parallel_dist_to_static_api import DemoNet, create_data_loader + +import paddle +import paddle.distributed as dist +from paddle import nn + + +class TestSemiAutoParallelShardingStage2: + def __init__(self): + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + + def check_tensor_eq(self, a, b, rtol=1e-05, atol=0, verbose=True): + np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, verbose=verbose) + + def get_single_card_rst(self): + paddle.seed(self._seed) + linear = paddle.nn.Linear(10, 10) + batch = paddle.rand(shape=[10, 10]) + opt = paddle.optimizer.AdamW(parameters=linear.parameters()) + for _ in range(5): + loss = linear(batch) + loss.backward() + opt.step() + opt.clear_grad() + self.weight = linear.weight.numpy() + self.bias = linear.bias.numpy() + + def test_pure_sharding_stage_2(self): + paddle.seed(self._seed) + linear = paddle.nn.Linear(10, 10) + batch = paddle.rand(shape=[10, 10]) + # shard the input by sharding degree + batch = dist.shard_tensor(batch, self._mesh, [dist.Shard(0)]) + # shard optimizer with stage 2 fn + opt = paddle.optimizer.AdamW(parameters=linear.parameters()) + opt = dist.shard_optimizer(opt, dist.ShardingStage2(self._mesh)) + for _ in range(5): + loss = linear(batch) + loss.backward() + opt.step() + opt.clear_grad() + self.check_tensor_eq(self.weight, linear.weight.numpy()) + self.check_tensor_eq(self.bias, linear.bias.numpy()) + + def test_sharding_stage_2_to_static(self): + data_loader = create_data_loader() + layer = DemoNet(self._mesh, "sharding_demonet") + opt = paddle.optimizer.SGD( + learning_rate=0.1, parameters=layer.parameters() + ) + opt = dist.shard_optimizer(opt, dist.ShardingStage2(self._mesh)) + loss_fn = nn.MSELoss() + + dist_loader = dist.shard_dataloader( + dataloader=data_loader, + meshes=[self._mesh], + shard_dims=0, + ) + + dist_model = dist.to_static(layer, dist_loader, loss_fn, opt) + + dist_model.train() + for epoch in range(2): + for batch_id, (image, label) in enumerate(dist_loader()): + loss = dist_model(image, label) + + def run_test_case(self): + if self._backend == "cpu": + paddle.set_device("cpu") + elif self._backend == "gpu": + paddle.set_device("gpu:" + str(dist.get_rank())) + else: + raise ValueError("Only support cpu or gpu backend.") + + self.get_single_card_rst() + self.test_pure_sharding_stage_2() + self.test_sharding_stage_2_to_static() + + +if __name__ == '__main__': + TestSemiAutoParallelShardingStage2().run_test_case() diff --git a/test/auto_parallel/test_semi_auto_parallel_sharding_strategy.py b/test/auto_parallel/test_semi_auto_parallel_sharding_strategy.py index 489cba334c1b0..8886df085ee56 100644 --- a/test/auto_parallel/test_semi_auto_parallel_sharding_strategy.py +++ b/test/auto_parallel/test_semi_auto_parallel_sharding_strategy.py @@ -41,6 +41,16 @@ def test_sharding_stage_1_strategy(self): user_defined_envs=envs, ) + def test_sharding_stage_2_strategy(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_sharding_stage_2.py", + user_defined_envs=envs, + ) + def test_sharding_stage_3_strategy(self): envs_list = test_base.gen_product_envs_list( self._default_envs, self._changeable_envs From 0942bbc2ce7984e809cb135f9059b6f990e97311 Mon Sep 17 00:00:00 2001 From: BiynXu <62832681+BiynXu@users.noreply.github.com> Date: Mon, 11 Mar 2024 12:52:42 +0800 Subject: 
[PATCH 111/114] fix small reduce in tile first schedule (#62593)

---
 .../tactic/tile_first_general_tactic.cc       | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc
index 035a59ae9582c..173404060f6fa 100644
--- a/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc
+++ b/paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include "paddle/cinn/ir/group_schedule/tactic/tile_first_general_tactic.h"
+#include "paddle/cinn/adt/adt.h"
+#include "paddle/cinn/common/integer_set.h"
 #include "paddle/cinn/common/target.h"
 #include "paddle/cinn/ir/ir.h"
 #include "paddle/cinn/ir/schedule/ir_schedule_util.h"
@@ -219,6 +221,22 @@ void TileFirstGeneralTactic::SplitWarpNumber(ir::IRSchedule* sch,
   };
   if (!IsWarpNumGT(1)) return;
 
+  const auto LimitWarpNum = [&](const std::shared_ptr<GroupTileInfo>& tile_info,
+                                const ir::Expr& loop) {
+    ir::Expr extent = loop.As<ir::For>()->extent;
+    common::cas_intervals_t var_intervals =
+        common::CollectVarIntervalsOfExprs({extent});
+    common::SymbolicExprAnalyzer analyzer(var_intervals);
+    const auto& proved_gt =
+        analyzer.ProveGT(ir::Expr(tile_info->warp_num), extent);
+    if (proved_gt.value_or(false)) {
+      ir::Expr upper_bound = analyzer.UpperBound(extent);
+      if (upper_bound.is_constant()) {
+        tile_info->warp_num = upper_bound.get_constant();
+      }
+    }
+  };
+
   if (!HasReduceAxis(context_->group_tile_info)) {
     // get num warp from flatten num
     auto loops = sch->GetLoops(block_id);
@@ -228,6 +246,7 @@ void TileFirstGeneralTactic::SplitWarpNumber(ir::IRSchedule* sch,
   } else if (IsInnerThreadSpatialLoopGT(context_->group_tile_info, 1)) {
     // get num warp from flatten num
     auto loops = sch->GetLoops(block_id);
+    LimitWarpNum(context_->group_tile_info, loops[0]);
     sch->Split(loops[0],
                std::vector({-1, context_->group_tile_info->warp_num}));
 
From 280045c072f4edcaa691b2e43df4492bdbce3510 Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Mon, 11 Mar 2024 13:19:01 +0800
Subject: [PATCH 112/114] fix loop reorder alignment tactic bug (#62581)

---
 .../ir/group_schedule/tactic/loop_reorder_alignment_tactic.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/cinn/ir/group_schedule/tactic/loop_reorder_alignment_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/loop_reorder_alignment_tactic.cc
index 39bf104e56508..3b8718ddf5815 100644
--- a/paddle/cinn/ir/group_schedule/tactic/loop_reorder_alignment_tactic.cc
+++ b/paddle/cinn/ir/group_schedule/tactic/loop_reorder_alignment_tactic.cc
@@ -173,7 +173,7 @@ void LoopReorderAlignmentTactic::DoReorder(ir::IRSchedule* sch,
   const auto IsReduceBlock = [&](const std::string& block_id) {
     return context_->group_tile_info->reduce_tensor_names.count(block_id) > 0;
   };
-  if (!IsReduceBlock(block_id)) {
+  if (IsReduceBlock(block_id)) {
     return;
   }
 
From a5f76154c045cf7f37eb6ce59dc4f72fd29f4c93 Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Mon, 11 Mar 2024 13:51:57 +0800
Subject: [PATCH 113/114] [PIR] Split test_zero_dim_tensor.py into 10 unittest
 files (#62527)

* split test_zero_dim_tensor

* split sundry api
---
 test/legacy_test/test_zero_dim_binary_api.py  |  353 +
 test/legacy_test/test_zero_dim_complex_api.py |  173 +
 .../test_zero_dim_distribution_loss_api.py    |  375 +
 .../test_zero_dim_no_backward_api.py          |  578 ++
 test/legacy_test/test_zero_dim_reduce_api.py  |  266 +
 .../test_zero_dim_sundry_dygraph_api.py       | 2356 ++++++
 .../test_zero_dim_sundry_static_api_part1.py  |  916 +++
 .../test_zero_dim_sundry_static_api_part2.py  | 1030 +++
 .../test_zero_dim_sundry_static_api_part3.py  |  990 +++
 test/legacy_test/test_zero_dim_tensor.py      | 6935 -----------------
 test/legacy_test/test_zero_dim_unary_api.py   |  185 +
 tools/windows/run_unittests.sh                |    6 +-
 12 files changed, 7227 insertions(+), 6936 deletions(-)
 create mode 100644 test/legacy_test/test_zero_dim_binary_api.py
 create mode 100644 test/legacy_test/test_zero_dim_complex_api.py
 create mode 100644 test/legacy_test/test_zero_dim_distribution_loss_api.py
 create mode 100644 test/legacy_test/test_zero_dim_no_backward_api.py
 create mode 100644 test/legacy_test/test_zero_dim_reduce_api.py
 create mode 100644 test/legacy_test/test_zero_dim_sundry_dygraph_api.py
 create mode 100644 test/legacy_test/test_zero_dim_sundry_static_api_part1.py
 create mode 100644 test/legacy_test/test_zero_dim_sundry_static_api_part2.py
 create mode 100644 test/legacy_test/test_zero_dim_sundry_static_api_part3.py
 delete mode 100644 test/legacy_test/test_zero_dim_tensor.py
 create mode 100644 test/legacy_test/test_zero_dim_unary_api.py

diff --git a/test/legacy_test/test_zero_dim_binary_api.py b/test/legacy_test/test_zero_dim_binary_api.py
new file mode 100644
index 0000000000000..fc6fcb14aba3b
--- /dev/null
+++ b/test/legacy_test/test_zero_dim_binary_api.py
@@ -0,0 +1,353 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
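+
+# This file exercises the binary elementwise APIs (add, subtract, multiply,
+# comparisons, bitwise/integer ops, ...) on 0D operands: 0D with 0D,
+# 0D broadcast against ND, and 0D with a Python scalar, in both dygraph
+# and static graph modes.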
+ +# Note: +# 0D Tensor indicates that the tensor's dimension is 0 +# 0D Tensor's shape is always [], numel is 1 +# which can be created by paddle.rand([]) + +import unittest + +import numpy as np + +import paddle + +binary_api_list = [ + {'func': paddle.add, 'cls_method': '__add__'}, + {'func': paddle.subtract, 'cls_method': '__sub__'}, + {'func': paddle.multiply, 'cls_method': '__mul__'}, + {'func': paddle.divide, 'cls_method': '__div__'}, + {'func': paddle.pow, 'cls_method': '__pow__'}, + {'func': paddle.equal, 'cls_method': '__eq__'}, + {'func': paddle.not_equal, 'cls_method': '__ne__'}, + {'func': paddle.greater_equal, 'cls_method': '__ge__'}, + {'func': paddle.greater_than, 'cls_method': '__gt__'}, + {'func': paddle.less_equal, 'cls_method': '__le__'}, + {'func': paddle.less_than, 'cls_method': '__lt__'}, + {'func': paddle.remainder, 'cls_method': '__mod__'}, + paddle.mod, + paddle.floor_mod, + paddle.logical_and, + paddle.logical_or, + paddle.logical_xor, + paddle.maximum, + paddle.minimum, + paddle.fmax, + paddle.fmin, + paddle.complex, + paddle.kron, + paddle.logaddexp, + paddle.nextafter, + paddle.ldexp, + paddle.polar, + paddle.heaviside, +] + +binary_int_api_list = [ + paddle.bitwise_and, + paddle.bitwise_or, + paddle.bitwise_xor, + paddle.gcd, + paddle.lcm, +] + + +inplace_binary_api_list = [ + paddle.tensor.add_, + paddle.tensor.subtract_, + paddle.tensor.multiply_, + paddle.tensor.remainder_, + paddle.tensor.remainder_, +] + + +# Use to test zero-dim of binary API +class TestBinaryAPI(unittest.TestCase): + def test_dygraph_binary(self): + paddle.disable_static() + for api in binary_api_list: + # 1) x is 0D, y is 0D + x = paddle.rand([]) + y = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) + np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) + else: + out = api(x, y) + + out.retain_grads() + out.backward() + + self.assertEqual(x.shape, []) + self.assertEqual(y.shape, []) + self.assertEqual(out.shape, []) + if x.grad is not None: + self.assertEqual(x.grad.shape, []) + self.assertEqual(y.grad.shape, []) + self.assertEqual(out.grad.shape, []) + + # 2) x is ND, y is 0D + x = paddle.rand([2, 3, 4]) + y = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) + np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) + else: + out = api(x, y) + + out.retain_grads() + out.backward() + + self.assertEqual(x.shape, [2, 3, 4]) + self.assertEqual(y.shape, []) + self.assertEqual(out.shape, [2, 3, 4]) + if x.grad is not None: + self.assertEqual(x.grad.shape, [2, 3, 4]) + self.assertEqual(y.grad.shape, []) + self.assertEqual(out.grad.shape, [2, 3, 4]) + + # 3) x is 0D , y is ND + x = paddle.rand([]) + y = paddle.rand([2, 3, 4]) + x.stop_gradient = False + y.stop_gradient = False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) + np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) + else: + out = api(x, y) + + out.retain_grads() + out.backward() + + self.assertEqual(x.shape, []) + self.assertEqual(y.shape, [2, 3, 4]) + self.assertEqual(out.shape, [2, 3, 4]) + if x.grad is not None: + self.assertEqual(x.grad.shape, []) + self.assertEqual(y.grad.shape, [2, 3, 4]) + self.assertEqual(out.grad.shape, [2, 3, 4]) + + # 4) x is 0D , y is scalar + x = 
paddle.rand([]) + x.stop_gradient = False + y = 0.5 + if isinstance(api, dict): + out = getattr(paddle.Tensor, api['cls_method'])(x, y) + + out.retain_grads() + out.backward() + + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + if x.grad is not None: + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.grad.shape, []) + + for api in binary_int_api_list: + # 1) x is 0D, y is 0D + x_np = np.random.randint(-10, 10, []) + y_np = np.random.randint(-10, 10, []) + out_np = eval('np.%s(x_np, y_np)' % api.__name__) + + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + out = api(x, y) + + self.assertEqual(out.shape, []) + np.testing.assert_array_equal(out.numpy(), out_np) + + # 2) x is ND, y is 0D + x_np = np.random.randint(-10, 10, [3, 5]) + y_np = np.random.randint(-10, 10, []) + out_np = eval('np.%s(x_np, y_np)' % api.__name__) + + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + out = api(x, y) + + self.assertEqual(out.shape, [3, 5]) + np.testing.assert_array_equal(out.numpy(), out_np) + + # 3) x is 0D , y is ND + x_np = np.random.randint(-10, 10, []) + y_np = np.random.randint(-10, 10, [3, 5]) + out_np = eval('np.%s(x_np, y_np)' % api.__name__) + + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + out = api(x, y) + + self.assertEqual(out.shape, [3, 5]) + np.testing.assert_array_equal(out.numpy(), out_np) + + for api in inplace_binary_api_list: + with paddle.no_grad(): + x = paddle.rand([]) + y = paddle.rand([]) + out = api(x, y) + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + + x = paddle.rand([3, 5]) + y = paddle.rand([]) + out = api(x, y) + self.assertEqual(x.shape, [3, 5]) + self.assertEqual(out.shape, [3, 5]) + + paddle.enable_static() + + def test_static_binary(self): + paddle.enable_static() + for api in binary_api_list: + main_prog = paddle.static.Program() + block = main_prog.global_block() + with paddle.static.program_guard( + main_prog, paddle.static.Program() + ): + # 1) x is 0D, y is 0D + x = paddle.rand([]) + y = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr( + paddle.static.Variable, api['cls_method'] + )(x, y) + self.assertEqual(out.shape, out_cls.shape) + else: + out = api(x, y) + paddle.static.append_backward(out) + + self.assertEqual(x.shape, ()) + self.assertEqual(y.shape, ()) + self.assertEqual(out.shape, ()) + if block.has_var(x.grad_name): + out_grad = block.var(out.grad_name) + x_grad = block.var(x.grad_name) + y_grad = block.var(y.grad_name) + + self.assertEqual(x_grad.shape, ()) + self.assertEqual(y_grad.shape, ()) + self.assertEqual(out_grad.shape, ()) + + # 2) x is 0D, y is ND + x = paddle.rand([]) + y = paddle.rand([2, 3, 4]) + x.stop_gradient = False + y.stop_gradient = False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr( + paddle.static.Variable, api['cls_method'] + )(x, y) + self.assertEqual(out.shape, out_cls.shape) + else: + out = api(x, y) + paddle.static.append_backward(out) + + self.assertEqual(x.shape, ()) + self.assertEqual(y.shape, (2, 3, 4)) + self.assertEqual(out.shape, (2, 3, 4)) + if block.has_var(x.grad_name): + out_grad = block.var(out.grad_name) + x_grad = block.var(x.grad_name) + y_grad = block.var(y.grad_name) + + self.assertEqual(x_grad.shape, ()) + self.assertEqual(y_grad.shape, (2, 3, 4)) + self.assertEqual(out_grad.shape, (2, 3, 4)) + + # 3) x is ND, y is 0d + x = paddle.rand([2, 3, 4]) + y = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = 
False + if isinstance(api, dict): + out = api['func'](x, y) + out_cls = getattr( + paddle.static.Variable, api['cls_method'] + )(x, y) + self.assertEqual(out.shape, out_cls.shape) + else: + out = api(x, y) + paddle.static.append_backward(out) + + self.assertEqual(x.shape, (2, 3, 4)) + self.assertEqual(y.shape, ()) + self.assertEqual(out.shape, (2, 3, 4)) + if block.has_var(x.grad_name): + out_grad = block.var(out.grad_name) + x_grad = block.var(x.grad_name) + y_grad = block.var(y.grad_name) + + self.assertEqual(x_grad.shape, (2, 3, 4)) + self.assertEqual(y_grad.shape, ()) + self.assertEqual(out_grad.shape, (2, 3, 4)) + + # 4) x is 0D , y is scalar + x = paddle.rand([]) + x.stop_gradient = False + y = 0.5 + if isinstance(api, dict): + out = getattr(paddle.static.Variable, api['cls_method'])( + x, y + ) + paddle.static.append_backward(out) + + self.assertEqual(x.shape, ()) + self.assertEqual(out.shape, ()) + if block.has_var(x.grad_name): + out_grad = block.var(out.grad_name) + x_grad = block.var(x.grad_name) + + self.assertEqual(out_grad.shape, ()) + self.assertEqual(x_grad.shape, ()) + + for api in binary_int_api_list: + main_prog = paddle.static.Program() + with paddle.static.program_guard( + main_prog, paddle.static.Program() + ): + # 1) x is 0D, y is 0D + x = paddle.randint(-10, 10, []) + y = paddle.randint(-10, 10, []) + out = api(x, y) + self.assertEqual(out.shape, ()) + + # 2) x is ND , y is 0D + x = paddle.randint(-10, 10, [3, 5]) + y = paddle.randint(-10, 10, []) + out = api(x, y) + self.assertEqual(out.shape, (3, 5)) + + # 3) x is 0D , y is ND + x = paddle.randint(-10, 10, []) + y = paddle.randint(-10, 10, [3, 5]) + out = api(x, y) + self.assertEqual(out.shape, (3, 5)) + + paddle.disable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_zero_dim_complex_api.py b/test/legacy_test/test_zero_dim_complex_api.py new file mode 100644 index 0000000000000..8bf977f0bbf8e --- /dev/null +++ b/test/legacy_test/test_zero_dim_complex_api.py @@ -0,0 +1,173 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
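+
+# This file exercises paddle.real/imag/angle/conj on 0D complex inputs, as
+# well as paddle.as_real (0D complex -> real tensor of shape [2]) and
+# paddle.as_complex (shape [2] real -> 0D complex), in dygraph and static
+# graph modes.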
+ +# Note: +# 0D Tensor indicates that the tensor's dimension is 0 +# 0D Tensor's shape is always [], numel is 1 +# which can be created by paddle.rand([]) + +import unittest + +import paddle + +unary_apis_with_complex_input = [ + paddle.real, + paddle.imag, + paddle.angle, + paddle.conj, +] + + +class TestUnaryElementwiseAPIWithComplexInput(unittest.TestCase): + def test_dygraph_unary(self): + paddle.disable_static() + for api in unary_apis_with_complex_input: + x = paddle.rand([]) + 1j * paddle.rand([]) + x.stop_gradient = False + x.retain_grads() + out = api(x) + out.retain_grads() + out.backward() + + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + if x.grad is not None: + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.grad.shape, []) + + paddle.enable_static() + + def test_static_unary(self): + paddle.enable_static() + for api in unary_apis_with_complex_input: + main_prog = paddle.static.Program() + block = main_prog.global_block() + exe = paddle.static.Executor() + with paddle.static.program_guard( + main_prog, paddle.static.Program() + ): + x = paddle.complex(paddle.rand([]), paddle.rand([])) + x.stop_gradient = False + out = api(x) + paddle.static.append_backward(out) + + fetch_list = [x, out] + if block.has_var(x.grad_name): + fetch_list.extend([x.grad_name, out.grad_name]) + + # 1) Test Program + res = exe.run(main_prog, fetch_list=fetch_list) + for item in res: + self.assertEqual(item.shape, ()) + + # 2) Test CompiledProgram Program + compile_prog = paddle.static.CompiledProgram(main_prog) + res = exe.run(compile_prog, fetch_list=fetch_list) + for item in res: + self.assertEqual(item.shape, ()) + + paddle.disable_static() + + +class TestAsReal(unittest.TestCase): + def test_dygraph(self): + paddle.disable_static() + x = paddle.rand([]) + 1j * paddle.rand([]) + x.stop_gradient = False + x.retain_grads() + out = paddle.as_real(x) + out.retain_grads() + out.backward() + + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, [2]) + if x.grad is not None: + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.grad.shape, [2]) + + paddle.enable_static() + + def test_static(self): + paddle.enable_static() + + main_prog = paddle.static.Program() + block = main_prog.global_block() + exe = paddle.static.Executor() + with paddle.static.program_guard(main_prog, paddle.static.Program()): + x = paddle.complex(paddle.rand([]), paddle.rand([])) + x.stop_gradient = False + out = paddle.as_real(x) + self.assertEqual(x.shape, ()) + self.assertEqual(out.shape, (2,)) + paddle.static.append_backward(out.sum()) + + fetch_list = [x, out] + if block.has_var(x.grad_name): + fetch_list.extend([x.grad_name, out.grad_name]) + + res = exe.run(main_prog, fetch_list=fetch_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2,)) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, (2,)) + + paddle.disable_static() + + +class TestAsComplex(unittest.TestCase): + def test_dygraph(self): + paddle.disable_static() + x = paddle.rand([2]) + x.stop_gradient = False + x.retain_grads() + out = paddle.as_complex(x) + out.retain_grads() + out.backward() + + self.assertEqual(x.shape, [2]) + self.assertEqual(out.shape, []) + if x.grad is not None: + self.assertEqual(x.grad.shape, [2]) + self.assertEqual(out.grad.shape, []) + + paddle.enable_static() + + def test_static(self): + paddle.enable_static() + main_prog = paddle.static.Program() + block = main_prog.global_block() + exe = paddle.static.Executor() + with 
paddle.static.program_guard(main_prog, paddle.static.Program()): + x = paddle.rand([2]) + x.stop_gradient = False + out = paddle.as_complex(x) + self.assertEqual(x.shape, (2,)) + self.assertEqual(out.shape, ()) + paddle.static.append_backward(out.sum()) + + fetch_list = [x, out] + if block.has_var(x.grad_name): + fetch_list.extend([x.grad_name, out.grad_name]) + + res = exe.run(main_prog, fetch_list=fetch_list) + self.assertEqual(res[0].shape, (2,)) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2,)) + self.assertEqual(res[3].shape, ()) + + paddle.disable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_zero_dim_distribution_loss_api.py b/test/legacy_test/test_zero_dim_distribution_loss_api.py new file mode 100644 index 0000000000000..128846e38bb7e --- /dev/null +++ b/test/legacy_test/test_zero_dim_distribution_loss_api.py @@ -0,0 +1,375 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: +# 0D Tensor indicates that the tensor's dimension is 0 +# 0D Tensor's shape is always [], numel is 1 +# which can be created by paddle.rand([]) + +import unittest + +import numpy as np +from decorator_helper import prog_scope + +import paddle +import paddle.nn.functional as F + + +class TestDistribution(unittest.TestCase): + def setUp(self): + self.x = paddle.full([], 2.0) + + def test_Bernoulli(self): + d = paddle.distribution.Bernoulli(probs=0.3) + self.assertEqual(d.mean.shape, []) + self.assertEqual(d.variance.shape, []) + self.assertEqual(d.entropy().shape, []) + self.assertEqual(d.sample([]).shape, []) + self.assertEqual(d.rsample([]).shape, []) + self.assertEqual(d.cdf(self.x).shape, []) + self.assertEqual(d.prob(self.x).shape, []) + self.assertEqual(d.log_prob(self.x).shape, []) + + d_other = paddle.distribution.Bernoulli(probs=0.7) + self.assertEqual(d.kl_divergence(d_other).shape, []) + + def test_Geometric(self): + d = paddle.distribution.Geometric(0.5) + self.assertEqual(d.mean.shape, []) + self.assertEqual(d.variance.shape, []) + self.assertEqual(d.entropy().shape, []) + self.assertEqual(d.stddev.shape, []) + self.assertEqual(d.pmf(self.x).shape, []) + self.assertEqual(d.log_pmf(self.x).shape, []) + self.assertEqual(d.sample([]).shape, []) + self.assertEqual(d.rsample([]).shape, []) + self.assertEqual(d.cdf(self.x).shape, []) + + d_other = paddle.distribution.Geometric(probs=0.7) + self.assertEqual(d.kl_divergence(d_other).shape, []) + + def test_Cauchy(self): + d = paddle.distribution.Cauchy(loc=0.1, scale=1.2) + self.assertEqual(d.sample([]).shape, []) + self.assertEqual(d.rsample([]).shape, []) + self.assertEqual(d.prob(self.x).shape, []) + self.assertEqual(d.log_prob(self.x).shape, []) + self.assertEqual(d.cdf(self.x).shape, []) + self.assertEqual(d.entropy().shape, []) + + d_other = paddle.distribution.Cauchy( + loc=paddle.to_tensor(1.2), scale=paddle.to_tensor(2.3) + ) + self.assertEqual(d.kl_divergence(d_other).shape, []) + + def 
test_Categorical(self): + logits = paddle.rand([6]) + d = paddle.distribution.Categorical(logits) + self.assertEqual(d.sample([]).shape, []) + self.assertEqual(d.probs(paddle.full([], 2, dtype='int64')).shape, []) + self.assertEqual( + d.log_prob(paddle.full([], 2, dtype='int64')).shape, [] + ) + self.assertEqual(d.entropy().shape, []) + + def test_Normal(self): + normal = paddle.distribution.Normal(0.0, 3.0) + self.assertEqual(normal.sample([]).shape, []) + self.assertEqual(normal.rsample([]).shape, []) + self.assertEqual(normal.mean.shape, []) + self.assertEqual(normal.variance.shape, []) + self.assertEqual(normal.probs(self.x).shape, []) + self.assertEqual(normal.log_prob(self.x).shape, []) + self.assertEqual(normal.entropy().shape, []) + + normal = paddle.distribution.Normal( + paddle.full([], 0.0), paddle.full([], 3.0) + ) + self.assertEqual(normal.sample([]).shape, []) + self.assertEqual(normal.rsample([]).shape, []) + self.assertEqual(normal.mean.shape, []) + self.assertEqual(normal.variance.shape, []) + self.assertEqual(normal.probs(self.x).shape, []) + self.assertEqual(normal.log_prob(self.x).shape, []) + self.assertEqual(normal.entropy().shape, []) + + def test_Uniform(self): + uniform = paddle.distribution.Uniform(0.0, 1.0) + self.assertEqual(uniform.sample([]).shape, []) + self.assertEqual(uniform.probs(self.x).shape, []) + self.assertEqual(uniform.log_prob(self.x).shape, []) + self.assertEqual(uniform.entropy().shape, []) + + uniform = paddle.distribution.Uniform( + paddle.full([], 0.0), paddle.full([], 1.0) + ) + self.assertEqual(uniform.sample([]).shape, []) + self.assertEqual(uniform.probs(self.x).shape, []) + self.assertEqual(uniform.log_prob(self.x).shape, []) + self.assertEqual(uniform.entropy().shape, []) + + def test_Beta(self): + beta = paddle.distribution.Beta(alpha=0.5, beta=0.5) + self.assertEqual(beta.sample([]).shape, []) + self.assertEqual(beta.mean.shape, []) + self.assertEqual(beta.variance.shape, []) + self.assertEqual(beta.prob(self.x).shape, []) + self.assertEqual(beta.log_prob(self.x).shape, []) + self.assertEqual(beta.entropy().shape, []) + + def test_kl_divergence(self): + p = paddle.distribution.Beta(alpha=0.5, beta=0.5) + q = paddle.distribution.Beta(alpha=0.2, beta=1.0) + kl = paddle.distribution.kl_divergence(p, q) + self.assertEqual(kl.shape, []) + + def test_TransformedDistribution(self): + d = paddle.distribution.TransformedDistribution( + paddle.distribution.Normal(0.0, 1.0), + [ + paddle.distribution.AffineTransform( + paddle.full([], 1.0), paddle.full([], 2.0) + ) + ], + ) + self.assertEqual(d.sample([]).shape, []) + self.assertEqual(d.rsample([]).shape, []) + self.assertEqual(d.prob(self.x).shape, []) + self.assertEqual(d.log_prob(self.x).shape, []) + + def test_Laplace(self): + d = paddle.distribution.Laplace(0.0, 1.0) + self.assertEqual(d.sample([]).shape, []) + self.assertEqual(d.rsample([]).shape, []) + self.assertEqual(d.mean.shape, []) + self.assertEqual(d.stddev.shape, []) + self.assertEqual(d.variance.shape, []) + self.assertEqual(d.prob(self.x).shape, []) + self.assertEqual(d.log_prob(self.x).shape, []) + self.assertEqual(d.cdf(self.x).shape, []) + self.assertEqual(d.icdf(self.x).shape, []) + self.assertEqual(d.entropy().shape, []) + + def test_LogNormal(self): + d = paddle.distribution.LogNormal(0.0, 1.0) + self.assertEqual(d.sample([]).shape, []) + self.assertEqual(d.mean.shape, []) + self.assertEqual(d.variance.shape, []) + self.assertEqual(d.entropy().shape, []) + self.assertEqual(d.probs(self.x).shape, []) + + def 
test_Gumbel(self): + d = paddle.distribution.Gumbel(0.0, 1.0) + self.assertEqual(d.sample([]).shape, []) + self.assertEqual(d.rsample([]).shape, []) + self.assertEqual(d.mean.shape, []) + self.assertEqual(d.variance.shape, []) + self.assertEqual(d.stddev.shape, []) + self.assertEqual(d.prob(self.x).shape, []) + self.assertEqual(d.log_prob(self.x).shape, []) + self.assertEqual(d.cdf(self.x).shape, []) + self.assertEqual(d.entropy().shape, []) + + def test_Multinomial(self): + d = paddle.distribution.Multinomial( + 10, paddle.to_tensor([0.2, 0.3, 0.5]) + ) + self.assertEqual(d.prob(self.x).shape, []) + self.assertEqual(d.log_prob(self.x).shape, []) + self.assertEqual(d.entropy().shape, []) + + +class TestLossAPI(unittest.TestCase): + def test_sigmoid_focal_loss(self): + logit = paddle.to_tensor( + [[0.97, 0.91, 0.03], [0.55, 0.43, 0.71]], + dtype='float32', + stop_gradient=False, + ) + logit.retain_grads() + label = paddle.to_tensor( + [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype='float32' + ) + fg_num_0 = paddle.full([], 2.0) + fg_num_1 = paddle.full([1], 2.0) + + out0 = F.sigmoid_focal_loss( + logit, label, normalizer=fg_num_0, reduction='sum' + ) + out1 = F.sigmoid_focal_loss( + logit, label, normalizer=fg_num_1, reduction='sum' + ) + out0.retain_grads() + + np.testing.assert_array_equal( + out0.numpy(), + out1.numpy(), + ) + + out0.backward() + self.assertEqual(out0.shape, []) + self.assertEqual(out1.shape, []) + self.assertEqual(out0.grad.shape, []) + self.assertEqual(logit.grad.shape, [2, 3]) + + def test_cross_entropy(self): + input = paddle.rand([3, 5]) + input.stop_gradient = False + label = paddle.randint(0, 5, shape=[3]) + + loss = paddle.nn.functional.cross_entropy(input, label, reduction='sum') + loss.backward() + + self.assertEqual(loss.shape, []) + self.assertEqual(input.grad.shape, [3, 5]) + + def test_l1_loss(self): + input = paddle.rand([3, 5]) + input.stop_gradient = False + label = paddle.rand([3, 5]) + + loss = paddle.nn.functional.l1_loss(input, label, reduction='mean') + loss.backward() + + self.assertEqual(loss.shape, []) + self.assertEqual(input.grad.shape, [3, 5]) + + def test_nll_loss(self): + input = paddle.rand([5, 3]) + input.stop_gradient = False + log_softmax = paddle.nn.LogSoftmax(axis=1) + log_out = log_softmax(input) + label = paddle.randint(0, 3, [5], "int64") + + loss = paddle.nn.functional.nll_loss(log_out, label) + loss.backward() + + self.assertEqual(loss.shape, []) + self.assertEqual(input.grad.shape, [5, 3]) + + input = paddle.rand([5, 3, 2, 4]) + input.stop_gradient = False + log_softmax = paddle.nn.LogSoftmax(axis=1) + log_out = log_softmax(input) + label = paddle.randint(0, 3, [5, 2, 4], "int64") + + loss = paddle.nn.functional.nll_loss(log_out, label) + loss.backward() + + self.assertEqual(loss.shape, []) + self.assertEqual(input.grad.shape, [5, 3, 2, 4]) + + +class TestLossAPIStatic(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.exe = paddle.static.Executor() + + @prog_scope() + def test_sigmoid_focal_loss(self): + logit = paddle.rand([2, 3]) + logit.stop_gradient = False + + label = paddle.randint(0, 1, [2, 3]).astype('float32') + label.stop_gradient = False + + fg_num_0 = paddle.full([], 2.0) + fg_num_1 = paddle.full([1], 2.0) + + out0 = F.sigmoid_focal_loss( + logit, label, normalizer=fg_num_0, reduction='mean' + ) + out1 = F.sigmoid_focal_loss( + logit, label, normalizer=fg_num_1, reduction='mean' + ) + paddle.static.append_backward(out0.sum()) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, 
fetch_list=[out0, out1, out0.grad_name, logit.grad_name] + ) + np.testing.assert_allclose(res[0], res[1]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, (2, 3)) + + @prog_scope() + def test_cross_entropy(self): + input = paddle.rand([3, 5]) + input.stop_gradient = False + label = paddle.randint(0, 5, shape=[3]) + label.stop_gradient = False + + loss = paddle.nn.functional.cross_entropy( + input, label, reduction='mean' + ) + paddle.static.append_backward(loss) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[loss, input.grad_name]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (3, 5)) + + @prog_scope() + def test_l1_loss(self): + input = paddle.rand([3, 5]) + input.stop_gradient = False + label = paddle.rand([3, 5]) + + loss = paddle.nn.functional.l1_loss(input, label, reduction='sum') + paddle.static.append_backward(loss) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[loss, input.grad_name]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (3, 5)) + + @prog_scope() + def test_nll_loss(self): + input = paddle.rand([5, 3]) + input.stop_gradient = False + log_softmax = paddle.nn.LogSoftmax(axis=1) + log_out = log_softmax(input) + + label = paddle.randint(0, 3, shape=[5]) + label.stop_gradient = False + + loss = paddle.nn.functional.nll_loss(log_out, label) + paddle.static.append_backward(loss) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[loss, input.grad_name]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (5, 3)) + + input = paddle.rand([5, 3, 2, 4]) + input.stop_gradient = False + log_softmax = paddle.nn.LogSoftmax(axis=1) + log_out = log_softmax(input) + + label = paddle.randint(0, 3, shape=[5, 2, 4]) + label.stop_gradient = False + + loss = paddle.nn.functional.nll_loss(log_out, label) + paddle.static.append_backward(loss) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[loss, input.grad_name]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (5, 3, 2, 4)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_zero_dim_no_backward_api.py b/test/legacy_test/test_zero_dim_no_backward_api.py new file mode 100644 index 0000000000000..1269ad4500920 --- /dev/null +++ b/test/legacy_test/test_zero_dim_no_backward_api.py @@ -0,0 +1,578 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: +# 0D Tensor indicates that the tensor's dimension is 0 +# 0D Tensor's shape is always [], numel is 1 +# which can be created by paddle.rand([]) + +import unittest + +import numpy as np + +import paddle +from paddle.pir_utils import test_with_pir_api + + +# Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. 
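+# (These are mostly creation and sampling ops such as paddle.rand,
+# paddle.randint and paddle.linspace, so only the shapes and values of the
+# outputs are checked.)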
+class TestNoBackwardAPI(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.shape = [ + paddle.full([], 2, 'int32'), + paddle.full([], 3, 'int32'), + paddle.full([], 4, 'int32'), + ] + + def test_slice(self): + starts = [paddle.full([], 1, 'int32'), paddle.full([], 1, 'int32')] + ends = [paddle.full([], 3, 'int32'), paddle.full([], 3, 'int32')] + x = paddle.rand([5, 3, 3]) + out = paddle.slice(x, [1, 2], starts, ends) + self.assertEqual(out.shape, [5, 2, 2]) + + def test_strided_slice(self): + starts = [paddle.full([], 0, 'int32'), paddle.full([], 0, 'int32')] + ends = [paddle.full([], 4, 'int32'), paddle.full([], 4, 'int32')] + strides = [paddle.full([], 2, 'int32'), paddle.full([], 2, 'int32')] + x = paddle.rand([5, 5, 5]) + out = paddle.strided_slice(x, [1, 2], starts, ends, strides) + self.assertEqual(out.shape, [5, 2, 2]) + + def test_linspace(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 5.0) + num = paddle.full([], 5, 'int32') + out = paddle.linspace(start, stop, num) + np.testing.assert_array_equal(out.numpy(), [1.0, 2.0, 3.0, 4.0, 5.0]) + + def test_logspace(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 3.0) + num = paddle.full([], 5, 'int32') + base = paddle.full([], 2.0) + out = paddle.logspace(start, stop, num, base) + self.assertEqual(out.shape, [5]) + + def test_arange(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 6.0) + step = paddle.full([], 1.0) + out = paddle.arange(start, stop, step) + np.testing.assert_array_equal(out.numpy(), [1.0, 2.0, 3.0, 4.0, 5.0]) + + def test_normal(self): + mean = paddle.full([], 0.0) + std = paddle.full([], 0.0) + out = paddle.normal(mean, std) + self.assertEqual(out.shape, []) + + out = paddle.normal(0.0, 1.0, []) + self.assertEqual(out.shape, []) + + out = paddle.normal(0.0, 1.0, self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_rand(self): + out = paddle.rand([]) + self.assertEqual(out.shape, []) + + out = paddle.rand(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_randn(self): + out = paddle.randn([]) + self.assertEqual(out.shape, []) + + out = paddle.randn(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_randint_and_randint_like(self): + out = paddle.randint(-10, 10, []) + self.assertEqual(out.shape, []) + + out = paddle.randint_like(out, -10, 10) + self.assertEqual(out.shape, []) + + out = paddle.randint(-10, 10, self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_standard_normal(self): + out = paddle.standard_normal([]) + self.assertEqual(out.shape, []) + + out = paddle.standard_normal(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_uniform(self): + out = paddle.uniform([]) + self.assertEqual(out.shape, []) + + out = paddle.uniform(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_empty_and_empty_like(self): + out = paddle.empty([]) + self.assertEqual(out.shape, []) + + out = paddle.empty_like(out) + self.assertEqual(out.shape, []) + + out = paddle.empty(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_full_and_full_like(self): + out = paddle.full([], 0.5) + self.assertEqual(out.shape, []) + + out = paddle.full_like(out, 0.5) + self.assertEqual(out.shape, []) + + out = paddle.full(self.shape, 0.5) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_ones_and_ones_like(self): + out = paddle.ones([]) + self.assertEqual(out.shape, []) + + out = paddle.ones_like(out) + self.assertEqual(out.shape, []) + + out = paddle.ones(self.shape) + 
self.assertEqual(out.shape, [2, 3, 4]) + + def test_zeros_and_zeros_like(self): + out = paddle.zeros([]) + self.assertEqual(out.shape, []) + + out = paddle.zeros_like(out) + self.assertEqual(out.shape, []) + + out = paddle.zeros(self.shape) + self.assertEqual(out.shape, [2, 3, 4]) + + def test_embedding(self): + ids = paddle.full(shape=[], fill_value=1, dtype='int64') + w0 = paddle.arange(3, 9).reshape((3, 2)).astype(paddle.float32) + w = paddle.to_tensor(w0, stop_gradient=False) + emb = paddle.nn.functional.embedding( + x=ids, weight=w, sparse=True, name="embedding" + ) + self.assertEqual(emb.shape, [2]) + res = [5.0, 6.0] + for i in range(len(res)): + self.assertEqual(emb.numpy()[i], res[i]) + + def test_one_hot_label(self): + label = paddle.full(shape=[], fill_value=2, dtype='int64') + one_hot_label = paddle.nn.functional.one_hot(label, num_classes=4) + self.assertEqual(one_hot_label.shape, [4]) + self.assertEqual(one_hot_label.numpy()[2], 1) + + def test_unique_consecutive(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + for place in places: + paddle.set_device(place) + x = paddle.rand([]) + y, inverse, counts = paddle.unique_consecutive( + x, + return_inverse=True, + return_counts=True, + ) + + self.assertEqual(y, x) + self.assertEqual(inverse, 0) + self.assertEqual(counts, 1) + self.assertEqual(y.shape, [1]) + self.assertEqual(inverse.shape, [1]) + self.assertEqual(counts.shape, [1]) + + def test_unique(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + for place in places: + paddle.set_device(place) + x = paddle.rand([]) + y, index, inverse, counts = paddle.unique( + x, + return_index=True, + return_inverse=True, + return_counts=True, + ) + + self.assertEqual(y, x) + self.assertEqual(index, 0) + self.assertEqual(inverse, 0) + self.assertEqual(counts, 1) + self.assertEqual(y.shape, [1]) + self.assertEqual(index.shape, [1]) + self.assertEqual(inverse.shape, [1]) + self.assertEqual(counts.shape, [1]) + + def test_matrix_rank(self): + x = paddle.eye(10) + x.stop_gradient = False + out = paddle.linalg.matrix_rank(x) + + self.assertEqual(out.shape, []) + np.testing.assert_equal(out, np.array(10)) + + c = paddle.ones(shape=[3, 4, 5]) + c.stop_gradient = False + out_c = paddle.linalg.matrix_rank(c) + self.assertEqual(out_c.shape, [3]) + np.testing.assert_equal(out_c, np.array([1, 1, 1])) + + # 2D, tol->float : OUTPUT 0D + x_tol = paddle.eye(10) + x_tol.stop_gradient = False + out_tol = paddle.linalg.matrix_rank(x_tol, tol=0.1) + self.assertEqual(out_tol.shape, []) + + # 3D, tol->float : OUTPUT 1D + c_tol = paddle.ones(shape=[3, 4, 5]) + c_tol.stop_gradient = False + out_c_tol = paddle.linalg.matrix_rank(c_tol, tol=0.1) + self.assertEqual(out_c_tol.shape, [3]) + + tol_2 = paddle.randn([2]) + # 2D, tol->Tensor[1,2] : OUTPUT 1D + d = paddle.eye(10) + out_d = paddle.linalg.matrix_rank(d, tol=tol_2) + self.assertEqual(out_d.shape, [2]) + + +class TestNoBackwardAPIStatic(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.exe = paddle.static.Executor() + self.shape = [ + paddle.full([], 2, 'int32'), + paddle.full([], 3, 'int32'), + paddle.full([], 4, 'int32'), + ] + + def test_slice(self): + starts = [paddle.full([], 1, 'int32'), paddle.full([], 1, 'int32')] + ends = [paddle.full([], 3, 'int32'), paddle.full([], 3, 'int32')] + x = paddle.rand([5, 3, 3]) + out = paddle.slice(x, [1, 2], starts, ends) + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out] + )[0] + 
self.assertEqual(res.shape, (5, 2, 2)) + + @test_with_pir_api + def test_strided_slice(self): + starts = [paddle.full([], 0, 'int32'), paddle.full([], 0, 'int32')] + ends = [paddle.full([], 4, 'int32'), paddle.full([], 4, 'int32')] + strides = [paddle.full([], 2, 'int32'), paddle.full([], 2, 'int32')] + x = paddle.rand([5, 5, 5]) + out = paddle.strided_slice(x, [1, 2], starts, ends, strides) + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out] + )[0] + self.assertEqual(res.shape, (5, 2, 2)) + + def test_linspace(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 5.0) + num = paddle.full([], 5, 'int32') + out = paddle.linspace(start, stop, num) + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out] + )[0] + np.testing.assert_array_equal(res, [1.0, 2.0, 3.0, 4.0, 5.0]) + + @test_with_pir_api + def test_arange(self): + start = paddle.full([], 1.0) + stop = paddle.full([], 6.0) + step = paddle.full([], 1.0) + out = paddle.arange(start, stop, step) + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out] + )[0] + np.testing.assert_array_equal(res, [1.0, 2.0, 3.0, 4.0, 5.0]) + + def test_normal(self): + mean = paddle.full([], 0.0) + std = paddle.full([], 0.0) + out1 = paddle.normal(mean, std) + out2 = paddle.normal(0.0, 1.0, []) + out3 = paddle.normal(0.0, 1.0, self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2, out3] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + + def test_rand(self): + out1 = paddle.rand([]) + out2 = paddle.rand(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2, 3, 4)) + + def test_randn(self): + out1 = paddle.randn([]) + out2 = paddle.randn(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2, 3, 4)) + + @test_with_pir_api + def test_randint(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + out1 = paddle.randint(-10, 10, []) + + shape = [ + paddle.full([], 2, 'int32'), + paddle.full([], 3, 'int32'), + paddle.full([], 4, 'int32'), + ] + out2 = paddle.randint(-10, 10, shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2] + ) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2, 3, 4)) + + @test_with_pir_api + def test_randint_like(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + out1 = paddle.rand([]) + out2 = paddle.randint_like(out1, -10, 10) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2] + ) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + + def test_standard_normal(self): + out1 = paddle.standard_normal([]) + out2 = paddle.standard_normal(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2, 3, 4)) + + def test_uniform(self): + out1 = paddle.uniform([]) + out2 = paddle.uniform(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2, 3, 4)) + + def 
test_empty_and_empty_like(self): + out1 = paddle.empty([]) + out2 = paddle.empty_like(out1) + out3 = paddle.empty(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2, out3] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + + def test_full_and_full_like(self): + out1 = paddle.full([], 0.5) + out2 = paddle.full_like(out1, 0.5) + out3 = paddle.full(self.shape, 0.5) + out4 = paddle.full(self.shape, paddle.full([], 0.5)) + + res = self.exe.run( + paddle.static.default_main_program(), + fetch_list=[out1, out2, out3, out4], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + self.assertEqual(res[3].shape, (2, 3, 4)) + + def test_ones_and_ones_like(self): + out1 = paddle.ones([]) + out2 = paddle.ones_like(out1) + out3 = paddle.ones(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2, out3] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + + def test_zeros_and_zeros_like(self): + out1 = paddle.zeros([]) + out2 = paddle.zeros_like(out1) + out3 = paddle.zeros(self.shape) + + res = self.exe.run( + paddle.static.default_main_program(), fetch_list=[out1, out2, out3] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2, 3, 4)) + + def test_embedding(self): + ids = paddle.full(shape=[], fill_value=1, dtype='int64') + w0 = paddle.arange(3, 9).reshape((3, 2)).astype(paddle.float32) + w = paddle.to_tensor(w0, stop_gradient=False) + emb = paddle.nn.functional.embedding( + x=ids, weight=w, sparse=True, name="embedding" + ) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[emb]) + self.assertEqual(res[0].shape, (2,)) + result = [5.0, 6.0] + for i in range(len(res)): + self.assertEqual(res[0][i], result[i]) + + def test_static_embedding(self): + ids = paddle.full(shape=[], fill_value=1, dtype='int64') + emb = paddle.static.nn.embedding(ids, (20, 3)) + prog = paddle.static.default_main_program() + self.exe.run(paddle.static.default_startup_program()) + res = self.exe.run(prog, fetch_list=[emb]) + self.assertEqual(res[0].shape, (3,)) + + @test_with_pir_api + def test_one_hot_label(self): + label = paddle.full(shape=[], fill_value=2, dtype='int64') + one_hot_label = paddle.nn.functional.one_hot(label, num_classes=4) + prog = paddle.static.default_main_program() + self.exe.run(paddle.static.default_startup_program()) + res = self.exe.run(prog, fetch_list=[one_hot_label]) + + self.assertEqual(res[0].shape, (4,)) + self.assertEqual(res[0][2], 1) + + def test_unique_consecutive(self): + x = paddle.rand([]) + y, inverse, counts = paddle.unique_consecutive( + x, return_inverse=True, return_counts=True + ) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[y, inverse, counts]) + self.assertEqual(y, x) + self.assertEqual(inverse, 0) + self.assertEqual(counts, 1) + self.assertEqual(res[0].shape, (1,)) + self.assertEqual(res[1].shape, (1,)) + self.assertEqual(res[2].shape, (1,)) + + def test_unique(self): + x = paddle.rand([]) + y, index, inverse, counts = paddle.unique( + x, return_index=True, return_inverse=True, return_counts=True + ) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[y, index, inverse, counts]) + self.assertEqual(y, x) + 
self.assertEqual(index, 0) + self.assertEqual(inverse, 0) + self.assertEqual(counts, 1) + self.assertEqual(res[0].shape, (1,)) + self.assertEqual(res[1].shape, (1,)) + self.assertEqual(res[2].shape, (1,)) + self.assertEqual(res[3].shape, (1,)) + + @test_with_pir_api + def test_static_matrix_rank(self): + # 2D : OUTPUT 0D + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.eye(10) + x.stop_gradient = False + out = paddle.linalg.matrix_rank(x) + exe = paddle.static.Executor() + res = exe.run(fetch_list=[out]) + self.assertEqual(res[0].shape, ()) + + # 3D : OUTPUT 1D + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + c = paddle.ones(shape=[3, 4, 5]) + c.stop_gradient = False + out_c = paddle.linalg.matrix_rank(c) + exe = paddle.static.Executor() + res = exe.run(fetch_list=[out_c]) + self.assertEqual(res[0].shape, (3,)) + + # 2D, tol->float : OUTPUT 0D + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x_tol = paddle.eye(10) + x_tol.stop_gradient = False + out_tol = paddle.linalg.matrix_rank(x_tol, tol=0.1) + exe = paddle.static.Executor() + res = exe.run(fetch_list=[out_tol]) + self.assertEqual(res[0].shape, ()) + + # 3D, tol->float : OUTPUT 1D + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + c_tol = paddle.ones(shape=[3, 4, 5]) + c_tol.stop_gradient = False + out_c_tol = paddle.linalg.matrix_rank(c_tol, tol=0.1) + exe = paddle.static.Executor() + res = exe.run(fetch_list=[out_c_tol]) + self.assertEqual(res[0].shape, (3,)) + + # 2D, tol->Tensor[1,2] : OUTPUT 1D + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + tol_2 = paddle.randn([2]) + d = paddle.eye(10) + out_d = paddle.linalg.matrix_rank(d, tol=tol_2) + exe = paddle.static.Executor() + res = exe.run(fetch_list=[out_d]) + self.assertEqual(res[0].shape, (2,)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_zero_dim_reduce_api.py b/test/legacy_test/test_zero_dim_reduce_api.py new file mode 100644 index 0000000000000..1f663dcc704b5 --- /dev/null +++ b/test/legacy_test/test_zero_dim_reduce_api.py @@ -0,0 +1,266 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
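+
+# This file exercises the reduce APIs (sum, mean, min, max, prod, all, any,
+# ...) on a 0D input and on 1D/ND inputs reduced to a 0D output, checking
+# output shapes and gradient shapes in dygraph and static graph modes.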
+ +# Note: +# 0D Tensor indicates that the tensor's dimension is 0 +# 0D Tensor's shape is always [], numel is 1 +# which can be created by paddle.rand([]) + +import unittest + +import numpy as np + +import paddle + +reduce_api_list = [ + paddle.sum, + paddle.mean, + paddle.nansum, + paddle.nanmean, + paddle.median, + paddle.nanmedian, + paddle.min, + paddle.max, + paddle.amin, + paddle.amax, + paddle.prod, + paddle.logsumexp, + paddle.all, + paddle.any, + paddle.count_nonzero, +] + + +# Use to test zero-dim of reduce API +class TestReduceAPI(unittest.TestCase): + def assertShapeEqual(self, out, target_tuple): + if not paddle.framework.in_pir_mode(): + out_shape = list(out.shape) + else: + out_shape = out.shape + self.assertEqual(out_shape, target_tuple) + + def test_dygraph_reduce(self): + paddle.disable_static() + for api in reduce_api_list: + # 1) x is 0D + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, []).astype('bool') + else: + x = paddle.rand([]) + x.stop_gradient = False + out = api(x, axis=None) + + out.retain_grads() + out.backward() + + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + if api not in [paddle.count_nonzero]: + np.testing.assert_allclose(out.numpy(), x.numpy()) + + if api not in [paddle.median, paddle.nanmedian]: + out_empty_list = api(x, axis=[]) + self.assertEqual(out_empty_list, out) + self.assertEqual(out_empty_list.shape, []) + + if x.grad is not None: + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.grad.shape, []) + np.testing.assert_allclose(x.grad.numpy(), np.array(1.0)) + np.testing.assert_allclose(out.grad.numpy(), np.array(1.0)) + + out1 = api(x, axis=0) + self.assertEqual(out1.shape, []) + self.assertEqual(out1, out) + out1.backward() + + out2 = api(x, axis=-1) + self.assertEqual(out2.shape, []) + self.assertEqual(out2, out) + out2.backward() + + if x.grad is not None: + self.assertEqual(x.grad.shape, []) + np.testing.assert_allclose(x.grad.numpy(), np.array(3.0)) + + # 2) x is 1D, axis=0, reduce to 0D + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, [5]).astype('bool') + else: + x = paddle.rand([5]) + x.stop_gradient = False + out = api(x, axis=0) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + if x.grad is not None: + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, [5]) + + # 3) x is ND, reduce to 0D + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, [3, 5]).astype('bool') + else: + x = paddle.rand([3, 5]) + x.stop_gradient = False + out = api(x, axis=None) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + if x.grad is not None: + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, [3, 5]) + + # 4) x is ND, reduce to 0D, keepdim=True + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, [3, 5]).astype('bool') + else: + x = paddle.rand([3, 5]) + x.stop_gradient = False + out = api(x, keepdim=True) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, [1, 1]) + if x.grad is not None: + self.assertEqual(out.grad.shape, [1, 1]) + self.assertEqual(x.grad.shape, [3, 5]) + + paddle.enable_static() + + # TODO(SigureMo): Temporarily disable this test case in due to hanging in mac CI. 
+ # @test_with_pir_api + def test_static_reduce(self): + paddle.enable_static() + for api in reduce_api_list: + main_prog = paddle.static.Program() + block = main_prog.global_block() + exe = paddle.static.Executor() + with paddle.static.program_guard( + main_prog, paddle.static.Program() + ): + # 1) x is 0D + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, []).astype('bool') + else: + x = paddle.rand([]) + x.stop_gradient = False + out = api(x, axis=None) + grad_list = paddle.static.append_backward( + out, parameter_list=[x, out] + ) + + if api not in [paddle.median, paddle.nanmedian]: + out_empty_list = api(x, axis=[]) + self.assertShapeEqual(out_empty_list, []) + + out1 = api(x, axis=0) + self.assertShapeEqual(out1, []) + + out2 = api(x, axis=-1) + self.assertShapeEqual(out2, []) + + fetch_list = [x, out] + + fetch_list.extend( + [ + _grad + for _param, _grad in grad_list + if isinstance( + _grad, + (paddle.pir.Value, paddle.base.framework.Variable), + ) + ] + ) + res = exe.run(main_prog, fetch_list=fetch_list) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + if api not in [paddle.count_nonzero]: + np.testing.assert_allclose(res[0], res[1]) + + if len(res) > 2: + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + np.testing.assert_allclose(res[2], np.array(1.0)) + np.testing.assert_allclose(res[3], np.array(1.0)) + + # 2) x is ND, reduce to 0D + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, [3, 5]).astype('bool') + else: + x = paddle.rand([3, 5]) + x.stop_gradient = False + out = api(x, axis=None) + grad_list = paddle.static.append_backward( + out, parameter_list=[out, x] + ) + + fetch_list = [out] + fetch_list.extend( + [ + _grad + for _param, _grad in grad_list + if isinstance( + _grad, + (paddle.pir.Value, paddle.base.framework.Variable), + ) + ] + ) + + res = exe.run(main_prog, fetch_list=fetch_list) + self.assertEqual(res[0].shape, ()) + if len(res) > 1: + self.assertEqual(res[1].shape, ()) + if len(res) > 2: + self.assertEqual(res[2].shape, (3, 5)) + + # 3) x is 1D, axis=0, reduce to 0D + if api in [paddle.all, paddle.any]: + x = paddle.randint(0, 2, [5]).astype('bool') + else: + x = paddle.rand([5]) + x.stop_gradient = False + out = api(x, axis=0) + grad_list = paddle.static.append_backward( + out, parameter_list=[out, x] + ) + + fetch_list = [out] + fetch_list.extend( + [ + _grad + for _param, _grad in grad_list + if isinstance( + _grad, + (paddle.pir.Value, paddle.base.framework.Variable), + ) + ] + ) + + res = exe.run(main_prog, fetch_list=fetch_list) + self.assertEqual(res[0].shape, ()) + if len(res) > 1: + self.assertEqual(res[1].shape, ()) + if len(res) > 2: + self.assertEqual(res[2].shape, (5,)) + + paddle.disable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_zero_dim_sundry_dygraph_api.py b/test/legacy_test/test_zero_dim_sundry_dygraph_api.py new file mode 100644 index 0000000000000..00f32fe874413 --- /dev/null +++ b/test/legacy_test/test_zero_dim_sundry_dygraph_api.py @@ -0,0 +1,2356 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: +# 0D Tensor indicates that the tensor's dimension is 0 +# 0D Tensor's shape is always [], numel is 1 +# which can be created by paddle.rand([]) + +import os +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle import base, core +from paddle.framework import in_dynamic_mode + + +# Use to test zero-dim of Sundry API, which is unique and can not be classified +# with others. It can be implemented here flexibly. +class TestSundryAPI(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x = paddle.rand([]) + + def test_polygamma(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.polygamma(x, 2) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_frexp(self): + x = paddle.rand([]) + x.stop_gradient = False + out1, out2 = paddle.frexp(x) + out1.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_pairwise_distance(self): + x = paddle.rand([5]) + x.stop_gradient = False + y = paddle.rand([5]) + y.stop_gradient = False + + out = paddle.nn.functional.pairwise_distance(x, y) + out.backward() + self.assertEqual(out.shape, []) + self.assertEqual(x.grad.shape, [5]) + + def test_take(self): + x = paddle.rand([4, 5]) + x.stop_gradient = False + out = paddle.take(x, paddle.to_tensor(2)) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(x.grad.shape, [4, 5]) + np.testing.assert_allclose(x.grad[0, 2], 1.0) + + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.take(x, paddle.to_tensor(0)) + out.backward() + + self.assertEqual(out.shape, []) + np.testing.assert_allclose(out, x) + self.assertEqual(x.grad.shape, []) + np.testing.assert_allclose(x.grad.numpy(), 1.0) + + def test_trapezoid(self): + y = paddle.rand([5]) + y.stop_gradient = False + out = paddle.trapezoid(y, dx=2.0) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(y.grad.shape, [5]) + + def test_create_parameter_var(self): + zero_dim_param = paddle.create_parameter(shape=[], dtype='float32') + self.assertEqual(zero_dim_param.shape, []) + + zero_dim_var = paddle.tensor.creation.create_global_var( + shape=[], value=0.5, dtype='float32' + ) + self.assertEqual(zero_dim_var.shape, []) + self.assertEqual(zero_dim_var.item(), 0.5) + + def test_getitem(self): + # case1: When all axis have a scalar indice, output should be a 0-d Tensor; + x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) + x.stop_gradient = False + out = x[1, 2, 3, 4] + out.retain_grads() + out.backward() + self.assertEqual(out.shape, []) + np.testing.assert_allclose(out, np.array(119)) + self.assertEqual(out.grad.shape, []) + np.testing.assert_allclose(out.grad, 1.0) + self.assertEqual(x.grad.shape, [2, 3, 4, 5]) + x_grad_expected = np.zeros((2, 3, 4, 5)) + x_grad_expected[1, 2, 3, 4] = 1.0 + np.testing.assert_allclose(x.grad, x_grad_expected) + + # case2: When one axis has a 0-d Tensor indice, the output should be same as int indice. 
+ x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) + out1 = x[1, 2] + out2 = x[ + paddle.full([], 1, dtype='int32'), paddle.full([], 2, dtype='int32') + ] + np.testing.assert_allclose(out1, out2) + + # case3: When all axis have a scalar indice (i.e. case1) and has None indice, + # ndim of output should be same with numbers of None. + x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) + out1 = x[1, 2, None, 3, 4] + self.assertEqual(out1.shape, [1]) + np.testing.assert_allclose(out1, np.array([119])) + out2 = x[1, None, 2, None, 3, 4] + self.assertEqual(out2.shape, [1, 1]) + np.testing.assert_allclose(out2, np.array([[119]])) + + # case4: 1-D Tensor will be treated as vector, no axis decrease will happen. + x = paddle.ones((2, 3, 4)) + indice = paddle.ones([1], dtype='int32') + out1 = x[indice] + self.assertEqual(out1.shape, [1, 3, 4]) + np.testing.assert_allclose(out1, np.ones((1, 3, 4))) + out2 = x[indice, indice] + self.assertEqual(out2.shape, [1, 4]) + np.testing.assert_allclose(out2, np.ones((1, 4))) + + def test_setitem(self): + # case1: all axis have a scalar indice + x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) + x.stop_gradient = False + out = x * 2 + out[1, 2, 3, 4] = 10 + out.backward() + + self.assertEqual(out.shape, x.shape) + np.testing.assert_allclose(out[1, 2, 3, 4], np.array(10)) + self.assertEqual(x.grad.shape, [2, 3, 4, 5]) + x_grad_expected = np.ones((2, 3, 4, 5)) * 2 + x_grad_expected[1, 2, 3, 4] = 0 + np.testing.assert_allclose(x.grad, x_grad_expected) + + # case2: 0-D Tensor indice in some axis + # NOTE(zoooo0820): Now, int/slice with 0-D Tensor will still be + # treated as combined indexing, which is not support backward. + # There should have more test cases such as out[1, indice, :] = 0.5 when this + # problem is fixed. 
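+ # That is: a plain 0-D Tensor index such as out[indice, indice] below
+ # takes the regular set_value path and stays differentiable, while mixed
+ # forms (ints/slices combined with a 0-D Tensor index) do not yet.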
+ x = paddle.randn((2, 3, 4, 5)) + x.stop_gradient = False + indice = paddle.full([], 1, dtype='int32') + out = x * 1 + out[indice, indice] = 0.5 + out.backward() + + self.assertEqual(out.shape, x.shape) + np.testing.assert_allclose(out[1, 1], np.ones((4, 5)) * 0.5) + x_grad_expected = np.ones((2, 3, 4, 5)) + x_grad_expected[1, 1] = 0 + np.testing.assert_allclose(x.grad, x_grad_expected) + + # case3:0-D Tensor indice in some axis, value is a Tensor + # and there is broadcast + x = paddle.randn((2, 3, 4, 5)) + x.stop_gradient = False + v = paddle.ones((4, 5), dtype='float32') * 5 + v.stop_gradient = False + indice = paddle.full([], 1, dtype='int32') + out = x * 1 + out[indice] = v + out.backward() + + self.assertEqual(out.shape, x.shape) + np.testing.assert_allclose(out[1], np.ones((3, 4, 5)) * 5) + x_grad_expected = np.ones((2, 3, 4, 5)) + x_grad_expected[1] = 0 + np.testing.assert_allclose(x.grad, x_grad_expected) + value_grad_expected = np.ones((4, 5)) * 3 + np.testing.assert_allclose(v.grad, value_grad_expected) + + # case4: value is a 0-D tensor and there is broadcast + x = paddle.randn((2, 3, 4, 5)) + x.stop_gradient = False + v = paddle.ones([], dtype='float32') * 5 + v.stop_gradient = False + out = x * 1 + indice = paddle.full([], 0, dtype='int32') + out[indice] = v + out.backward() + + self.assertEqual(out.shape, x.shape) + self.assertEqual(v.grad.shape, []) + np.testing.assert_allclose(out[0], np.ones((3, 4, 5)) * 5) + x_grad_expected = np.ones((2, 3, 4, 5)) + x_grad_expected[0] = 0 + np.testing.assert_allclose(x.grad, x_grad_expected) + value_grad_expected = np.ones(()) * 3 * 4 * 5 + np.testing.assert_allclose(v.grad, value_grad_expected) + + # case5: indice / value is 0-D Tensor, and there is no broadcast + x = paddle.randn((2, 3, 4, 5)) + x.stop_gradient = False + v = paddle.ones([], dtype='float32') * 2 + v.stop_gradient = False + out = x * 1 + indice = paddle.full([], 0, dtype='int32') + out[indice, indice, indice, indice] = v + out.backward() + + self.assertEqual(out.shape, x.shape) + self.assertEqual(v.grad.shape, []) + np.testing.assert_allclose(out[0, 0, 0, 0], np.ones(()) * 2) + x_grad_expected = np.ones((2, 3, 4, 5)) + x_grad_expected[0, 0, 0, 0] = 0 + np.testing.assert_allclose(x.grad, x_grad_expected) + value_grad_expected = np.ones(()) + np.testing.assert_allclose(v.grad, value_grad_expected) + + def test_expand(self): + # case1 + x = paddle.full([], 1, 'float32') + x.stop_gradient = False + out = paddle.expand(x, shape=[1]) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, [1]) + np.testing.assert_allclose(out, 1.0) + self.assertEqual(x.grad.shape, []) + np.testing.assert_allclose(x.grad, 1.0) + self.assertEqual(out.grad.shape, [1]) + np.testing.assert_allclose(out.grad, 1.0) + + # case2 + x1 = paddle.full([], 1, 'float32') + x1.stop_gradient = False + out1 = paddle.expand(x1, shape=[]) + out1.retain_grads() + out1.backward() + + self.assertEqual(out1.shape, []) + np.testing.assert_allclose(out1, 1.0) + self.assertEqual(x1.grad.shape, []) + np.testing.assert_allclose(x1.grad, 1.0) + self.assertEqual(out1.grad.shape, []) + np.testing.assert_allclose(out1.grad, 1.0) + + # case3 + x2 = paddle.full([], 1, 'float32') + x2.stop_gradient = False + out2 = paddle.expand(x2, shape=[1, 1]) + out2.retain_grads() + out2.backward() + + self.assertEqual(out2.shape, [1, 1]) + np.testing.assert_allclose(out2, 1.0) + self.assertEqual(x2.grad.shape, []) + np.testing.assert_allclose(x2.grad, 1.0) + self.assertEqual(out2.grad.shape, [1, 1]) + 
np.testing.assert_allclose(out2.grad, 1.0)
+
+ # case4
+ x3 = paddle.full([], 1, 'float32')
+ x3.stop_gradient = False
+ out3 = paddle.expand(x3, shape=[3, 3])
+ out3.retain_grads()
+ out3.backward()
+
+ self.assertEqual(out3.shape, [3, 3])
+ np.testing.assert_allclose(out3, 1.0)
+ self.assertEqual(x3.grad.shape, [])
+ np.testing.assert_allclose(x3.grad, 9.0)
+ self.assertEqual(out3.grad.shape, [3, 3])
+ np.testing.assert_allclose(out3.grad, 1.0)
+
+ def test_expand_as(self):
+ x = paddle.full([], 1, 'float32')
+ x.stop_gradient = False
+ y = paddle.full([], 1, 'float32')
+ y.stop_gradient = False
+ out = paddle.expand_as(x, y)
+ out.backward()
+ self.assertEqual(x.shape, [])
+ self.assertEqual(x.item(), 1.0)
+ self.assertEqual(x.grad.shape, [])
+ self.assertEqual(x.grad.item(), 1.0)
+ self.assertEqual(out.shape, [])
+ self.assertEqual(out.item(), 1.0)
+ self.assertEqual(out.grad, None)
+
+ x1 = paddle.full([], 1, 'float32')
+ x1.stop_gradient = False
+ y1 = paddle.full([1], 1, 'float32')
+ out1 = paddle.expand_as(x1, y1)
+ out1.backward()
+ self.assertEqual(x1.shape, [])
+ self.assertEqual(x1.item(), 1.0)
+ self.assertEqual(x1.grad.shape, [])
+ self.assertEqual(x1.grad.item(0), 1.0)
+ self.assertEqual(out1.shape, [1])
+ self.assertEqual(out1.item(0), 1.0)
+ self.assertEqual(out1.grad, None)
+
+ x2 = paddle.full([], 1, 'float32')
+ x2.stop_gradient = False
+ y2 = paddle.full([3, 3], 1, 'float32')
+ out2 = paddle.expand_as(x2, y2)
+ out2.backward()
+ self.assertEqual(x2.shape, [])
+ self.assertEqual(x2.item(), 1.0)
+ self.assertEqual(x2.grad.shape, [])
+ self.assertEqual(x2.grad.item(0), 9.0)
+ self.assertEqual(out2.shape, [3, 3])
+ self.assertEqual(out2.item(0), 1.0)
+ self.assertEqual(out2.grad, None)
+
+ def test_top_k(self):
+ x = paddle.full([], 1, 'float32')
+ x.stop_gradient = False
+ out, indices = paddle.topk(x, k=1, axis=0)
+ out.retain_grads()
+ out.backward()
+ self.assertEqual(indices.shape, [])
+ self.assertEqual(indices.item(), 0)
+ self.assertEqual(x.shape, [])
+ self.assertEqual(x.item(), 1.0)
+ self.assertEqual(x.grad.shape, [])
+ self.assertEqual(x.grad.item(0), 1.0)
+ self.assertEqual(out.shape, [])
+ self.assertEqual(out.item(), 1.0)
+ self.assertEqual(out.grad, 1.0)
+
+ x1 = paddle.full([], 1, 'float32')
+ x1.stop_gradient = False
+ out1, indices1 = paddle.topk(x1, k=1, axis=-1)
+ out1.retain_grads()
+ out1.backward()
+ self.assertEqual(indices1.shape, [])
+ self.assertEqual(indices1.item(), 0)
+ self.assertEqual(x1.shape, [])
+ self.assertEqual(x1.item(), 1.0)
+ self.assertEqual(x1.grad.shape, [])
+ self.assertEqual(x1.grad.item(0), 1.0)
+ self.assertEqual(out1.shape, [])
+ self.assertEqual(out1.item(), 1.0)
+ self.assertEqual(out1.grad, 1.0)
+
+ with self.assertRaises(ValueError):
+ tmp = paddle.topk(x1, k=1, axis=2)
+
+ def test_broadcast_to(self):
+ x = paddle.full([], 1, 'float32')
+ x.stop_gradient = False
+ out = paddle.broadcast_to(x, shape=[1])
+ out.retain_grads()
+ out.backward()
+
+ self.assertEqual(out.shape, [1])
+ np.testing.assert_allclose(out, 1.0)
+ self.assertEqual(x.grad.shape, [])
+ np.testing.assert_allclose(x.grad, 1.0)
+ self.assertEqual(out.grad.shape, [1])
+ np.testing.assert_allclose(out.grad, 1.0)
+
+ # case2
+ x1 = paddle.full([], 1, 'float32')
+ x1.stop_gradient = False
+ out1 = paddle.broadcast_to(x1, shape=[])
+ out1.retain_grads()
+ out1.backward()
+
+ self.assertEqual(out1.shape, [])
+ np.testing.assert_allclose(out1, 1.0)
+ self.assertEqual(x1.grad.shape, [])
+ np.testing.assert_allclose(x1.grad, 1.0)
+ self.assertEqual(out1.grad.shape, [])
+ np.testing.assert_allclose(out1.grad, 1.0)
+
+ # case3
+ x2 = paddle.full([], 1, 'float32')
+ x2.stop_gradient = False
+ out2 = paddle.broadcast_to(x2, shape=[1, 1])
+ out2.retain_grads()
+ out2.backward()
+
+ self.assertEqual(out2.shape, [1, 1])
+ np.testing.assert_allclose(out2, 1.0)
+ self.assertEqual(x2.grad.shape, [])
+ np.testing.assert_allclose(x2.grad, 1.0)
+ self.assertEqual(out2.grad.shape, [1, 1])
+ np.testing.assert_allclose(out2.grad, 1.0)
+
+ # case4
+ x3 = paddle.full([], 1, 'float32')
+ x3.stop_gradient = False
+ out3 = paddle.broadcast_to(x3, shape=[3, 3])
+ out3.retain_grads()
+ out3.backward()
+
+ self.assertEqual(out3.shape, [3, 3])
+ np.testing.assert_allclose(out3, 1.0)
+ self.assertEqual(x3.grad.shape, [])
+ np.testing.assert_allclose(x3.grad, 9.0)
+ self.assertEqual(out3.grad.shape, [3, 3])
+ np.testing.assert_allclose(out3.grad, 1.0)
+
+ def test_broadcast_tensors(self):
+ # 1) x is 0D, y is 0D
+ x1 = paddle.full([], 2.0)
+ x1.stop_gradient = False
+ x2 = paddle.full([], 2.0)
+ x2.stop_gradient = False
+ out1, out2 = paddle.broadcast_tensors([x1, x2])
+ # backward has bug now
+ # out1.backward()
+
+ self.assertEqual(out1.shape, [])
+ self.assertEqual(out2.shape, [])
+ # self.assertEqual(x1.grad.shape, [])
+
+ # 2) x is ND, y is 0D
+ x1 = paddle.full([2, 3], 2.0)
+ x1.stop_gradient = False
+ x2 = paddle.full([], 2.0)
+ x2.stop_gradient = False
+ out1, out2 = paddle.broadcast_tensors([x1, x2])
+ # out1.backward()
+
+ self.assertEqual(out1.shape, [2, 3])
+ self.assertEqual(out2.shape, [2, 3])
+ # self.assertEqual(x1.grad.shape, [2, 3])
+
+ # 3) x is 0D, y is ND
+ x1 = paddle.full([], 2.0)
+ x1.stop_gradient = False
+ x2 = paddle.full([2, 3], 2.0)
+ x2.stop_gradient = False
+ out1, out2 = paddle.broadcast_tensors([x1, x2])
+ # out1.backward()
+
+ self.assertEqual(out1.shape, [2, 3])
+ self.assertEqual(out2.shape, [2, 3])
+ # self.assertEqual(x1.grad.shape, [2, 3])
+
+ def test_broadcast_shape(self):
+ x = []
+ y = [3, 5]
+ out = paddle.broadcast_shape(x, y)
+ self.assertEqual(out, [3, 5])
+
+ x = [3, 5]
+ y = []
+ out = paddle.broadcast_shape(x, y)
+ self.assertEqual(out, [3, 5])
+
+ x = []
+ y = []
+ out = paddle.broadcast_shape(x, y)
+ self.assertEqual(out, [])
+
+ def test_argmin(self):
+ # 1) x is 0D
+ x = paddle.rand([])
+ out1 = paddle.argmin(x, 0)
+ out2 = paddle.argmin(x, -1)
+ out3 = paddle.argmin(x, None)
+
+ self.assertEqual(out1.shape, [])
+ np.testing.assert_allclose(out1, 0)
+
+ self.assertEqual(out2.shape, [])
+ np.testing.assert_allclose(out2, 0)
+
+ self.assertEqual(out3.shape, [])
+ np.testing.assert_allclose(out3, 0)
+
+ # 2) x is 1D
+ x = paddle.rand([5])
+ x.stop_gradient = False
+ out = paddle.argmin(x, 0)
+ out.backward()
+ self.assertEqual(out.shape, [])
+
+ # 3) x is ND
+ x = paddle.rand([3, 5])
+ x.stop_gradient = False
+ out = paddle.argmin(x)
+ out.backward()
+ self.assertEqual(out.shape, [])
+
+ # 4) x is ND, keepdim=True
+ x = paddle.rand([3, 5])
+ x.stop_gradient = False
+ out = paddle.argmin(x, keepdim=True)
+ out.backward()
+ self.assertEqual(out.shape, [1, 1])
+
+ def test_argmax(self):
+ # 1) x is 0D
+ x = paddle.rand([])
+ out1 = paddle.argmax(x, 0)
+ out2 = paddle.argmax(x, -1)
+ out3 = paddle.argmax(x, None)
+
+ self.assertEqual(out1.shape, [])
+ np.testing.assert_allclose(out1, 0)
+
+ self.assertEqual(out2.shape, [])
+ np.testing.assert_allclose(out2, 0)
+
+ self.assertEqual(out3.shape, [])
+ np.testing.assert_allclose(out3, 0)
+
+ # 2) x is 1D
+ x = paddle.rand([5])
+ out = paddle.argmax(x,
0) + self.assertEqual(out.shape, []) + + # 3) x is ND + x = paddle.rand([3, 5]) + out = paddle.argmax(x) + self.assertEqual(out.shape, []) + + # 4) x is ND, keepdim=True + x = paddle.rand([3, 5]) + out = paddle.argmax(x, keepdim=True) + self.assertEqual(out.shape, [1, 1]) + + def test_kthvalue(self): + # 1) x is 0D + x = paddle.randn([]) + x.stop_gradient = False + out, index = paddle.kthvalue(x, 1) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out, x) + self.assertEqual(index.shape, []) + self.assertEqual(index, 0) + + self.assertEqual(x.grad.shape, []) + self.assertEqual(x.grad, 1.0) + + # 2) x is 1D + x1 = paddle.randn([5]) + x1.stop_gradient = False + out1, index1 = paddle.kthvalue(x1, 1) + out1.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(index1.shape, []) + self.assertEqual(x1.grad.shape, [5]) + + def test_mode(self): + x1 = paddle.randn([5]) + x1.stop_gradient = False + out1, index1 = paddle.mode(x1) + out1.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(index1.shape, []) + + self.assertEqual(x1.grad.shape, [5]) + + def test_is_empty(self): + # 1) x is 0D + x = paddle.rand([]) + out = paddle.is_empty(x) + self.assertFalse(out) + self.assertEqual(out.shape, []) + + # 2) x is 1D + x = paddle.rand([5]) + out = paddle.is_empty(x) + self.assertFalse(out) + self.assertEqual(out.shape, []) + + # 3) x is ND + x = paddle.rand([3, 5]) + out = paddle.is_empty(x) + self.assertFalse(out) + self.assertEqual(out.shape, []) + + x = paddle.rand([3, 0, 5]) + out = paddle.is_empty(x) + self.assertTrue(out) + self.assertEqual(out.shape, []) + + def test_squeeze_(self): + # 1) x is 0D + x = paddle.rand([]) + x.squeeze_(0) + self.assertEqual(x.shape, []) + + # 2) x is 1D + x = paddle.rand([1]) + x.squeeze_(0) + self.assertEqual(x.shape, []) + + # 3)x is ND + x = paddle.rand([2, 1]) + x.squeeze_(1) + self.assertEqual(x.shape, [2]) + + def test_as_complex(self): + x = paddle.rand([2]) + x.stop_gradient = False + out = paddle.as_complex(x) + out.retain_grads() + out.backward() + + self.assertEqual(x.shape, [2]) + self.assertEqual(out.shape, []) + self.assertEqual(x.grad.shape, [2]) + self.assertEqual(out.grad.shape, []) + + def test_dot(self): + # 1) x is 1D + x = paddle.rand([2]) + x.stop_gradient = False + y = paddle.rand([2]) + y.stop_gradient = False + out = paddle.dot(x, y) + out.retain_grads() + out.backward() + + self.assertEqual(x.grad.shape, [2]) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + # 2) x is 2D + x1 = paddle.rand([2, 2]) + x1.stop_gradient = False + y1 = paddle.rand([2, 2]) + y1.stop_gradient = False + out1 = paddle.dot(x1, y1) + out1.retain_grads() + out1.backward() + + self.assertEqual(x1.grad.shape, [2, 2]) + self.assertEqual(out1.shape, [2]) + self.assertEqual(out1.grad.shape, [2]) + + def test_inner(self): + # 0) input is 0D + x = paddle.rand([]) + x.stop_gradient = False + y = paddle.rand([]) + y.stop_gradient = False + out = paddle.inner(x, y) + out.retain_grads() + out.backward() + + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + # 1) input is 1D + x = paddle.rand([2]) + x.stop_gradient = False + y = paddle.rand([2]) + y.stop_gradient = False + out = paddle.inner(x, y) + out.retain_grads() + out.backward() + + self.assertEqual(x.grad.shape, [2]) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + # 2) input is 2D + x = paddle.rand([2, 3]) + x.stop_gradient = False + y = paddle.rand([3, 3]) + 
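+ # inner contracts the trailing axis: [2, 3] with [3, 3] leaves a [2, 3]
+ # result, while the 0D and 1D cases above both reduce to a 0-D output.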
y.stop_gradient = False
+ out = paddle.inner(x, y)
+ out.retain_grads()
+ out.backward()
+
+ self.assertEqual(x.grad.shape, [2, 3])
+ self.assertEqual(out.shape, [2, 3])
+ self.assertEqual(out.grad.shape, [2, 3])
+
+ def test_tensordot(self):
+ # 1) input is 1D
+ x = paddle.arange(10, dtype='float64')
+ x.stop_gradient = False
+ y = paddle.arange(10, dtype='float64')
+ y.stop_gradient = False
+ out = paddle.tensordot(x, y, axes=1)
+ out.retain_grads()
+ out.backward()
+
+ self.assertEqual(x.grad.shape, [10])
+ self.assertEqual(out.shape, [])
+ self.assertEqual(out.grad.shape, [])
+
+ # 2) input is 2D
+ x = paddle.arange(6, dtype='float64').reshape([2, 3])
+ y = paddle.arange(6, dtype='float64').reshape([2, 3])
+ x.stop_gradient = False
+ out = paddle.tensordot(x, y, axes=2)
+ out.retain_grads()
+ out.backward()
+
+ self.assertEqual(x.grad.shape, [2, 3])
+ self.assertEqual(out.shape, [])
+ self.assertEqual(out.grad.shape, [])
+
+ def test_metric_accuracy(self):
+ x = paddle.full(shape=[2, 4], fill_value=0.25)
+ y = paddle.full(shape=[2, 1], fill_value=1, dtype="int64")
+ out = paddle.metric.accuracy(input=x, label=y, k=1)
+ self.assertEqual(out.shape, [])
+
+ def test_std(self):
+ # 1) x is 0D
+ x = paddle.rand([])
+ x.stop_gradient = False
+ out1 = paddle.std(x)
+ out2 = paddle.std(x, [])
+ out1.backward()
+ out2.backward()
+
+ self.assertEqual(out1.shape, [])
+ self.assertEqual(out2.shape, [])
+ self.assertEqual(out1, 0)
+ self.assertEqual(out2, 0)
+
+ self.assertEqual(x.grad.shape, [])
+
+ # 2) x is ND
+ x = paddle.rand([3, 5])
+ x.stop_gradient = False
+ out = paddle.std(x)
+ out.backward()
+
+ self.assertEqual(out.shape, [])
+ self.assertEqual(x.grad.shape, [3, 5])
+
+ def test_var(self):
+ # 1) x is 0D
+ x = paddle.rand([])
+ x.stop_gradient = False
+ out1 = paddle.var(x)
+ out2 = paddle.var(x, [])
+ out1.backward()
+ out2.backward()
+
+ self.assertEqual(out1.shape, [])
+ self.assertEqual(out2.shape, [])
+ self.assertEqual(out1, 0)
+ self.assertEqual(out2, 0)
+
+ self.assertEqual(x.grad.shape, [])
+ np.testing.assert_allclose(x.grad, 0)
+
+ # 2) x is ND
+ x = paddle.rand([3, 5])
+ x.stop_gradient = False
+ out = paddle.var(x)
+ out.backward()
+
+ self.assertEqual(out.shape, [])
+ self.assertEqual(x.grad.shape, [3, 5])
+
+ def test_quantile(self):
+ # 1) x is 0D
+ x = paddle.rand([])
+ x.stop_gradient = False
+ out = paddle.quantile(x, 0.5, axis=None)
+
+ out.retain_grads()
+ out.backward()
+
+ out_empty_list = paddle.quantile(x, 0.5, axis=[])
+ self.assertEqual(out_empty_list, out)
+
+ self.assertEqual(x.shape, [])
+ self.assertEqual(out.shape, [])
+ self.assertEqual(out, x)
+
+ self.assertEqual(x.grad.shape, [])
+ self.assertEqual(x.grad, 1.0)
+ self.assertEqual(out.grad.shape, [])
+ self.assertEqual(out.grad, 1.0)
+
+ # 2) x is ND
+ x = paddle.rand([2, 3])
+ x.stop_gradient = False
+ out = paddle.quantile(x, 0.5, axis=None)
+
+ out.retain_grads()
+ out.backward()
+
+ self.assertEqual(out.shape, [])
+ self.assertEqual(out.grad.shape, [])
+ self.assertEqual(out.grad, 1.0)
+ self.assertEqual(x.grad.shape, [2, 3])
+
+ def test_nanquantile(self):
+ # 1) x is 0D
+ x = paddle.rand([])
+ x.stop_gradient = False
+ out = paddle.nanquantile(x, 0.5, axis=None)
+
+ out.retain_grads()
+ out.backward()
+
+ out_empty_list = paddle.nanquantile(x, 0.5, axis=[])
+ self.assertEqual(out_empty_list, out)
+
+ self.assertEqual(x.shape, [])
+ self.assertEqual(out.shape, [])
+ self.assertEqual(out, x)
+
+ self.assertEqual(x.grad.shape, [])
+ self.assertEqual(x.grad, 1.0)
+ self.assertEqual(out.grad.shape, [])
+ self.assertEqual(out.grad, 1.0)
+
+ # 2) x is ND with 'nan'
+ x = paddle.to_tensor([[float('nan'), 2.0, 3.0], [0.0, 1.0, 2.0]])
+ x.stop_gradient = False
+ out = paddle.nanquantile(x, 0.5, axis=None)
+
+ out.retain_grads()
+ out.backward()
+
+ self.assertEqual(out.shape, [])
+ self.assertEqual(out.grad.shape, [])
+ self.assertEqual(out.grad, 1.0)
+ self.assertEqual(x.grad.shape, [2, 3])
+
+ def test_flip(self):
+ x = paddle.rand([])
+ x.stop_gradient = False
+ out = paddle.flip(x, axis=[])
+ out.retain_grads()
+ out.backward()
+ self.assertEqual(x.shape, [])
+ self.assertEqual(out.shape, [])
+ self.assertEqual(x.grad.shape, [])
+ self.assertEqual(out.grad.shape, [])
+
+ def test_linear(self):
+ x = paddle.randn([3, 2])
+ w = paddle.full(shape=[2, 4], fill_value=0.5)
+ b = paddle.zeros([])
+
+ np.testing.assert_array_equal(
+ F.linear(x, w, b).numpy(), F.linear(x, w).numpy()
+ )
+
+ def test_is_complex(self):
+ x = paddle.rand([]) + 1j * paddle.rand([])
+ self.assertTrue(paddle.is_complex(x))
+
+ def test_is_floating_point(self):
+ self.assertTrue(paddle.is_floating_point(self.x))
+
+ def test_is_integer(self):
+ x = paddle.randint(0, 10, [])
+ self.assertTrue(paddle.is_integer(x))
+
+ def test_is_tensor(self):
+ self.assertTrue(paddle.is_tensor(self.x))
+
+ def test_isfinite(self):
+ out = paddle.isfinite(self.x)
+ np.testing.assert_array_equal(out.numpy(), np.array(True))
+
+ def test_isinf(self):
+ x = paddle.to_tensor(np.array(float('-inf')))
+ out = paddle.isinf(x)
+ np.testing.assert_array_equal(out.numpy(), np.array(True))
+
+ def test_isnan(self):
+ x = paddle.to_tensor(np.array(float('nan')))
+ out = paddle.isnan(x)
+ np.testing.assert_array_equal(out.numpy(), np.array(True))
+
+ def test_isclose(self):
+ out = paddle.isclose(self.x, self.x)
+ np.testing.assert_array_equal(out.numpy(), np.array(True))
+
+ def test_clone(self):
+ out = paddle.clone(self.x)
+ np.testing.assert_array_equal(out.numpy(), self.x.numpy())
+
+ def test_assign(self):
+ out = paddle.assign(self.x)
+ np.testing.assert_array_equal(out.numpy(), self.x.numpy())
+
+ def test_item(self):
+ x = paddle.full([], 0.5)
+ self.assertEqual(x.item(), 0.5)
+
+ def test_tolist(self):
+ x = paddle.full([], 0.5)
+ self.assertEqual(x.tolist(), 0.5)
+
+ def test_numpy(self):
+ x = paddle.full([], 0.5)
+ x_np = x.numpy()
+ np.testing.assert_array_equal(x_np.shape, ())
+ np.testing.assert_array_equal(x_np, np.array(0.5))
+
+ x_np = x.numpy(False)
+ np.testing.assert_array_equal(x_np.shape, ())
+ np.testing.assert_array_equal(x_np, np.array(0.5))
+
+ def test_numel(self):
+ # 1) x is 0D
+ out = paddle.numel(self.x)
+ self.assertEqual(out.shape, [])
+ np.testing.assert_array_equal(out.numpy(), np.array(1))
+
+ # 2) x is ND
+ x = paddle.full([3, 5], 0.5)
+ out = paddle.numel(x)
+ self.assertEqual(out.shape, [])
+ np.testing.assert_array_equal(out.numpy(), np.array(15))
+
+ def test_rank(self):
+ # 1) x is 0D
+ x = paddle.rand([])
+ out = paddle.rank(x)
+ self.assertEqual(out.shape, [])
+ np.testing.assert_array_equal(out.numpy(), np.array(0))
+
+ # 2) x is ND
+ x = paddle.full([3, 5], 0.5)
+ out = paddle.rank(x)
+ self.assertEqual(out.shape, [])
+ np.testing.assert_array_equal(out.numpy(), np.array(2))
+
+ def test_shape(self):
+ out = paddle.shape(self.x)
+ np.testing.assert_array_equal(out.numpy(), np.array([]))
+ self.assertEqual(out.shape, [0])
+
+ def test_equal_scalar(self):
+ x = paddle.rand([])
+ out = paddle.equal(x, 2.0)
+ self.assertEqual(out.shape, [])
+ self.assertEqual(out, False)
+
+ x1 = paddle.full([], 2.0)
+ out1 =
paddle.equal(x1, 2.0) + self.assertEqual(out1.shape, []) + self.assertEqual(out1, True) + + def test_pow_scalar(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.pow(x, 2.0) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_cast(self): + x = paddle.full([], 1.0, 'float32') + x.stop_gradient = False + out = paddle.cast(x, 'int32') + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_cumprod(self): + x = paddle.full([], 1.0, 'float32') + x.stop_gradient = False + out = paddle.cumprod(x, 0) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + with self.assertRaises(ValueError): + tmp = paddle.cumprod(x, 2) + + def test_clip(self): + x = paddle.uniform([], None, -10, 10) + x.stop_gradient = False + out = paddle.clip(x, -5, 5) + out.retain_grads() + out.backward() + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + x1 = paddle.uniform([], None, -10, 10) + x1.stop_gradient = False + out1 = paddle.clip(x1, paddle.full([], -5.0), paddle.full([], 5.0)) + out1.retain_grads() + out1.backward() + self.assertEqual(out1.shape, []) + self.assertEqual(out1.grad.shape, []) + self.assertEqual(x1.grad.shape, []) + + def test_increment(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.increment(x, 1.0) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_bitwise_not(self): + x = paddle.randint(-1, 1, []) + out1 = ~x + out2 = paddle.bitwise_not(x) + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + + def test_logical_not(self): + x = paddle.randint(0, 1, []) + out = paddle.logical_not(x) + + self.assertEqual(out.shape, []) + + def test_searchsorted(self): + # have no backward + x = paddle.to_tensor([1, 3, 5, 7, 9]) + y = paddle.rand([]) + + out = paddle.searchsorted(x, y) + + self.assertEqual(out.shape, []) + self.assertEqual(out.numpy(), 0) + + def test_transpose(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.transpose(x, []) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out, x) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + self.assertEqual(x.grad, 1.0) + + with self.assertRaises(ValueError): + x = paddle.transpose(x, [0]) + + def test_moveaxis(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.moveaxis(x, [], []) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out, x) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + self.assertEqual(x.grad, 1.0) + + with self.assertRaises(AssertionError): + x = paddle.moveaxis(x, [1], [0]) + + def test_gather_1D(self): + x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) + index = paddle.full([], 2, 'int64') + out = paddle.gather(x, index) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.numpy(), 5) + self.assertEqual(x.grad.shape, [5]) + self.assertEqual(out.grad.shape, []) + + def test_gather_xD_axis_0(self): + x = paddle.to_tensor( + [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], 
stop_gradient=False + ) + index = paddle.full([], 1, 'int64') + out = paddle.gather(x, index) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, [3]) + np.testing.assert_array_equal(out.numpy(), x.numpy()[1, :]) + self.assertEqual(x.grad.shape, [2, 3]) + self.assertEqual(out.grad.shape, [3]) + + def test_gather_xD_axis_1(self): + x = paddle.to_tensor( + [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False + ) + index = paddle.full([], 1, 'int64') + out = paddle.gather(x, index, axis=1) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, [2]) + np.testing.assert_array_equal(out.numpy(), [2.0, 5.0]) + self.assertEqual(x.grad.shape, [2, 3]) + self.assertEqual(out.grad.shape, [2]) + + def test_gather_nd(self): + x1 = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) + x2 = paddle.to_tensor( + [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False + ) + + index1 = paddle.full([1], 1, 'int64') + index2 = paddle.full([2], 1, 'int64') + + out1 = paddle.gather_nd(x1, index1) + out2 = paddle.gather_nd(x2, index2) + + out1.retain_grads() + out2.retain_grads() + + out1.backward() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + np.testing.assert_array_equal(out1, np.array(3.0)) + np.testing.assert_array_equal(out2, np.array(5.0)) + self.assertEqual(x1.grad.shape, [5]) + self.assertEqual(x2.grad.shape, [2, 3]) + self.assertEqual(out1.grad.shape, []) + self.assertEqual(out2.grad.shape, []) + + def test_einsum(self): + os.environ['FLAGS_new_einsum'] = "0" + x = paddle.rand([5]) + # sum + out1 = paddle.einsum('i->', x) + expect1 = np.einsum('i->', x) + # dot + out2 = paddle.einsum('i,i->', x, x) + expect2 = np.einsum('i,i->', x, x) + + out1.retain_grads() + out2.retain_grads() + + out1.backward() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + np.testing.assert_allclose(out1, expect1, rtol=1e-03) + np.testing.assert_allclose(out2, expect2, rtol=1e-03) + + def test_einsum_V2(self): + os.environ['FLAGS_new_einsum'] = "1" + x = paddle.rand([5]) + # sum + out1 = paddle.einsum('i->', x) + expect1 = np.einsum('i->', x) + # dot + out2 = paddle.einsum('i,i->', x, x) + expect2 = np.einsum('i,i->', x, x) + + out1.retain_grads() + out2.retain_grads() + + out1.backward() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + np.testing.assert_allclose(out1, expect1, rtol=1e-03) + np.testing.assert_allclose(out2, expect2, rtol=1e-03) + + def test_scatter_1D(self): + x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False) + index = paddle.full([], 2, 'int64') + updates = paddle.full([], 4.0) + out = paddle.scatter(x, index, updates) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, [5]) + self.assertEqual(out.numpy()[2], 4) + self.assertEqual(out.grad.shape, [5]) + + def test_scatter_XD(self): + x = paddle.to_tensor( + [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False + ) + index = paddle.full([], 1, 'int64') + updates = paddle.to_tensor([1.0, 2.0, 3.0]) + out = paddle.scatter(x, index, updates) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, [2, 3]) + np.testing.assert_array_equal(out.numpy()[1], [1.0, 2.0, 3.0]) + self.assertEqual(out.grad.shape, [2, 3]) + + def test_scatter_shape_check(self): + x = paddle.to_tensor([1.0, 2.0, 3.0]) + index = paddle.to_tensor(1) + updates = paddle.to_tensor([3.0]) + with self.assertRaises(ValueError): + out = paddle.scatter(x, index, updates) + + 
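+ # A 0-D index selects a single row, so the matching updates must drop the
+ # leading axis as well: 0-D updates for the 1-D x above, and a [2]-shaped
+ # (not [1, 2]-shaped) updates tensor for the 2-D x below.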
x = paddle.to_tensor([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]]) + index = paddle.to_tensor(1) + updates = paddle.to_tensor([[5.0, 5.0]]) + with self.assertRaises(ValueError): + out = paddle.scatter(x, index, updates) + + def test_scatter_0D_index(self): + x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False) + index = paddle.to_tensor(1) + updates = paddle.to_tensor(3.0) + out = paddle.scatter(x, index, updates) + out.backward() + np.testing.assert_array_equal(x.grad.numpy()[1], 0.0) + + x = paddle.to_tensor( + [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]], stop_gradient=False + ) + index = paddle.to_tensor(1) + updates = paddle.to_tensor([5.0, 5.0]) + out = paddle.scatter(x, index, updates) + out.backward() + np.testing.assert_array_equal(x.grad.numpy()[1], [0.0, 0.0]) + + def test_diagflat(self): + x1 = paddle.rand([]) + x2 = paddle.rand([]) + x3 = paddle.rand([]) + + x1.stop_gradient = False + x2.stop_gradient = False + x3.stop_gradient = False + + x1.retain_grads() + x2.retain_grads() + x3.retain_grads() + + out1 = paddle.diagflat(x1, 1) + out2 = paddle.diagflat(x2, -1) + out3 = paddle.diagflat(x3, 0) + + out1.retain_grads() + out2.retain_grads() + out3.retain_grads() + + out1.backward() + out2.backward() + out3.backward() + + self.assertEqual(out1.shape, [2, 2]) + self.assertEqual(out2.shape, [2, 2]) + self.assertEqual(out3.shape, [1, 1]) + + self.assertEqual(out1.grad.shape, [2, 2]) + self.assertEqual(out2.grad.shape, [2, 2]) + self.assertEqual(out3.grad.shape, [1, 1]) + + self.assertEqual(x1.grad.shape, []) + self.assertEqual(x2.grad.shape, []) + self.assertEqual(x3.grad.shape, []) + + def test_scatter__1D(self): + x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0]) + index = paddle.full([], 2, 'int64') + updates = paddle.full([], 4.0) + out = paddle.scatter_(x, index, updates) + + self.assertEqual(out.numpy()[2], 4) + + def test_scatter__XD(self): + x = paddle.to_tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + index = paddle.full([], 1, 'int64') + updates = paddle.to_tensor([1.0, 2.0, 3.0]) + out = paddle.scatter_(x, index, updates) + np.testing.assert_array_equal(out.numpy()[1], [1.0, 2.0, 3.0]) + + def test_scatter_nd(self): + index = paddle.to_tensor([3], dtype="int64") + updates = paddle.full([], 2, dtype='float32') + updates.retain_grads() + updates.stop_gradient = False + + out = paddle.scatter_nd(index, updates, [5]) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, [5]) + self.assertEqual(out.numpy()[3], 2) + self.assertEqual(out.grad.shape, [5]) + self.assertEqual(updates.grad.shape, []) + + def test_flatten(self): + x = paddle.rand([]) + x.stop_gradient = False + + start_axis = 0 + stop_axis = -1 + + out = paddle.flatten(x, start_axis=start_axis, stop_axis=stop_axis) + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + self.assertEqual(x.grad.shape, []) + + def test_histogram(self): + x = paddle.rand([]) + out = paddle.histogram(x, bins=5, min=1, max=5) + self.assertEqual(out.shape, [5]) + + def test_scale(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.scale(x, scale=2.0, bias=1.0) + + out.retain_grads() + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_scale_(self): + x = paddle.rand([]) + out = x.scale_(scale=2.0, bias=1.0) + self.assertEqual(out.shape, []) + + def test_floor_divide(self): + # 1-d // 0-d + x = paddle.to_tensor([1, -2, 3], dtype="int64") + y = paddle.full([], 2, 
dtype='int64') + out1_1 = paddle.floor_divide(x, y) + out1_2 = paddle.Tensor.__floordiv__(x, y) + + np.testing.assert_array_equal(out1_1.numpy(), out1_2.numpy()) + np.testing.assert_array_equal(out1_1.numpy(), np.asarray([0, -1, 1])) + + # 0-d // 1-d + out2_1 = paddle.floor_divide(y, x) + out2_2 = paddle.Tensor.__floordiv__(y, x) + + np.testing.assert_array_equal(out2_1.numpy(), out2_2.numpy()) + np.testing.assert_array_equal(out2_2.numpy(), np.asarray([2, -1, 0])) + + # 0-d // 0-d + x = paddle.full([], 3, dtype='int64') + out3_1 = paddle.floor_divide(x, y) + out3_2 = paddle.Tensor.__floordiv__(x, y) + + np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy()) + np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1)) + + def test_cumsum(self): + x1 = paddle.rand([]) + x1.stop_gradient = False + + out1 = paddle.cumsum(x1) + out2 = paddle.cumsum(x1, axis=0) + out3 = paddle.cumsum(x1, axis=-1) + + out1.retain_grads() + out2.retain_grads() + out3.retain_grads() + + out1.backward() + out2.backward() + out3.backward() + + self.assertEqual(x1.grad.shape, []) + self.assertTrue(x1.grad.numpy() == 3) + self.assertEqual(out1.shape, [1]) + self.assertEqual(out1.grad.shape, [1]) + self.assertTrue(out1.grad.numpy() == 1) + self.assertEqual(out2.shape, []) + self.assertEqual(out2.grad.shape, []) + self.assertTrue(out2.grad.numpy() == 1) + self.assertEqual(out3.shape, []) + self.assertEqual(out3.grad.shape, []) + self.assertTrue(out3.grad.numpy() == 1) + + def test_logcumsumexp(self): + x = paddle.rand([]) + x.stop_gradient = False + + out1 = paddle.logcumsumexp(x) + out2 = paddle.logcumsumexp(x, axis=0) + out3 = paddle.logcumsumexp(x, axis=-1) + + out1.backward() + out2.backward() + out3.backward() + + self.assertEqual(out1.shape, [1]) + self.assertEqual(out2.shape, []) + self.assertEqual(out3.shape, []) + + self.assertEqual(x.grad.shape, []) + self.assertTrue(x.grad.numpy() == 3) + + def test_add_n(self): + x1 = paddle.rand([]) + x1.stop_gradient = False + x2 = paddle.rand([]) + x2.stop_gradient = False + x3 = paddle.rand([]) + x3.stop_gradient = False + + out1 = paddle.add_n(x1) + out2 = paddle.add_n([x2, x3]) + + out1.retain_grads() + out2.retain_grads() + + out1.backward() + out2.backward() + + self.assertEqual(x1.grad.shape, []) + self.assertTrue(x1.grad.numpy() == 1) + self.assertEqual(x2.grad.shape, []) + self.assertTrue(x2.grad.numpy() == 1) + self.assertEqual(x3.grad.shape, []) + self.assertTrue(x3.grad.numpy() == 1) + self.assertEqual(out1.shape, []) + self.assertEqual(out1.grad.shape, []) + self.assertEqual(out2.shape, []) + self.assertEqual(out2.grad.shape, []) + + def test_reshape_list(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.reshape(x, []) + + out.retain_grads() + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + out = paddle.reshape(x, [1]) + out.retain_grads() + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + out = paddle.reshape(x, [-1]) + out.retain_grads() + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + out = paddle.reshape(x, [-1, 1]) + out.retain_grads() + out.backward() + self.assertEqual(x.grad.shape, []) + self.assertEqual(out.shape, [1, 1]) + self.assertEqual(out.grad.shape, [1, 1]) + + def test_reshape_tensor(self): + x = paddle.rand([1, 1]) + x.stop_gradient = False + out = paddle.reshape(x, []) + 
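+ # reshape to [] is only valid because x holds exactly one element; the
+ # cases below reshape the same data back to [1], [1, 1, 1] and so on.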
+ out.retain_grads() + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + new_shape = paddle.to_tensor([1, 1, 1], "int32") + out = paddle.reshape(x, new_shape) + out.retain_grads() + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1, 1, 1]) + self.assertEqual(out.grad.shape, [1, 1, 1]) + + new_shape = paddle.to_tensor([-1], "int32") + out = paddle.reshape(x, new_shape) + out.retain_grads() + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1]) + self.assertEqual(out.grad.shape, [1]) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out = paddle.reshape(x, new_shape) + out.retain_grads() + out.backward() + self.assertEqual(x.grad.shape, [1, 1]) + self.assertEqual(out.shape, [1, 1]) + self.assertEqual(out.grad.shape, [1, 1]) + + def test_reshape__list(self): + x = paddle.rand([]) + out = paddle.reshape_(x, []) + self.assertEqual(out.shape, []) + + out = paddle.reshape_(x, [1]) + self.assertEqual(out.shape, [1]) + + out = paddle.reshape_(x, [-1]) + self.assertEqual(out.shape, [1]) + + out = paddle.reshape_(x, [-1, 1]) + self.assertEqual(out.shape, [1, 1]) + + def test_reshape__tensor(self): + x = paddle.rand([1, 1]) + out = paddle.reshape_(x, []) + self.assertEqual(out.shape, []) + + new_shape = paddle.full([1], 1, "int32") + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1]) + + new_shape = paddle.full([1], -1, "int32") + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1]) + + new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] + out = paddle.reshape_(x, new_shape) + self.assertEqual(out.shape, [1, 1]) + + def test_reverse(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.reverse(x, axis=[]) + out.retain_grads() + out.backward() + self.assertEqual(x.shape, []) + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + + def test_sort(self): + x1 = paddle.rand([]) + x2 = paddle.rand([]) + x1.stop_gradient = False + x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() + out1 = paddle.sort(x1, axis=-1) + out2 = paddle.sort(x2, axis=0) + + out1.retain_grads() + out2.retain_grads() + + out1.backward() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + self.assertEqual(out1.numpy(), x1.numpy()) + self.assertEqual(out2.numpy(), x2.numpy()) + self.assertEqual(out1.grad.shape, []) + self.assertEqual(out2.grad.shape, []) + self.assertEqual(x1.grad.shape, []) + self.assertEqual(x2.grad.shape, []) + self.assertEqual(x1.grad.numpy(), 1) + self.assertEqual(x2.grad.numpy(), 1) + + def test_argsort(self): + x1 = paddle.rand([]) + x2 = paddle.rand([]) + x1.stop_gradient = False + x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() + + out1 = paddle.argsort(x1, axis=-1) + out2 = paddle.argsort(x2, axis=0) + + out1.retain_grads() + out2.retain_grads() + + out1.backward() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out2.shape, []) + self.assertEqual(out1.numpy(), 0) + self.assertEqual(out2.numpy(), 0) + self.assertEqual(out1.grad.shape, []) + self.assertEqual(out2.grad.shape, []) + self.assertEqual(x1.grad.shape, []) + self.assertEqual(x2.grad.shape, []) + self.assertEqual(x1.grad.numpy(), 0) + self.assertEqual(x2.grad.numpy(), 0) + + def test_lerp(self): + # 0D + 0D, weight is float scalar + x = paddle.rand([]) + y = paddle.rand([]) + 
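+ # lerp(x, y, w) = x + w * (y - x), so dx = 1 - w and dy = w; the branches
+ # below only check that these gradients keep the broadcast result shape.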
x.stop_gradient = False + y.stop_gradient = False + out = paddle.lerp(x, y, 0.5) + out.backward() + + self.assertEqual(out.shape, []) + self.assertEqual(x.grad.shape, []) + self.assertEqual(y.grad.shape, []) + + # 0D + 0D, weigh is 0D + x0 = paddle.rand([]) + y0 = paddle.rand([]) + w0 = paddle.rand([]) + x0.stop_gradient = False + y0.stop_gradient = False + y0.retain_grads() + + out0 = paddle.lerp(x0, y0, w0) + out0.backward() + + self.assertEqual(out0.shape, []) + self.assertEqual(x0.grad.shape, []) + self.assertEqual(y0.grad.shape, []) + + # 0D + ND + x1 = paddle.rand([]) + y1 = paddle.rand([64, 64]) + w1 = paddle.rand([]) + x1.stop_gradient = False + y1.stop_gradient = False + x1.retain_grads() + y1.retain_grads() + + out1 = paddle.lerp(x1, y1, w1) + out1.backward() + + self.assertEqual(out1.shape, [64, 64]) + self.assertEqual(x1.grad.shape, []) + self.assertEqual(y1.grad.shape, [64, 64]) + + # ND + 0D + x2 = paddle.rand([64, 64]) + y2 = paddle.rand([]) + w2 = paddle.rand([]) + x2.stop_gradient = False + y2.stop_gradient = False + x2.retain_grads() + y2.retain_grads() + + out2 = paddle.lerp(x2, y2, w2) + out2.backward() + + self.assertEqual(out2.shape, [64, 64]) + self.assertEqual(x2.grad.shape, [64, 64]) + self.assertEqual(y2.grad.shape, []) + + def test_repeat_interleave(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + for place in places: + paddle.set_device(place) + + x = paddle.randn(()) + x.stop_gradient = False + + out = paddle.repeat_interleave(x, 2, None) + out.backward() + + # check shape of output + self.assertEqual(out.shape, [2]) + + # check grad shape + self.assertEqual(x.grad.shape, []) + + repeats = paddle.to_tensor([3], dtype='int32') + out = paddle.repeat_interleave(x, repeats, None) + + # check shape of output with 1D repeats + self.assertEqual(out.shape, [3]) + + # check grad shape with 1D repeats + self.assertEqual(x.grad.shape, []) + + def test_allclose(self): + # 1) x is 0D + x = paddle.full([], 0.5) + y = paddle.full([], 0.6) + out = paddle.allclose(x, y) + self.assertEqual(out.shape, []) + self.assertFalse(out) + + # 2) x is ND + x = paddle.full([2, 3], 0.5) + y = paddle.full([2, 3], 0.6) + out = paddle.allclose(x, y) + self.assertEqual(out.shape, []) + self.assertFalse(out) + + def test_equal_all(self): + # 1) x is 0D + x = paddle.full([], 0.5) + y = paddle.full([], 0.6) + out = paddle.equal_all(x, y) + self.assertEqual(out.shape, []) + self.assertFalse(out) + + # 2) x is ND + x = paddle.full([2, 3], 0.5) + y = paddle.full([2, 3], 0.6) + out = paddle.equal_all(x, y) + self.assertEqual(out.shape, []) + self.assertFalse(out) + + def test_where(self): + x1 = paddle.full([], 1) + x2 = paddle.full([], 2) + x1.stop_gradient = False + x2.stop_gradient = False + x1.retain_grads() + x2.retain_grads() + out = paddle.where(x1 > x2, x1, x2) + out.retain_grads() + out.backward() + self.assertEqual(out.shape, []) + self.assertEqual(out.numpy(), 2) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x1.grad.shape, []) + self.assertEqual(x2.grad.shape, []) + self.assertEqual(x1.grad.numpy(), 0) + self.assertEqual(x2.grad.numpy(), 1) + + def test_atan2(self): + x1 = paddle.full([], 0) + x2 = paddle.full([], 2) + x1.retain_grads() + x2.retain_grads() + x1.stop_gradient = False + x2.stop_gradient = False + out = paddle.atan2(x1, x2) + out.retain_grads() + out.backward() + self.assertEqual(out.shape, []) + self.assertEqual(out.numpy(), 0) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x1.grad.shape, []) + 
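+ # atan2(0, 2) = 0; analytically d/dx1 = x2 / (x1^2 + x2^2) = 0.5 and
+ # d/dx2 = -x1 / (x1^2 + x2^2) = 0, matching the value checks below.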
self.assertEqual(x2.grad.shape, []) + self.assertEqual(x1.grad.numpy(), 0.5) + self.assertEqual(x2.grad.numpy(), 0) + + def test_interpolate(self): + from paddle.nn.functional import interpolate + + input_x = paddle.rand([2, 3, 6, 6]) + input_x.stop_gradient = False + origin_result = interpolate( + x=input_x, size=[12, 12], mode="bilinear", align_corners=False + ) + + output_size = [ + paddle.full([], 12, dtype="int32"), + paddle.full([], 12, dtype="int32"), + ] + out1 = interpolate( + x=input_x, size=output_size, mode="bilinear", align_corners=False + ) + out1.backward() + + self.assertEqual(out1.shape, [2, 3, 12, 12]) + self.assertEqual(input_x.grad.shape, [2, 3, 6, 6]) + + scale_1 = [paddle.full([], 2), paddle.full([], 2)] + out2 = interpolate( + x=input_x, + scale_factor=scale_1, + mode="bilinear", + align_corners=False, + ) + out2.backward() + + self.assertEqual(out2.shape, [2, 3, 12, 12]) + self.assertEqual(input_x.grad.shape, [2, 3, 6, 6]) + + scale_2 = paddle.full([], 2) + out3 = interpolate( + x=input_x, + scale_factor=scale_2, + mode="bilinear", + align_corners=False, + ) + out3.backward() + + # for coverage + scale_3 = paddle.full([1], 2) + input_3d = paddle.rand([2, 3, 6]) + out4 = interpolate( + x=input_3d, + scale_factor=scale_3, + mode="LINEAR", + align_corners=False, + data_format="NCW", + ) + + self.assertEqual(out3.shape, [2, 3, 12, 12]) + self.assertEqual(input_x.grad.shape, [2, 3, 6, 6]) + + np.testing.assert_allclose( + origin_result.numpy(), out1.numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + origin_result.numpy(), out2.numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + origin_result.numpy(), out3.numpy(), rtol=1e-05 + ) + + def test_upsample(self): + from paddle.nn.functional import upsample + + input_x = paddle.rand([2, 3, 6, 6]) + input_x.stop_gradient = False + + output_size = [ + paddle.full([], 12, dtype="int32"), + paddle.full([], 12, dtype="int32"), + ] + out1 = upsample( + x=input_x, size=output_size, mode="bilinear", align_corners=False + ) + out1.backward() + + self.assertEqual(out1.shape, [2, 3, 12, 12]) + self.assertEqual(input_x.grad.shape, [2, 3, 6, 6]) + + def test_unstack(self): + x1 = paddle.full([1], 0) + x2 = paddle.full([2], 2) + x1.retain_grads() + x2.retain_grads() + x1.stop_gradient = False + x2.stop_gradient = False + + [out1] = paddle.unstack(x1, 0) + out1.retain_grads() + out1.backward() + [out2_1, out2_2] = paddle.unstack(x2, 0) + out2 = paddle.add_n([out2_1, out2_2]) + out2.retain_grads() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out1.numpy(), 0) + + self.assertEqual(out2_1.shape, []) + self.assertEqual(out2_1.numpy(), 2) + self.assertEqual(out2_2.shape, []) + self.assertEqual(out2_2.numpy(), 2) + self.assertEqual(x2.grad.shape, [2]) + + def test_unbind(self): + x1 = paddle.full([1], 0) + x2 = paddle.full([2], 2) + x1.retain_grads() + x2.retain_grads() + x1.stop_gradient = False + x2.stop_gradient = False + + [out1] = paddle.unbind(x1, 0) + out1.retain_grads() + out1.backward() + [out2_1, out2_2] = paddle.unbind(x2, 0) + out2 = paddle.add_n([out2_1, out2_2]) + out2.retain_grads() + out2.backward() + + self.assertEqual(out1.shape, []) + self.assertEqual(out1.numpy(), 0) + + self.assertEqual(out2_1.shape, []) + self.assertEqual(out2_1.numpy(), 2) + self.assertEqual(out2_2.shape, []) + self.assertEqual(out2_2.numpy(), 2) + self.assertEqual(x2.grad.shape, [2]) + + def test_masked_select(self): + x = paddle.rand([]) + x.stop_gradient = False + mask = paddle.full([], True, dtype='bool') + y = 
paddle.masked_select(x, mask) + + y.retain_grads() + y.backward() + self.assertEqual(y.shape, [1]) + self.assertEqual(y.numpy(), x.numpy()) + self.assertEqual(y.grad.shape, [1]) + self.assertEqual(x.grad.shape, []) + self.assertEqual(x.grad.numpy(), 1) + + def test_squeeze(self): + x1 = paddle.full([], 2) + x1.stop_gradient = False + x1.retain_grads() + out1 = paddle.squeeze(x1, axis=0) + out1.retain_grads() + out1.backward() + self.assertEqual(out1.shape, []) + self.assertEqual(x1.grad.shape, []) + + x2 = paddle.full([], 3) + x3 = paddle.full([1], 0, dtype='int32') + x2.stop_gradient = False + x2.retain_grads() + out2 = paddle.squeeze(x2, axis=x3) + out2.retain_grads() + out2.backward() + self.assertEqual(out2.shape, []) + self.assertEqual(x2.grad.shape, []) + + def test_unsqueeze(self): + x1 = paddle.full([], 2) + x1.stop_gradient = False + x1.retain_grads() + out1 = paddle.unsqueeze(x1, axis=0) + out1.retain_grads() + out1.backward() + self.assertEqual(out1.shape, [1]) + self.assertEqual(x1.grad.shape, []) + + x2 = paddle.full([], 0, dtype='int32') + out2 = paddle.unsqueeze(x1, axis=x2) + out2.retain_grads() + out2.backward() + self.assertEqual(out2.shape, [1]) + self.assertEqual(x1.grad.shape, []) + + def test_t(self): + x = paddle.full([], 2.0) + x.stop_gradient = False + x.retain_grads() + out = paddle.t(x) + out.retain_grads() + out.backward() + self.assertEqual(out.shape, []) + self.assertEqual(out.grad.shape, []) + self.assertEqual(x.grad.shape, []) + + def test_prelu(self): + x1 = paddle.full([], 1.0, 'float32') + x1.stop_gradient = False + w1 = paddle.full([], 0.25, dtype='float32') + out1 = paddle.nn.functional.prelu(x1, w1) + out1.retain_grads() + out1.backward() + self.assertEqual(out1.shape, []) + self.assertEqual(out1.numpy(), 1.0) + self.assertEqual(out1.grad.shape, []) + self.assertEqual(x1.grad.shape, []) + self.assertEqual(x1.grad.numpy(), 1.0) + + x2 = paddle.full([], -1.0, 'float32') + x2.stop_gradient = False + w2 = paddle.full([], 0.25, dtype='float32') + out2 = paddle.nn.functional.prelu(x2, w2) + out2.retain_grads() + out2.backward() + self.assertEqual(out2.shape, []) + self.assertEqual(out2.numpy(), -0.25) + self.assertEqual(out2.grad.shape, []) + self.assertEqual(x2.grad.shape, []) + self.assertEqual(x2.grad.numpy(), 0.25) + + def test_while_loop(self): + def cond(i, x): + return paddle.less_than(i, eleven) + + def body(i, x): + x = x + i + i = i + 1 + return [i, x] + + i = paddle.full([], 1.0, dtype='float32') + i.stop_gradient = False + i.persistable = True + eleven = paddle.full([], 11, dtype='float32') + x = paddle.full([], 0.0, dtype='float32') + x.stop_gradient = False + x.persistable = True + out_i, out_x = paddle.static.nn.while_loop(cond, body, [i, x]) + + if in_dynamic_mode(): + out_x.backward() + di = i.grad + dx = x.grad + else: + grad_list = paddle.static.append_backward(out_x) + for p, g in grad_list: + if p.is_same(i): + di = g + elif p.is_same(x): + dx = g + place = ( + base.CUDAPlace(0) + if core.is_compiled_with_cuda() + else base.CPUPlace() + ) + exe = base.Executor(place) + main_program = paddle.static.default_main_program() + out_i, out_x, di, dx = exe.run( + main_program, feed={}, fetch_list=[out_i, out_x, di, dx] + ) + + self.assertEqual(np.asarray(out_i).shape, ()) + np.testing.assert_allclose(out_i, np.array(11)) + self.assertEqual(np.asarray(out_x).shape, ()) + np.testing.assert_allclose(out_x, np.array(55)) + self.assertEqual(np.asarray(di).shape, ()) + np.testing.assert_allclose(di, np.array(10)) + 
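+ # the loop body runs for i = 1..10, so out_x = 1 + 2 + ... + 10 = 55;
+ # each iteration adds the current i once, giving d(out_x)/di = 10 and
+ # d(out_x)/dx = 1, which the checks around here verify.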
self.assertEqual(np.asarray(dx).shape, ())
+ np.testing.assert_allclose(dx, np.array(1.0))
+
+ def test_to_tensor(self):
+ out1 = paddle.to_tensor(1)
+ out2 = paddle.to_tensor(2.5)
+
+ out1.retain_grads()
+ out1.backward()
+ out2.retain_grads()
+ out2.backward()
+
+ self.assertEqual(out1.shape, [])
+ self.assertEqual(out1, 1)
+ self.assertEqual(out2.shape, [])
+ self.assertEqual(out2, 2.5)
+
+ def test_matmul(self):
+ # 1) no transpose
+ x = paddle.randn([10])
+ x.stop_gradient = False
+ y = paddle.randn([10])
+ y.stop_gradient = False
+ out1 = paddle.matmul(x, y)
+ out1.retain_grads()
+ out1.backward()
+
+ self.assertEqual(out1.shape, [])
+ self.assertEqual(x.grad.shape, [10])
+ self.assertEqual(y.grad.shape, [10])
+
+ # 2) transpose x and y
+ x = paddle.randn([10])
+ x.stop_gradient = False
+ y = paddle.randn([10])
+ y.stop_gradient = False
+ out2 = paddle.matmul(x, y, True, True)
+ out2.retain_grads()
+ out2.backward()
+
+ self.assertEqual(out2.shape, [])
+ self.assertEqual(x.grad.shape, [10])
+ self.assertEqual(y.grad.shape, [10])
+
+ def test_linalg_slogdet(self):
+ # 2-D input
+ x = paddle.randn([3, 3])
+ x.stop_gradient = False
+ out = paddle.linalg.slogdet(x)
+ out.retain_grads()
+ out.backward()
+
+ self.assertEqual(out.shape, [2])
+ self.assertEqual(x.grad.shape, [3, 3])
+
+ # 3-D input
+ x1 = paddle.randn([3, 3, 3])
+ x1.stop_gradient = False
+ out1 = paddle.linalg.slogdet(x1)
+ out1.retain_grads()
+ out1.backward()
+
+ self.assertEqual(out1.shape, [2, 3])
+ self.assertEqual(x1.grad.shape, [3, 3, 3])
+
+ def test_multi_dot(self):
+ a = paddle.randn([4])
+ a.stop_gradient = False
+ b = paddle.randn([4, 5])
+ b.stop_gradient = False
+ c = paddle.randn([5])
+ c.stop_gradient = False
+
+ out = paddle.linalg.multi_dot([a, b, c])
+ out.retain_grads()
+ out.backward()
+
+ self.assertEqual(out.shape, [])
+ self.assertEqual(a.grad.shape, [4])
+ self.assertEqual(b.grad.shape, [4, 5])
+ self.assertEqual(c.grad.shape, [5])
+
+ def test_cov(self):
+ xt = paddle.randn((3, 4))
+ xt.stop_gradient = False
+ xt_1 = paddle.randn((12,))
+ xt_1.stop_gradient = False
+
+ xt_out = paddle.linalg.cov(xt)
+ xt_out.retain_grads()
+ xt_out.backward()
+ self.assertEqual(xt_out.shape, [3, 3])
+ self.assertEqual(xt.grad.shape, [3, 4])
+
+ xt_1_out = paddle.linalg.cov(xt_1)
+ xt_1.retain_grads()
+ xt_1_out.backward()
+ self.assertEqual(xt_1_out.shape, [])
+ self.assertEqual(xt_1.grad.shape, [12])
+
+ def test_corrcoef(self):
+ x = paddle.randn((12,))
+ x.stop_gradient = False
+ out = paddle.linalg.corrcoef(x)
+ out.backward()
+
+ self.assertEqual(out.shape, [])
+ self.assertEqual(x.grad.shape, [12])
+
+ def test_det(self):
+ xt = paddle.randn([3, 3, 3])
+ xt.stop_gradient = False
+ xt_1 = paddle.randn([3, 3])
+ xt_1.stop_gradient = False
+
+ xt_out = paddle.linalg.det(xt)
+ xt.retain_grads()
+ xt_out.backward()
+ self.assertEqual(xt_out.shape, [3])
+ self.assertEqual(xt.grad.shape, [3, 3, 3])
+
+ xt_1_out = paddle.linalg.det(xt_1)
+ xt_1.retain_grads()
+ xt_1_out.backward()
+ self.assertEqual(xt_1_out.shape, [])
+ self.assertEqual(xt_1.grad.shape, [3, 3])
+
+ def test_dist(self):
+ x = paddle.to_tensor([[3, 3], [3, 3]], dtype="float32")
+ y = paddle.to_tensor([[3, 3], [3, 1]], dtype="float32")
+ x.stop_gradient = False
+ y.stop_gradient = False
+ out = paddle.dist(x, y, 0)
+ out.backward()
+
+ self.assertEqual(out.shape, [])
+ np.testing.assert_allclose(out, np.array(1))
+ self.assertEqual(x.grad.shape, [2, 2])
+ self.assertEqual(y.grad.shape, [2, 2])
+
+ def test_linalg_norm(self):
+ # 1D input, p = fro, axis = None, using reduceInferMeta
+ x_1 = paddle.arange(24, dtype="float32") - 12
+ x_1.stop_gradient = False
+ out_1 = paddle.linalg.norm(x_1)
+ out_1.retain_grads()
+ out_1.backward()
+
+ self.assertEqual(out_1.shape, [])
+ self.assertEqual(x_1.grad.shape, [24])
+
+ # 1D input, p = 1, axis = None,
+ # using p_norm, as_vector = True
+ x_2 = paddle.arange(24, dtype="float32") - 12
+ x_2.stop_gradient = False
+ out_2 = paddle.linalg.norm(x_2, p=1)
+ out_2.retain_grads()
+ out_2.backward()
+
+ self.assertEqual(out_2.shape, [])
+ self.assertEqual(x_2.grad.shape, [24])
+
+ # 1D input, p = 1, axis = 0,
+ # using p_norm, as_vector = False
+ x_2_p = paddle.arange(24, dtype="float32") - 12
+ x_2_p.stop_gradient = False
+ out_2_p = paddle.linalg.norm(x_2_p, p=1, axis=0)
+ out_2_p.retain_grads()
+ out_2_p.backward()
+
+ self.assertEqual(out_2_p.shape, [])
+ self.assertEqual(x_2_p.grad.shape, [24])
+
+ # 1D input, p = fro, axis = 0,
+ # using p_norm, as_vector = False
+ x_2_fro = paddle.arange(24, dtype="float32") - 12
+ x_2_fro.stop_gradient = False
+ out_2_fro = paddle.linalg.norm(x_2_fro, p="fro", axis=0)
+ out_2_fro.retain_grads()
+ out_2_fro.backward()
+
+ self.assertEqual(out_2_fro.shape, [])
+ self.assertEqual(x_2_fro.grad.shape, [24])
+
+ # 2D input, p = 1, axis = [0, 1]
+ # using p_matrix_norm, depends on paddle.sum
+ x_3 = paddle.arange(24, dtype="float32").reshape([4, 6])
+ x_3.stop_gradient = False
+ out_3 = paddle.linalg.norm(x_3, p=1, axis=[0, 1])
+ out_3.retain_grads()
+ out_3.backward()
+ self.assertEqual(out_3.shape, [])
+ self.assertEqual(x_3.grad.shape, [4, 6])
+
+ # 2D input, p = 1, axis = None
+ # using p_matrix_norm, depends on paddle.sum
+ x_4 = paddle.arange(24, dtype="float32").reshape([4, 6])
+ x_4.stop_gradient = False
+ out_4 = paddle.linalg.norm(x_4)
+ out_4.retain_grads()
+ out_4.backward()
+ self.assertEqual(out_4.shape, [])
+ self.assertEqual(x_4.grad.shape, [4, 6])
+
+ # 2D input, p = 2, axis = [0, 1]
+ # using p_matrix_norm, depends on paddle.sum
+ x_5 = paddle.arange(24, dtype="float32").reshape([4, 6])
+ x_5.stop_gradient = False
+ out_5 = paddle.linalg.norm(x_5, p=2, axis=[0, 1])
+ out_5.retain_grads()
+ out_5.backward()
+
+ self.assertEqual(out_5.shape, [])
+ self.assertEqual(x_5.grad.shape, [4, 6])
+
+ # 2D input, p = -inf, axis = [0, 1]
+ x_6 = paddle.arange(24, dtype="float32").reshape([4, 6])
+ x_6.stop_gradient = False
+ out_6 = paddle.linalg.norm(x_6, p=-float("inf"), axis=[0, 1])
+ out_6.retain_grads()
+ out_6.backward()
+
+ self.assertEqual(out_6.shape, [])
+ self.assertEqual(x_6.grad.shape, [4, 6])
+
+ def test_linalg_cond(self):
+ def assert_shape(out):
+ self.assertEqual(out.shape, [])
+
+ x1 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
+ x1.stop_gradient = False
+ # p = 2 : use paddle.sum
+ out = paddle.linalg.cond(x1)
+ out.backward()
+ assert_shape(out)
+ self.assertEqual(x1.grad.shape, [3, 3])
+
+ # p = fro : use paddle.sum
+ x2 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
+ x2.stop_gradient = False
+ out_fro = paddle.linalg.cond(x2, p='fro')
+ out_fro.backward()
+ assert_shape(out_fro)
+ self.assertEqual(x2.grad.shape, [3, 3])
+
+ # p = nuc : use paddle.sum
+ x3 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
+ x3.stop_gradient = False
+ out_nuc = paddle.linalg.cond(x3, p='nuc')
+ out_nuc.backward()
+ assert_shape(out_nuc)
+ self.assertEqual(x3.grad.shape, [3, 3])
+
+ # p in (-1, 1) : use paddle.sum
+ x4 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
+ x4.stop_gradient = False
+ out_1 = paddle.linalg.cond(x4,
p=1) + out_1.backward() + assert_shape(out_1) + self.assertEqual(x4.grad.shape, [3, 3]) + + x5 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) + x5.stop_gradient = False + out_minus_1 = paddle.linalg.cond(x5, p=-1) + out_minus_1.backward() + assert_shape(out_minus_1) + self.assertEqual(x5.grad.shape, [3, 3]) + + # p in (-2, 2) depends on paddle.sum + x6 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) + x6.stop_gradient = False + out_2 = paddle.linalg.cond(x6, p=2) + out_2.backward() + assert_shape(out_2) + self.assertEqual(x6.grad.shape, [3, 3]) + + # p in (-inf, inf):use paddle.sum + x8 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) + x8.stop_gradient = False + out_inf = paddle.linalg.cond(x8, p=float("inf")) + out_inf.backward() + assert_shape(out_inf) + self.assertEqual(x8.grad.shape, [3, 3]) + + a = paddle.randn([2, 4, 4]) + a.stop_gradient = False + a_cond_fro = paddle.linalg.cond(a, p='fro') + a_cond_fro.backward() + self.assertEqual(len(a_cond_fro.shape), 1) + self.assertEqual(a.grad.shape, [2, 4, 4]) + + def test_trace(self): + x = paddle.to_tensor([[3, 2], [1, 9]], dtype="float32") + x.stop_gradient = False + out = paddle.trace(x) + out.backward() + + self.assertEqual(out.shape, []) + np.testing.assert_allclose(out, np.array(12)) + self.assertEqual(x.grad.shape, [2, 2]) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_zero_dim_sundry_static_api_part1.py b/test/legacy_test/test_zero_dim_sundry_static_api_part1.py new file mode 100644 index 0000000000000..c8d5ef8bdc93f --- /dev/null +++ b/test/legacy_test/test_zero_dim_sundry_static_api_part1.py @@ -0,0 +1,916 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: +# 0D Tensor indicates that the tensor's dimension is 0 +# 0D Tensor's shape is always [], numel is 1 +# which can be created by paddle.rand([]) + +import unittest + +import numpy as np +from decorator_helper import prog_scope + +import paddle +from paddle.pir_utils import test_with_pir_api + +# Use to test zero-dim of Sundry API, which is unique and can not be classified +# with others. It can be implemented here flexibly. 
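+#
+# A minimal dygraph sketch of the 0-D convention exercised throughout this
+# file (illustrative only, not executed by the tests below):
+#
+#     t = paddle.rand([])   # 0-D tensor created from an empty shape list
+#     t.shape               # -> []
+#     t.ndim                # -> 0, while t still holds exactly one element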
+ + +class TestSundryAPIStatic(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.exe = paddle.static.Executor() + + def assertShapeEqual(self, out, target_tuple): + if not paddle.framework.in_pir_mode(): + out_shape = list(out.shape) + else: + out_shape = out.shape + self.assertEqual(out_shape, target_tuple) + + @test_with_pir_api + @prog_scope() + def test_polygamma(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.polygamma(x, 2) + grad_list = paddle.static.append_backward(out, parameter_list=[x]) + x_grad = grad_list[0][1] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, x_grad]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_frexp(self): + x = paddle.rand([]) + x.stop_gradient = False + out1, out2 = paddle.frexp(x) + grad_list = paddle.static.append_backward(out1, parameter_list=[x]) + x_grad = grad_list[0][1] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out1, out2, x_grad]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_pairwise_distance(self): + x = paddle.rand([5]) + x.stop_gradient = False + y = paddle.rand([5]) + y.stop_gradient = False + + out = paddle.nn.functional.pairwise_distance(x, y) + grad_list = paddle.static.append_backward(out, parameter_list=[x, y]) + x_grad, y_grad = (_grad for _param, _grad in grad_list) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, x_grad, y_grad]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (5,)) + self.assertEqual(res[2].shape, (5,)) + + @test_with_pir_api + @prog_scope() + def test_take(self): + x1 = paddle.rand([4, 5]) + x1.stop_gradient = False + out1 = paddle.take(x1, paddle.to_tensor(2)) + x1_grad = paddle.static.append_backward(out1, parameter_list=[x1]) + x1_grad = x1_grad[0][1] + + x2 = paddle.rand([]) + x2.stop_gradient = False + out2 = paddle.take(x2, paddle.to_tensor(0)) + x2_grad = paddle.static.append_backward(out2, parameter_list=[x2]) + x2_grad = x2_grad[0][1] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out1, x1_grad, out2, x2_grad]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (4, 5)) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + np.testing.assert_allclose(res[3], 1.0) + + @test_with_pir_api + @prog_scope() + def test_trapezoid(self): + y = paddle.rand([5]) + y.stop_gradient = False + out = paddle.trapezoid(y, dx=2.0) + grad_list = paddle.static.append_backward(out, parameter_list=[y]) + y_grad = grad_list[0][1] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, y_grad]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (5,)) + + @prog_scope() + def test_create_parameter_var(self): + zero_dim_param = paddle.create_parameter(shape=[], dtype='float32') + self.assertShapeEqual(zero_dim_param, []) + prog = paddle.static.default_startup_program() + res = self.exe.run(prog, fetch_list=[zero_dim_param]) + self.assertEqual(res[0].shape, ()) + + zero_dim_var = paddle.static.create_global_var( + shape=[], value=0.5, dtype='float32' + ) + self.assertEqual(zero_dim_var.shape, ()) + prog = paddle.static.default_startup_program() + res = self.exe.run(prog, fetch_list=[zero_dim_var]) + 
self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 0.5) + + @prog_scope() + def test_getitem(self): + # case1: When all axis have a scalar indice, output should be a 0-d Tensor; + x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) + x.stop_gradient = False + out = x[1, 2, 3, 4] + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + x_out_grad = [_grad for _param, _grad in grad_list] + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + x_out_grad) + + self.assertEqual(res[0].shape, ()) + np.testing.assert_allclose(res[0], np.array(119)) + self.assertEqual(res[2].shape, ()) + np.testing.assert_allclose(res[2], 1.0) + self.assertEqual(res[1].shape, (2, 3, 4, 5)) + x_grad_expected = np.zeros((2, 3, 4, 5)) + x_grad_expected[1, 2, 3, 4] = 1.0 + np.testing.assert_allclose(res[1], x_grad_expected) + + # case2: When one axis has a 0-d Tensor indice, the output should be same as int indice. + x2 = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) + out1 = x2[1, 2] + out2 = x2[ + paddle.full([], 1, dtype='int32'), paddle.full([], 2, dtype='int32') + ] + res = self.exe.run(prog, fetch_list=[out1, out2]) + np.testing.assert_allclose(res[0], res[1]) + + # case3: When all axis have a scalar indice (i.e. case1) and has None indice, + # ndim of output should be same with numbers of None. + x3 = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) + out3 = x3[1, 2, None, 3, 4] + out4 = x3[1, None, 2, None, 3, 4] + res = self.exe.run(prog, fetch_list=[out3, out4]) + self.assertEqual(res[0].shape, (1,)) + np.testing.assert_allclose(res[0], np.array([119])) + self.assertEqual(res[1].shape, (1, 1)) + np.testing.assert_allclose(res[1], np.array([[119]])) + + # case4: 1-D Tensor will be treated as vector, no axis decrease will happen. + x4 = paddle.ones((2, 3, 4)) + indice = paddle.ones([1], dtype='int32') + out5 = x4[indice] + out6 = x4[indice, indice] + res = self.exe.run(prog, fetch_list=[out5, out6]) + + self.assertEqual(res[0].shape, (1, 3, 4)) + np.testing.assert_allclose(res[0], np.ones((1, 3, 4))) + self.assertEqual(res[1].shape, (1, 4)) + np.testing.assert_allclose(res[1], np.ones((1, 4))) + + @prog_scope() + def test_setitem(self): + # NOTE(zoooo0820): __setitem__ has gradient problem in static graph. + # To solve this, we may not support __setitem__ in static graph. + # These unit tests will delete soon. + + # case1: all axis have a scalar indice + x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) + x.stop_gradient = False + out = x * 2 + out = paddle.static.setitem(out, (1, 2, 3, 4), 10) + paddle.static.append_backward(out.sum()) + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, x.grad_name]) + + self.assertEqual(out.shape, x.shape) + np.testing.assert_allclose(res[0][1, 2, 3, 4], np.array(10)) + self.assertEqual(res[1].shape, (2, 3, 4, 5)) + x_grad_expected = np.ones((2, 3, 4, 5)) * 2 + x_grad_expected[1, 2, 3, 4] = 0 + np.testing.assert_allclose(res[1], x_grad_expected) + + # case2: 0-D Tensor indice in some axis + # NOTE(zoooo0820): Now, int/slice with 0-D Tensor will still be + # treated as combined indexing, which is not support backward. + # There should have more test cases such as out[1, indice, :] = 0.5 when this + # problem is fixed. 
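+        # For illustration only, a sketch of the combined-indexing pattern
+        # described above, kept as a comment since backward would fail:
+        #
+        #     out = paddle.static.setitem(out, (1, indice, slice(None)), 0.5)
+        #     paddle.static.append_backward(out.sum())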
+ x = paddle.randn((2, 3, 4, 5)) + x.stop_gradient = False + indice = paddle.full([], 1, dtype='int32') + out = x * 1 + out = paddle.static.setitem(out, (indice, indice), 0.5) + paddle.static.append_backward(out.sum()) + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, x.grad_name]) + + self.assertEqual(out.shape, x.shape) + np.testing.assert_allclose(res[0][1, 1], np.ones((4, 5)) * 0.5) + x_grad_expected = np.ones((2, 3, 4, 5)) + x_grad_expected[1, 1] = 0 + np.testing.assert_allclose(res[1], x_grad_expected) + + # case3:0-D Tensor indice in some axis, value is a Tensor + # and there is broadcast + x = paddle.randn((2, 3, 4, 5)) + x.stop_gradient = False + v = paddle.ones((4, 5), dtype='float32') * 5 + v.stop_gradient = False + indice = paddle.full([], 1, dtype='int32') + out = x * 1 + out = paddle.static.setitem(out, indice, v) + paddle.static.append_backward(out.sum()) + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, x.grad_name, v.grad_name]) + + self.assertEqual(out.shape, x.shape) + np.testing.assert_allclose(res[0][1], np.ones((3, 4, 5)) * 5) + x_grad_expected = np.ones((2, 3, 4, 5)) + x_grad_expected[1] = 0 + np.testing.assert_allclose(res[1], x_grad_expected) + + @test_with_pir_api + @prog_scope() + def test_expand(self): + x = paddle.full([], 1, 'float32') + x.stop_gradient = False + out = paddle.expand(x, shape=[1]) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x, out] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 1.0) + self.assertEqual(res[1].shape, (1,)) + self.assertEqual(res[1], 1.0) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 1.0) + self.assertEqual(res[3].shape, (1,)) + self.assertEqual(res[3], 1.0) + + x1 = paddle.full([], 1, 'float32') + x1.stop_gradient = False + out1 = paddle.expand(x1, shape=[]) + grad_list = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, out1] + ) + grad_list = [_grad for _param, _grad in grad_list] + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x1, out1] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 1.0) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[1], 1.0) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 1.0) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1.0) + + x2 = paddle.full([], 1, 'float32') + x2.stop_gradient = False + out2 = paddle.expand(x2, shape=[3, 3]) + grad_list = paddle.static.append_backward( + out2.sum(), parameter_list=[x2, out2] + ) + grad_list = [_grad for _param, _grad in grad_list] + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x2, out2] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 1.0) + self.assertEqual(res[1].shape, (3, 3)) + self.assertEqual(res[1].any(), 1.0) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 9) + self.assertEqual(res[3].shape, (3, 3)) + self.assertEqual(res[3].any(), 1.0) + + @test_with_pir_api + @prog_scope() + def test_expand_as(self): + x = paddle.full([], 1, 'float32') + x.stop_gradient = False + y = paddle.full([], 1, 'float32') + y.stop_gradient = False + out = paddle.expand_as(x, y) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + grad_list 
= [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x, out] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 1.0) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[1], 1.0) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 1.0) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1.0) + + x1 = paddle.full([], 1, 'float32') + x1.stop_gradient = False + y1 = paddle.full([1], 1, 'float32') + y1.stop_gradient = False + out1 = paddle.expand_as(x1, y1) + grad_list = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, out1] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x1, out1] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 1.0) + self.assertEqual(res[1].shape, (1,)) + self.assertEqual(res[1], 1.0) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 1.0) + self.assertEqual(res[3].shape, (1,)) + self.assertEqual(res[3], 1.0) + + x2 = paddle.full([], 1, 'float32') + x2.stop_gradient = False + y2 = paddle.full([3, 3], 1, 'float32') + y2.stop_gradient = False + out2 = paddle.expand_as(x2, y2) + grad_list = paddle.static.append_backward( + out2.sum(), parameter_list=[x2, out2] + ) + grad_list = [_grad for _param, _grad in grad_list] + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x2, out2] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 1.0) + self.assertEqual(res[1].shape, (3, 3)) + self.assertEqual(res[1].any(), 1.0) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 9) + self.assertEqual(res[3].shape, (3, 3)) + self.assertEqual(res[3].any(), 1.0) + + @test_with_pir_api + @prog_scope() + def test_top_k(self): + x = paddle.full([], 1, 'float32') + x.stop_gradient = False + out, indices = paddle.topk(x, k=1, axis=0) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + grad_list = [_grad for _param, _grad in grad_list] + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x, out, indices] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 1.0) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[1], 1.0) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 0.0) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1.0) + self.assertEqual(res[4].shape, ()) + self.assertEqual(res[4], 1.0) + + x1 = paddle.full([], 1, 'float32') + x1.stop_gradient = False + out1, indices1 = paddle.topk(x1, k=1, axis=-1) + grad_list = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, out1] + ) + grad_list = [_grad for _param, _grad in grad_list] + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x1, out1, indices1] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 1.0) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[1], 1.0) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 0.0) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1.0) + self.assertEqual(res[4].shape, ()) + self.assertEqual(res[4], 1.0) + + with self.assertRaises(ValueError): + tmp = paddle.topk(x1, k=1, axis=2) + + @test_with_pir_api + @prog_scope() + def test_broadcast_to(self): + x = paddle.full([], 1, 'float32') + x.stop_gradient = False + out = 
paddle.broadcast_to(x, shape=[1])
+        grad_list = paddle.static.append_backward(
+            out.sum(), parameter_list=[x, out]
+        )
+        grad_list = [_grad for _param, _grad in grad_list]
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[x, out] + grad_list)
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[0], 1.0)
+        self.assertEqual(res[1].shape, (1,))
+        self.assertEqual(res[1], 1.0)
+        self.assertEqual(res[2].shape, ())
+        self.assertEqual(res[2], 1.0)
+        self.assertEqual(res[3].shape, (1,))
+        self.assertEqual(res[3], 1.0)
+
+        x1 = paddle.full([], 1, 'float32')
+        x1.stop_gradient = False
+        out1 = paddle.broadcast_to(x1, shape=[])
+        grad_list = paddle.static.append_backward(
+            out1.sum(), parameter_list=[x1, out1]
+        )
+        grad_list = [_grad for _param, _grad in grad_list]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[x1, out1] + grad_list)
+
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[0], 1.0)
+        self.assertEqual(res[1].shape, ())
+        self.assertEqual(res[1], 1.0)
+        self.assertEqual(res[2].shape, ())
+        self.assertEqual(res[2], 1.0)
+        self.assertEqual(res[3].shape, ())
+        self.assertEqual(res[3], 1.0)
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_argmin(self):
+        # 1) x is 0D
+        x = paddle.rand([])
+        out1 = paddle.argmin(x, 0)
+        out2 = paddle.argmin(x, -1)
+        out3 = paddle.argmin(x, None)
+
+        # 2) x is ND
+        x4 = paddle.rand([3, 5])
+        out4 = paddle.argmin(x4, None)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(
+            prog,
+            fetch_list=[
+                out1,
+                out2,
+                out3,
+                out4,
+            ],
+        )
+        self.assertEqual(res[0].shape, ())
+        np.testing.assert_allclose(res[0], 0.0)
+        self.assertEqual(res[1].shape, ())
+        np.testing.assert_allclose(res[1], 0.0)
+        self.assertEqual(res[2].shape, ())
+        np.testing.assert_allclose(res[2], 0.0)
+        self.assertEqual(res[3].shape, ())
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_argmax(self):
+        # 1) x is 0D
+        x = paddle.rand([])
+        out1 = paddle.argmax(x, 0)
+        out2 = paddle.argmax(x, -1)
+        out3 = paddle.argmax(x, None)
+
+        # 2) x is ND
+        x4 = paddle.rand([3, 5])
+        out4 = paddle.argmax(x4, None)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(
+            prog,
+            fetch_list=[
+                out1,
+                out2,
+                out3,
+                out4,
+            ],
+        )
+        self.assertEqual(res[0].shape, ())
+        np.testing.assert_allclose(res[0], 0.0)
+        self.assertEqual(res[1].shape, ())
+        np.testing.assert_allclose(res[1], 0.0)
+        self.assertEqual(res[2].shape, ())
+        np.testing.assert_allclose(res[2], 0.0)
+        self.assertEqual(res[3].shape, ())
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_kthvalue(self):
+        # 1) x is 0D
+        x = paddle.rand([])
+        x.stop_gradient = False
+        out, index = paddle.kthvalue(x, 1)
+        grad_list = paddle.static.append_backward(out, parameter_list=[x])
+        grad_list = [_grad for _param, _grad in grad_list]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[x, out, index] + grad_list)
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, ())
+        self.assertTrue(res[1] == res[0])
+        self.assertEqual(res[2].shape, ())
+        self.assertTrue(res[2] == 0)
+
+        self.assertEqual(res[3].shape, ())
+        self.assertTrue(res[3] == 1.0)
+
+        # 2) x is 1D
+        x1 = paddle.rand([5])
+        x1.stop_gradient = False
+        out1, index1 = paddle.kthvalue(x1, 1)
+        grad_list = paddle.static.append_backward(out1, parameter_list=[x1])
+        grad_list = [_grad for _param, _grad in grad_list]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out1, index1] + grad_list)
+
self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (5,)) + + @test_with_pir_api + @prog_scope() + def test_mode(self): + # 1) x is 0D + x = paddle.rand([]) + x.stop_gradient = False + out, index = paddle.mode(x) + grad_list = paddle.static.append_backward(out, parameter_list=[x]) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, index] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertTrue(res[2] == 1.0) + + # 2) x is 1D + x1 = paddle.rand([5]) + x1.stop_gradient = False + out1, index1 = paddle.mode(x1) + grad_list = paddle.static.append_backward(out1, parameter_list=[x1]) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out1, index1] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (5,)) + + @test_with_pir_api + @prog_scope() + def test_is_empty(self): + # 1) x is 0D + x1 = paddle.rand([]) + out1 = paddle.is_empty(x1) + + # 2) x is 1D + x2 = paddle.rand([5]) + out2 = paddle.is_empty(x2) + + # 3) x is ND + x3 = paddle.rand([3, 5]) + out3 = paddle.is_empty(x3) + + x4 = paddle.rand([3, 0, 5]) + out4 = paddle.is_empty(x4) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[out1, out2, out3, out4], + ) + + self.assertEqual(res[0].shape, ()) + self.assertFalse(bool(res[0])) + self.assertEqual(res[1].shape, ()) + self.assertFalse(bool(res[1])) + self.assertEqual(res[2].shape, ()) + self.assertFalse(bool(res[2])) + self.assertEqual(res[3].shape, ()) + self.assertTrue(bool(res[3])) + + @test_with_pir_api + @prog_scope() + def test_as_complex(self): + x = paddle.rand([2]) + x.stop_gradient = False + out = paddle.as_complex(x) + self.assertShapeEqual( + x, + [ + 2, + ], + ) + self.assertShapeEqual(out, []) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[x, out] + grad_list, + ) + + self.assertEqual(res[0].shape, (2,)) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (2,)) + self.assertEqual(res[3].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_dot(self): + # 1) x is 1d + x = paddle.rand([2]) + x.stop_gradient = False + y = paddle.rand([2]) + y.stop_gradient = False + out = paddle.dot(x, y) + + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + x_grad = grad_list[0][1] + out_grad = grad_list[1][1] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[x, x_grad, out, out_grad], + ) + + self.assertEqual(res[0].shape, (2,)) + self.assertEqual(res[1].shape, (2,)) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + + # 2) x is 2D + x1 = paddle.rand([2, 2]) + x1.stop_gradient = False + y1 = paddle.rand([2, 2]) + y1.stop_gradient = False + out1 = paddle.dot(x1, y1) + + grad_list = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, out1] + ) + x1_grad = grad_list[0][1] + out1_grad = grad_list[1][1] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[x1, x1_grad, out1, out1_grad], + ) + + 
self.assertEqual(res[0].shape, (2, 2)) + self.assertEqual(res[1].shape, (2, 2)) + self.assertEqual(res[2].shape, (2,)) + self.assertEqual(res[3].shape, (2,)) + + @test_with_pir_api + @prog_scope() + def test_inner(self): + # 1) input is 1D + x1 = paddle.rand([2]) + x1.stop_gradient = False + y1 = paddle.rand([2]) + y1.stop_gradient = False + out1 = paddle.inner(x1, y1) + grad_list = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, out1] + ) + x1_grad = grad_list[0][1] + out1_grad = grad_list[1][1] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + x1, + x1_grad, + out1, + out1_grad, + ], + ) + self.assertEqual(res[0].shape, (2,)) + self.assertEqual(res[1].shape, (2,)) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + + # 2) input is 2D + x = paddle.rand([2, 3]) + x.stop_gradient = False + y = paddle.rand([2, 3]) + y.stop_gradient = False + out = paddle.inner(x, y) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + x_grad = grad_list[0][1] + out_grad = grad_list[1][1] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + x, + x_grad, + out, + out_grad, + ], + ) + + self.assertEqual(res[0].shape, (2, 3)) + self.assertEqual(res[1].shape, (2, 3)) + self.assertEqual(res[2].shape, (2, 2)) + self.assertEqual(res[3].shape, (2, 2)) + + @prog_scope() + def test_tensordot(self): + x = paddle.full(shape=[10], fill_value=0.25, dtype='float64') + x.stop_gradient = False + y = paddle.full(shape=[10], fill_value=0.25, dtype='float64') + y.stop_gradient = False + out = paddle.tensordot(x, y, axes=1) + + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + x_grad = grad_list[0][1] + out_grad = grad_list[1][1] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[x, x_grad, out, out_grad], + ) + + self.assertEqual(res[0].shape, (10,)) + self.assertEqual(res[1].shape, (10,)) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + + x = paddle.arange(6, dtype='float64').reshape([2, 3]) + y = paddle.arange(6, dtype='float64').reshape([2, 3]) + x.stop_gradient = False + out = paddle.tensordot(x, y, axes=2) + + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + x_grad = grad_list[0][1] + out_grad = grad_list[1][1] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[x, x_grad, out, out_grad], + ) + + self.assertEqual(res[0].shape, (2, 3)) + self.assertEqual(res[1].shape, (2, 3)) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_metric_accuracy(self): + x = paddle.full(shape=[2, 4], fill_value=0.25) + y = paddle.full(shape=[2, 1], fill_value=1, dtype="int64") + out = paddle.metric.accuracy(input=x, label=y, k=1) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[out], + ) + + self.assertEqual(res[0].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_static_accuracy(self): + x = paddle.full(shape=[2, 4], fill_value=0.25) + y = paddle.full(shape=[2, 1], fill_value=1, dtype="int64") + out = paddle.static.accuracy(input=x, label=y, k=1) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[out], + ) + + self.assertEqual(res[0].shape, ()) + + @prog_scope() + def test_static_auc(self): + x = paddle.full(shape=[3, 2], fill_value=0.25) + y 
= paddle.full(shape=[3], fill_value=1, dtype="int64") + out = paddle.static.auc(input=x, label=y)[0] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[out], + ) + + self.assertEqual(res[0].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_std(self): + x = paddle.rand([]) + x.stop_gradient = False + out1 = paddle.std(x) + out2 = paddle.std(x, []) + grad_list = paddle.static.append_backward( + out1, parameter_list=[x, out1] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + x, + out1, + out2, + ] + + grad_list, + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[4].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_var(self): + x = paddle.rand([]) + x.stop_gradient = False + out1 = paddle.var(x) + out2 = paddle.var(x, []) + grad_list = paddle.static.append_backward( + out1, parameter_list=[x, out1] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + x, + out1, + out2, + ] + + grad_list, + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[4].shape, ()) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_zero_dim_sundry_static_api_part2.py b/test/legacy_test/test_zero_dim_sundry_static_api_part2.py new file mode 100644 index 0000000000000..fd7f2cef323a9 --- /dev/null +++ b/test/legacy_test/test_zero_dim_sundry_static_api_part2.py @@ -0,0 +1,1030 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: +# 0D Tensor indicates that the tensor's dimension is 0 +# 0D Tensor's shape is always [], numel is 1 +# which can be created by paddle.rand([]) + +import unittest + +import numpy as np +from decorator_helper import prog_scope + +import paddle +from paddle.pir_utils import test_with_pir_api + +# Use to test zero-dim of Sundry API, which is unique and can not be classified +# with others. It can be implemented here flexibly. 
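+#
+# The static tests below share one pattern: paddle.static.append_backward
+# returns (parameter, gradient) pairs, and only the gradient variables are
+# kept for fetching. A condensed sketch of that flow (illustrative only):
+#
+#     grad_list = paddle.static.append_backward(out, parameter_list=[x, out])
+#     grads = [g for _param, g in grad_list]
+#     res = exe.run(prog, fetch_list=[out] + grads)
+#     res[0].shape          # -> (), 0-D results fetch as shape-() ndarrays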
+
+
+class TestSundryAPIStatic(unittest.TestCase):
+    def setUp(self):
+        paddle.enable_static()
+        self.exe = paddle.static.Executor()
+
+    def assertShapeEqual(self, out, target_tuple):
+        if not paddle.framework.in_pir_mode():
+            out_shape = list(out.shape)
+        else:
+            out_shape = out.shape
+        self.assertEqual(out_shape, target_tuple)
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_quantile(self):
+        x1 = paddle.rand([])
+        x1.stop_gradient = False
+        out1 = paddle.quantile(x1, 0.5, axis=None)
+        grad_list1 = paddle.static.append_backward(
+            out1, parameter_list=[x1, out1]
+        )
+        grad_list1 = [_grad for _param, _grad in grad_list1]
+
+        x2 = paddle.rand([2, 3])
+        x2.stop_gradient = False
+        out2 = paddle.quantile(x2, 0.5, axis=None)
+        grad_list2 = paddle.static.append_backward(
+            out2, parameter_list=[x2, out2]
+        )
+        grad_list2 = [_grad for _param, _grad in grad_list2]
+
+        out_empty_list = paddle.quantile(x1, 0.5, axis=[])
+        self.assertShapeEqual(out_empty_list, [])
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(
+            prog,
+            fetch_list=[
+                out1,
+                out2,
+            ]
+            + grad_list1
+            + grad_list2,
+        )
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, ())
+
+        self.assertEqual(res[2].shape, ())
+        self.assertEqual(res[3].shape, ())
+        self.assertEqual(res[3], 1.0)
+
+        self.assertEqual(res[4].shape, (2, 3))
+        self.assertEqual(res[5].shape, ())
+        self.assertEqual(res[5], 1.0)
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_nanquantile(self):
+        # 1) x is 0D
+        x1 = paddle.rand([])
+        x1.stop_gradient = False
+        out1 = paddle.nanquantile(x1, 0.5, axis=None)
+        grad_list = paddle.static.append_backward(out1, parameter_list=[x1])
+        x1_grad = grad_list[0][1]
+
+        # 2) x is ND with 'nan'
+        x2 = paddle.to_tensor([[float('nan'), 2.0, 3.0], [0.0, 1.0, 2.0]])
+        x2.stop_gradient = False
+        out2 = paddle.nanquantile(x2, 0.5, axis=None)
+        grad_list = paddle.static.append_backward(out2, parameter_list=[x2])
+        x2_grad = grad_list[0][1]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(
+            prog,
+            fetch_list=[
+                out1,
+                x1_grad,
+                out2,
+                x2_grad,
+            ],
+        )
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, ())
+
+        self.assertEqual(res[2].shape, ())
+        self.assertEqual(res[3].shape, (2, 3))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_flip(self):
+        x = paddle.rand([])
+        x.stop_gradient = False
+        out = paddle.flip(x, axis=[])
+        grad_list = paddle.static.append_backward(out, parameter_list=[x, out])
+        grad_list = [_grad for _param, _grad in grad_list]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[x, out] + grad_list)
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, ())
+        self.assertEqual(res[2].shape, ())
+        self.assertEqual(res[3].shape, ())
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_equal_scalar(self):
+        x = paddle.rand([])
+        out = paddle.equal(x, 2.0)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[0], False)
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_pow_scalar(self):
+        x = paddle.rand([])
+        x.stop_gradient = False
+        out = paddle.pow(x, 2.0)
+        grad_list = paddle.static.append_backward(out, parameter_list=[x, out])
+        grad_list = [_grad for _param, _grad in grad_list]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[x, out] + grad_list)
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, ())
+
self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_cast(self): + x = paddle.full([], 1.0, 'float32') + x.stop_gradient = False + out = paddle.cast(x, 'int32') + grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x, out] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_cumprod(self): + x = paddle.full([], 1.0, 'float32') + x.stop_gradient = False + out = paddle.cumprod(x, 0) + grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + + with self.assertRaises(ValueError): + tmp = paddle.cumprod(x, 2) + + @test_with_pir_api + @prog_scope() + def test_clip(self): + x = paddle.uniform([], None, -10, 10) + x.stop_gradient = False + out = paddle.clip(x, -5, 5) + grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) + x_grad, out_grad = (_grad for _param, _grad in grad_list) + + x1 = paddle.uniform([], None, -10, 10) + x1.stop_gradient = False + out1 = paddle.clip(x1, paddle.full([], -5.0), paddle.full([], 5.0)) + grad_list = paddle.static.append_backward( + out1, parameter_list=[x1, out1] + ) + x1_grad, out1_grad = (_grad for _param, _grad in grad_list) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + x, + out, + x_grad, + out_grad, + x1, + out1, + x1_grad, + out1_grad, + ], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[4].shape, ()) + self.assertEqual(res[5].shape, ()) + self.assertEqual(res[6].shape, ()) + self.assertEqual(res[7].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_increment(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.increment(x, 1.0) + grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) + + prog = paddle.static.default_main_program() + if paddle.framework.in_pir_mode(): + grad_list = [_grad for _param, _grad in grad_list if _grad] + res = self.exe.run(prog, fetch_list=[x, out] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + if len(grad_list) > 0: + self.assertEqual(res[2].shape, ()) + if len(grad_list) > 1: + self.assertEqual(res[3].shape, ()) + else: + res = self.exe.run( + prog, fetch_list=[x, out, x.grad_name, out.grad_name] + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_bitwise_not(self): + # have no backward + x = paddle.randint(-1, 1, []) + out = paddle.bitwise_not(x) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x, out]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_logical_not(self): + # have no backward + x = paddle.randint(0, 1, []) + out = 
paddle.logical_not(x) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x, out]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_searchsorted(self): + # have no backward + x = paddle.full([10], 1.0, 'float32') + y = paddle.full([], 1.0, 'float32') + out = paddle.searchsorted(x, y) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 0) + + @test_with_pir_api + @prog_scope() + def test_transpose(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.transpose(x, []) + grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[1], 1.0) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 1.0) + + with self.assertRaises(ValueError): + x = paddle.transpose(x, [0]) + + @test_with_pir_api + @prog_scope() + def test_moveaxis(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.moveaxis(x, [], []) + grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[1], 1.0) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 1.0) + + with self.assertRaises(AssertionError): + x = paddle.moveaxis(x, [0], [1]) + + @test_with_pir_api + @prog_scope() + def test_gather_1D(self): + x = paddle.full([10], 1.0, 'float32') + x.stop_gradient = False + index = paddle.full([], 2, 'int64') + out = paddle.gather(x, index) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 1) + self.assertEqual(res[1].shape, (10,)) + self.assertEqual(res[2].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_gather_XD_axis_0(self): + x = paddle.full([2, 3], 1.0, 'float32') + x.stop_gradient = False + index = paddle.full([], 1, 'int64') + out = paddle.gather(x, index) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + grad_list) + self.assertEqual(res[0].shape, (3,)) + np.testing.assert_array_equal(res[0], [1.0, 1.0, 1.0]) + self.assertEqual(res[1].shape, (2, 3)) + self.assertEqual(res[2].shape, (3,)) + + @test_with_pir_api + @prog_scope() + def test_gather_XD_axis_1(self): + x = paddle.full([2, 3], 1.0, 'float32') + x.stop_gradient = False + index = paddle.full([], 1, 'int64') + out = paddle.gather(x, index, axis=1) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + grad_list) + self.assertEqual(res[0].shape, (2,)) + 
np.testing.assert_array_equal(res[0], [1.0, 1.0]) + self.assertEqual(res[1].shape, (2, 3)) + self.assertEqual(res[2].shape, (2,)) + + @test_with_pir_api + @prog_scope() + def test_gather_nd(self): + x1 = paddle.full([10], 1.0, 'float32') + x1.stop_gradient = False + x2 = paddle.full([2, 3], 1.0, 'float32') + x2.stop_gradient = False + + index1 = paddle.full([1], 1, 'int64') + index2 = paddle.full([2], 1, 'int64') + + out1 = paddle.gather_nd(x1, index1) + out2 = paddle.gather_nd(x2, index2) + grad_list1 = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, out1] + ) + grad_list2 = paddle.static.append_backward( + out2.sum(), parameter_list=[x2, out2] + ) + + (_, x1_grad), (_, out1_grad) = grad_list1 + (_, x2_grad), (_, out2_grad) = grad_list2 + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + x1_grad, + x2_grad, + out1_grad, + out2_grad, + ], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + np.testing.assert_array_equal(res[0], 1.0) + np.testing.assert_array_equal(res[1], 1.0) + self.assertEqual(res[2].shape, (10,)) + self.assertEqual(res[3].shape, (2, 3)) + self.assertEqual(res[4].shape, ()) + self.assertEqual(res[5].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_scatter_1D(self): + x = paddle.full([10], 1.0, 'float32') + x.stop_gradient = False + index = paddle.full([], 2, 'int64') + updates = paddle.full([], 4, 'float32') + out = paddle.scatter(x, index, updates) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + grad_list) + self.assertEqual(res[0].shape, (10,)) + self.assertEqual(res[0][2], 4.0) + self.assertEqual(res[1].shape, (10,)) + self.assertEqual(res[2].shape, (10,)) + + @test_with_pir_api + @prog_scope() + def test_scatter_XD(self): + x = paddle.full([2, 3], 1.0, 'float32') + x.stop_gradient = False + index = paddle.full([], 1, 'int64') + updates = paddle.full([3], 4, 'float32') + out = paddle.scatter(x, index, updates) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + grad_list) + self.assertEqual(res[0].shape, (2, 3)) + np.testing.assert_array_equal(res[0][1], [4.0, 4.0, 4.0]) + self.assertEqual(res[1].shape, (2, 3)) + self.assertEqual(res[2].shape, (2, 3)) + + @test_with_pir_api + @prog_scope() + def test_diagflat(self): + # have no backward + x1 = paddle.rand([]) + out1 = paddle.diagflat(x1, 1) + + x2 = paddle.rand([]) + out2 = paddle.diagflat(x2, -1) + + x3 = paddle.rand([]) + out3 = paddle.diagflat(x3) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out1, out2, out3]) + self.assertEqual(res[0].shape, (2, 2)) + self.assertEqual(res[1].shape, (2, 2)) + self.assertEqual(res[2].shape, (1, 1)) + + @test_with_pir_api + @prog_scope() + def test_scatter__1D(self): + x = paddle.full([10], 1.0, 'float32') + index = paddle.full([], 2, 'int64') + updates = paddle.full([], 4, 'float32') + out = paddle.scatter_(x, index, updates) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out]) + self.assertEqual(res[0][2], 4) + + @test_with_pir_api + @prog_scope() + def test_scatter__XD(self): + x = paddle.full([2, 3], 1.0, 'float32') + index = 
paddle.full([], 1, 'int64') + updates = paddle.full([3], 4, 'float32') + out = paddle.scatter_(x, index, updates) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out]) + np.testing.assert_array_equal(res[0][1], [4.0, 4.0, 4.0]) + + @test_with_pir_api + @prog_scope() + def test_scatter_nd(self): + index = paddle.full([1], 3, dtype='int64') + updates = paddle.full([], 2, 'float32') + updates.stop_gradient = False + out = paddle.scatter_nd(index, updates, [5]) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[out, updates] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + grad_list) + self.assertEqual(res[0].shape, (5,)) + self.assertEqual(res[0][3], 2) + self.assertEqual(res[1].shape, (5,)) + self.assertEqual(res[2].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_flatten(self): + x = paddle.full([], 1, 'float32') + x.stop_gradient = False + + start_axis = 0 + stop_axis = -1 + + out = paddle.flatten(x, start_axis=start_axis, stop_axis=stop_axis) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[x, out] + ) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, feed={}, fetch_list=[out] + grad_list) + + self.assertEqual(res[0].shape, (1,)) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, (1,)) + + @test_with_pir_api + @prog_scope() + def test_histogram(self): + x = paddle.full([], 1, 'float32') + out = paddle.histogram(x, bins=5, min=1, max=5) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, feed={}, fetch_list=[out]) + + self.assertEqual(res[0].shape, (5,)) + + @test_with_pir_api + @prog_scope() + def test_scale(self): + x = paddle.rand([]) + x.stop_gradient = False + out = paddle.scale(x, scale=2.0, bias=1.0) + grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) + grad_list = [_grad for _param, _grad in grad_list] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out] + grad_list) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_floor_divide(self): + # 1-d // 0-d + x = paddle.to_tensor([1, -2, 3], dtype="int64") + y = paddle.full([], 2, dtype='int64') + out1_1 = paddle.floor_divide(x, y) + out1_2 = x // y + + # 0-d // 1-d + out2_1 = paddle.floor_divide(y, x) + out2_2 = y // x + + # 0-d // 0-d + x = paddle.full([], 3, dtype='int64') + out3_1 = paddle.floor_divide(x, y) + out3_2 = x // y + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, fetch_list=[out1_1, out1_2, out2_1, out2_2, out3_1, out3_2] + ) + out1_1, out1_2, out2_1, out2_2, out3_1, out3_2 = res + + np.testing.assert_array_equal(out1_1, out1_2) + np.testing.assert_array_equal(out1_1, np.asarray([0, -1, 1])) + np.testing.assert_array_equal(out2_1, out2_2) + np.testing.assert_array_equal(out2_2, np.asarray([2, -1, 0])) + np.testing.assert_array_equal(out3_1, out3_2) + np.testing.assert_array_equal(out3_2, np.asarray(1)) + + @test_with_pir_api + @prog_scope() + def test_cumsum(self): + x1 = paddle.rand([]) + x1.stop_gradient = False + + out1 = paddle.cumsum(x1) + out2 = paddle.cumsum(x1, axis=0) + out3 = paddle.cumsum(x1, axis=-1) + + (_, x1_grad), (_, out1_grad) = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, 
out1] + ) + (_, x1_grad), (_, out2_grad) = paddle.static.append_backward( + out2.sum(), parameter_list=[x1, out2] + ) + (_, x1_grad), (_, out3_grad) = paddle.static.append_backward( + out3.sum(), parameter_list=[x1, out3] + ) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + out3, + x1_grad, + out1_grad, + out2_grad, + out3_grad, + ], + ) + self.assertEqual(res[0].shape, (1,)) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1.0) + self.assertEqual(res[4].shape, (1,)) + self.assertEqual(res[4], 1.0) + self.assertEqual(res[5].shape, ()) + self.assertEqual(res[5], 1.0) + self.assertEqual(res[6].shape, ()) + self.assertEqual(res[6], 1.0) + self.assertShapeEqual(out2, []) + self.assertShapeEqual(out3, []) + + @test_with_pir_api + @prog_scope() + def test_logcumsumexp(self): + x = paddle.rand([]) + x.stop_gradient = False + + out1 = paddle.logcumsumexp(x) + out2 = paddle.logcumsumexp(x, axis=0) + out3 = paddle.logcumsumexp(x, axis=-1) + + grad_list1 = paddle.static.append_backward(out1, parameter_list=[x]) + grad_list2 = paddle.static.append_backward(out2, parameter_list=[x]) + grad_list3 = paddle.static.append_backward(out3, parameter_list=[x]) + + x_grad = grad_list3[0][1] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + out3, + x_grad, + ], + ) + self.assertEqual(res[0].shape, (1,)) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1.0) + + @test_with_pir_api + @prog_scope() + def test_add_n(self): + x1 = paddle.rand([]) + x1.stop_gradient = False + x2 = paddle.rand([]) + x2.stop_gradient = False + x3 = paddle.rand([]) + x3.stop_gradient = False + + out1 = paddle.add_n(x1) + out2 = paddle.add_n([x2, x3]) + + grad_list1 = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, out1] + ) + grad_list23 = paddle.static.append_backward( + out2.sum(), parameter_list=[x2, x3, out2] + ) + + (_, x1_grad), (_, out1_grad) = grad_list1 + (_, x2_grad), (_, x3_grad), (_, out2_grad) = grad_list23 + + prog = paddle.static.default_main_program() + block = prog.global_block() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + x1_grad, + x2_grad, + x3_grad, + out1_grad, + out2_grad, + ], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[2], 1) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1) + self.assertEqual(res[4].shape, ()) + self.assertEqual(res[4], 1) + self.assertEqual(res[5].shape, ()) + self.assertEqual(res[6].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_reshape_list(self): + x1 = paddle.rand([]) + x2 = paddle.rand([]) + x3 = paddle.rand([]) + x4 = paddle.rand([]) + x1.stop_gradient = False + x2.stop_gradient = False + x3.stop_gradient = False + x4.stop_gradient = False + + out1 = paddle.reshape(x1, []) + grad_list1 = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, out1] + ) + (_, x1_grad), (_, out1_grad) = grad_list1 + + out2 = paddle.reshape(x2, [1]) + grad_list2 = paddle.static.append_backward( + out2.sum(), parameter_list=[x2, out2] + ) + (_, x2_grad), (_, out2_grad) = grad_list2 + + out3 = paddle.reshape(x3, [-1]) + grad_list3 = paddle.static.append_backward( + out3.sum(), parameter_list=[x3, out3] + ) + (_, x3_grad), (_, out3_grad) = grad_list3 
+
+        out4 = paddle.reshape(x4, [-1, 1])
+        grad_list4 = paddle.static.append_backward(
+            out4.sum(), parameter_list=[x4, out4]
+        )
+        (_, x4_grad), (_, out4_grad) = grad_list4
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(
+            prog,
+            fetch_list=[
+                out1,
+                out2,
+                out3,
+                out4,
+                x1_grad,
+                x2_grad,
+                x3_grad,
+                x4_grad,
+                out1_grad,
+                out2_grad,
+                out3_grad,
+                out4_grad,
+            ],
+        )
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (1,))
+        self.assertEqual(res[2].shape, (1,))
+        self.assertEqual(res[3].shape, (1, 1))
+
+        self.assertEqual(res[4].shape, ())
+        self.assertEqual(res[5].shape, ())
+        self.assertEqual(res[6].shape, ())
+        self.assertEqual(res[7].shape, ())
+
+        self.assertEqual(res[8].shape, ())
+        self.assertEqual(res[9].shape, (1,))
+        self.assertEqual(res[10].shape, (1,))
+        self.assertEqual(res[11].shape, (1, 1))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_reshape_tensor(self):
+        x1 = paddle.rand([1, 1])
+        x1.stop_gradient = False
+        new_shape = paddle.full([3], 1, "int32")
+        out1 = paddle.reshape(x1, new_shape)
+        grad_list = paddle.static.append_backward(
+            out1.sum(), parameter_list=[x1, out1]
+        )
+        (_, x1_grad), (_, out1_grad) = grad_list
+
+        x2 = paddle.rand([1, 1])
+        x2.stop_gradient = False
+        new_shape = paddle.full([1], -1, "int32")
+        out2 = paddle.reshape(x2, new_shape)
+        grad_list = paddle.static.append_backward(
+            out2.sum(), parameter_list=[x2, out2]
+        )
+        (_, x2_grad), (_, out2_grad) = grad_list
+
+        x3 = paddle.rand([1, 1])
+        x3.stop_gradient = False
+        new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")]
+        out3 = paddle.reshape(x3, new_shape)
+        grad_list = paddle.static.append_backward(
+            out3.sum(), parameter_list=[x3, out3]
+        )
+        (_, x3_grad), (_, out3_grad) = grad_list
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(
+            prog,
+            fetch_list=[
+                out1,
+                out2,
+                out3,
+                x1_grad,
+                x2_grad,
+                x3_grad,
+                out1_grad,
+                out2_grad,
+                out3_grad,
+            ],
+        )
+        self.assertEqual(res[0].shape, (1, 1, 1))
+        self.assertEqual(res[1].shape, (1,))
+        self.assertEqual(res[2].shape, (1, 1))
+
+        self.assertEqual(res[3].shape, (1, 1))
+        self.assertEqual(res[4].shape, (1, 1))
+        self.assertEqual(res[5].shape, (1, 1))
+
+        self.assertEqual(res[6].shape, (1, 1, 1))
+        self.assertEqual(res[7].shape, (1,))
+        self.assertEqual(res[8].shape, (1, 1))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_reverse(self):
+        x = paddle.rand([])
+        x.stop_gradient = False
+
+        out = paddle.reverse(x, axis=[])
+        grad_list = paddle.static.append_backward(out, parameter_list=[x, out])
+        (_, x_grad), (_, out_grad) = grad_list
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[x, out, x_grad, out_grad])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, ())
+        self.assertEqual(res[2].shape, ())
+        self.assertEqual(res[3].shape, ())
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_sort(self):
+        x1 = paddle.rand([])
+        x1.stop_gradient = False
+        out1 = paddle.sort(x1, axis=-1)
+        grad_list = paddle.static.append_backward(
+            out1.sum(), parameter_list=[x1, out1]
+        )
+        (_, x1_grad), (_, out1_grad) = grad_list
+
+        x2 = paddle.rand([])
+        x2.stop_gradient = False
+        out2 = paddle.sort(x2, axis=0)
+        grad_list = paddle.static.append_backward(
+            out2.sum(), parameter_list=[x2, out2]
+        )
+        (_, x2_grad), (_, out2_grad) = grad_list
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(
+            prog,
+            fetch_list=[
+                out1,
+                out2,
+                out1_grad,
+                out2_grad,
+                x1_grad,
+                x2_grad,
+            ],
+        )
+
+
self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[4].shape, ()) + self.assertEqual(res[5].shape, ()) + self.assertEqual(res[4], 1.0) + self.assertEqual(res[5], 1.0) + + @test_with_pir_api + @prog_scope() + def test_argsort(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + # have no backward + x1 = paddle.rand([]) + out1 = paddle.argsort(x1, axis=-1) + + x2 = paddle.rand([]) + x2.stop_gradient = False + out2 = paddle.argsort(x2, axis=0) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out1, out2]) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[0], 0.0) + self.assertEqual(res[1], 0.0) + + @test_with_pir_api + @prog_scope() + def test_lerp(self): + shapes = [ + [(), (), (), ()], + [(), (64, 64), (), (64, 64)], + [(64, 64), (), (), (64, 64)], + [(64, 64), (), 0.5, (64, 64)], + ] + for shape in shapes: + x = paddle.rand(shape[0]) + y = paddle.rand(shape[1]) + if isinstance(shape[2], float): + w = shape[2] + else: + w = paddle.rand(shape[2]) + + x.stop_gradient = False + y.stop_gradient = False + out = paddle.lerp(x, y, w) + grad_list = paddle.static.append_backward( + out.sum(), parameter_list=[out, y, x] + ) + (_, out_grad), (_, y_grad), (_, x_grad) = grad_list + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, out_grad, y_grad, x_grad]) + self.assertEqual(res[0].shape, shape[3]) + self.assertEqual(res[1].shape, shape[3]) + self.assertEqual(res[2].shape, shape[1]) + self.assertEqual(res[3].shape, shape[0]) + + @test_with_pir_api + @prog_scope() + def test_repeat_interleave(self): + x1 = paddle.full([], 1.0, 'float32') + x1.stop_gradient = False + out1 = paddle.repeat_interleave(x1, 2, None) + grad_list1 = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, out1] + ) + (_, x1_grad), (_, out1_grad) = grad_list1 + + x2 = paddle.full([], 1.0, 'float32') + x2.stop_gradient = False + repeats = paddle.to_tensor([3], dtype='int32') + out2 = paddle.repeat_interleave(x2, repeats, None) + grad_list2 = paddle.static.append_backward( + out2.sum(), parameter_list=[x2, out2] + ) + (_, x2_grad), (_, out2_grad) = grad_list2 + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + x1_grad, + x2_grad, + out1_grad, + out2_grad, + ], + ) + self.assertEqual(res[0].shape, (2,)) + self.assertEqual(res[1].shape, (3,)) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[4].shape, (2,)) + self.assertEqual(res[5].shape, (3,)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_zero_dim_sundry_static_api_part3.py b/test/legacy_test/test_zero_dim_sundry_static_api_part3.py new file mode 100644 index 0000000000000..849abe24aeb73 --- /dev/null +++ b/test/legacy_test/test_zero_dim_sundry_static_api_part3.py @@ -0,0 +1,990 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Note:
+# A 0D Tensor is a tensor whose dimension is 0.
+# A 0D Tensor's shape is always [] and its numel is always 1.
+# Such a tensor can be created by paddle.rand([]).
+
+import unittest
+
+import numpy as np
+from decorator_helper import prog_scope
+
+import paddle
+from paddle.pir_utils import test_with_pir_api
+
+# Used to test the zero-dim behavior of sundry APIs, which are unique and
+# cannot be classified with the other API groups, so they are implemented
+# here flexibly.
+
+
+class TestSundryAPIStatic(unittest.TestCase):
+    def setUp(self):
+        paddle.enable_static()
+        self.exe = paddle.static.Executor()
+
+    def assertShapeEqual(self, out, target_tuple):
+        if not paddle.framework.in_pir_mode():
+            out_shape = list(out.shape)
+        else:
+            out_shape = out.shape
+        self.assertEqual(out_shape, target_tuple)
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_allclose(self):
+        # 1) x is 0D
+        x = paddle.full([], 0.5)
+        y = paddle.full([], 0.6)
+        out = paddle.allclose(x, y)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out])
+        self.assertEqual(res[0].shape, ())
+        self.assertFalse(res[0])
+
+        # 2) x is ND
+        x = paddle.full([2, 3], 0.5)
+        y = paddle.full([2, 3], 0.6)
+        out = paddle.allclose(x, y)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out])
+        self.assertEqual(res[0].shape, ())
+        self.assertFalse(res[0])
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_equal_all(self):
+        # 1) x is 0D
+        x = paddle.full([], 0.5)
+        y = paddle.full([], 0.6)
+        out = paddle.equal_all(x, y)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out])
+        self.assertEqual(res[0].shape, ())
+        self.assertFalse(res[0])
+
+        # 2) x is ND
+        x = paddle.full([2, 3], 0.5)
+        y = paddle.full([2, 3], 0.6)
+        out = paddle.equal_all(x, y)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out])
+        self.assertEqual(res[0].shape, ())
+        self.assertFalse(res[0])
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_where(self):
+        x1 = paddle.full([], 1, 'float32')
+        x2 = paddle.full([], 2, 'float32')
+        x1.stop_gradient = False
+        x2.stop_gradient = False
+        out = paddle.where(x1 > x2, x1, x2)
+        loss = paddle.mean(out)
+        grad_list = paddle.static.append_backward(
+            loss, parameter_list=[out, x1, x2]
+        )
+        (_, out_grad), (_, x1_grad), (_, x2_grad) = grad_list
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(
+            prog,
+            feed={},
+            fetch_list=[out, out_grad, x1_grad, x2_grad],
+        )
+
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[0], 2)
+        self.assertEqual(res[1].shape, ())
+        self.assertEqual(res[2].shape, ())
+        self.assertEqual(res[2], 0)
+        self.assertEqual(res[3].shape, ())
+        self.assertEqual(res[3], 1)
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_atan2(self):
+        x1 = paddle.full([], 0, 'float32')
+        x2 = paddle.full([], 2, 'float32')
+        x1.stop_gradient = False
+        x2.stop_gradient = False
+        out = paddle.atan2(x1, x2)
+        paddle.static.append_backward(out)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, feed={}, fetch_list=[out])
+
+        self.assertEqual(res[0].shape, ())
+
+    
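# The next two tests exercise interpolate/upsample with 0-D Tensors as the
+    # dynamic output_size and scale_factor arguments; only the output and
+    # gradient shapes are checked.
+    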
@test_with_pir_api + @prog_scope() + def test_interpolate(self): + from paddle.nn.functional import interpolate + + input_x = paddle.rand([2, 3, 6, 6]) + input_x.stop_gradient = False + + output_size = [ + paddle.full([], 12, dtype="int32"), + paddle.full([], 12, dtype="int32"), + ] + + out1 = interpolate( + x=input_x, size=output_size, mode="bilinear", align_corners=False + ) + _, input_x_grad = paddle.static.append_backward( + out1.sum(), parameter_list=[input_x] + )[0] + prog = paddle.static.default_main_program() + res1 = self.exe.run(prog, feed={}, fetch_list=[out1, input_x_grad]) + + scale_1 = paddle.full([], 2) + out2 = interpolate( + x=input_x, + scale_factor=scale_1, + mode="bilinear", + align_corners=False, + ) + _, input_x_grad = paddle.static.append_backward( + out2.sum(), parameter_list=[input_x] + )[0] + prog = paddle.static.default_main_program() + res2 = self.exe.run(prog, feed={}, fetch_list=[out2, input_x_grad]) + + self.assertEqual(res1[0].shape, (2, 3, 12, 12)) + self.assertEqual(res1[1].shape, (2, 3, 6, 6)) + self.assertEqual(res2[0].shape, (2, 3, 12, 12)) + self.assertEqual(res2[1].shape, (2, 3, 6, 6)) + + @test_with_pir_api + @prog_scope() + def test_upsample(self): + from paddle.nn.functional import upsample + + input_x = paddle.rand([2, 3, 6, 6]) + input_x.stop_gradient = False + + output_size = [ + paddle.full([], 12, dtype="int32"), + paddle.full([], 12, dtype="int32"), + ] + + out1 = upsample( + x=input_x, size=output_size, mode="bilinear", align_corners=False + ) + _, input_x_grad = paddle.static.append_backward( + out1.sum(), parameter_list=[input_x] + )[0] + prog = paddle.static.default_main_program() + res1 = self.exe.run(prog, feed={}, fetch_list=[out1, input_x_grad]) + + self.assertEqual(res1[0].shape, (2, 3, 12, 12)) + self.assertEqual(res1[1].shape, (2, 3, 6, 6)) + + @test_with_pir_api + @prog_scope() + def test_unstack(self): + x1 = paddle.full([1], 0, 'float32') + x1.stop_gradient = False + out1 = paddle.unstack(x1, 0) + out1 = paddle.add_n(out1) + _, x1_grad = paddle.static.append_backward(out1, parameter_list=[x1])[0] + prog = paddle.static.default_main_program() + res = self.exe.run(prog, feed={}, fetch_list=[out1, x1_grad]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (1,)) + + x2 = paddle.full([2], 2, 'float32') + x2.stop_gradient = False + out2 = paddle.unstack(x2, 0) + out2_sum = paddle.add_n(out2) + _, x2_grad = paddle.static.append_backward( + out2_sum, parameter_list=[x2] + )[0] + prog = paddle.static.default_main_program() + res = self.exe.run(prog, feed={}, fetch_list=[out2_sum, x2_grad]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2,)) + + @test_with_pir_api + @prog_scope() + def test_unbind(self): + x1 = paddle.full([1], 0, 'float32') + x1.stop_gradient = False + out1 = paddle.unbind(x1, 0) + out1 = paddle.add_n(out1) + _, x1_grad = paddle.static.append_backward(out1, parameter_list=[x1])[0] + prog = paddle.static.default_main_program() + res = self.exe.run(prog, feed={}, fetch_list=[out1, x1_grad]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (1,)) + + x2 = paddle.full([2], 2, 'float32') + x2.stop_gradient = False + out2 = paddle.unbind(x2, 0) + out2_sum = paddle.add_n(out2) + _, x2_grad = paddle.static.append_backward( + out2_sum, parameter_list=[x2] + )[0] + prog = paddle.static.default_main_program() + res = self.exe.run(prog, feed={}, fetch_list=[out2_sum, x2_grad]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2,)) + + 
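# masked_select always returns a 1-D Tensor, so selecting from a 0-D input
+    # with a 0-D bool mask yields shape (1,), while the gradient w.r.t. the
+    # 0-D input keeps shape ().
+    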
@test_with_pir_api + @prog_scope() + def test_masked_select(self): + x = paddle.rand([]) + x.stop_gradient = False + mask = paddle.full([], True, dtype='bool') + y = paddle.masked_select(x, mask) + grad_list = paddle.static.append_backward( + y.sum(), parameter_list=[y, x] + ) + (_, y_grad), (_, x_grad) = grad_list + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[x, y, y_grad, x_grad]) + self.assertEqual(res[1].shape, (1,)) + self.assertEqual(res[1], res[0]) + self.assertEqual(res[2].shape, (1,)) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[3], 1) + + @test_with_pir_api + @prog_scope() + def test_squeeze(self): + x1 = paddle.full([], 2) + x1.stop_gradient = False + out1 = paddle.squeeze(x1, axis=0) + _, x1_grad = paddle.static.append_backward( + out1.sum(), parameter_list=[x1] + )[0] + + x2 = paddle.full([], 3) + x3 = paddle.full([], 0, dtype='int32') + x2.stop_gradient = False + out2 = paddle.squeeze(x2, axis=x3) + _, x2_grad = paddle.static.append_backward( + out2.sum(), parameter_list=[x2] + )[0] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + x1_grad, + x2_grad, + ], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + + @test_with_pir_api + @prog_scope() + def test_unsqueeze(self): + x1 = paddle.full([], 2) + x1.stop_gradient = False + out1 = paddle.unsqueeze(x1, axis=0) + _, x1_grad = paddle.static.append_backward( + out1.sum(), parameter_list=[x1] + )[0] + + x2 = paddle.full([], 3) + x3 = paddle.full([], 0, dtype='int32') + x2.stop_gradient = False + out2 = paddle.unsqueeze(x2, axis=x3) + _, x2_grad = paddle.static.append_backward( + out2.sum(), parameter_list=[x2] + )[0] + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + x1_grad, + x2_grad, + ], + ) + self.assertEqual(res[0].shape, (1,)) + self.assertEqual(res[1].shape, (1,)) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + + @prog_scope() + def test_t(self): + x = paddle.full([], 2.0) + x.stop_gradient = False + out = paddle.t(x) + grad_list = paddle.static.append_backward(out, parameter_list=[out, x]) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, feed={}, fetch_list=[out, out.grad_name, x.grad_name] + ) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + + @prog_scope() + def test_sequence_pad(self): + x = paddle.static.data("x", [-1, 2], dtype=paddle.int64, lod_level=1) + value = paddle.to_tensor(1000, dtype=paddle.int64).squeeze() + out = paddle.static.nn.sequence_pad(x, value) + + x_tensor = paddle.base.create_lod_tensor( + np.arange(20).astype(np.int64).reshape(-1, 2), + [[3, 3, 4]], + place=self.exe.place, + ) + prog = paddle.static.default_main_program() + res = self.exe.run(prog, feed={"x": x_tensor}, fetch_list=[out]) + self.assertEqual(res[0].shape, (3, 4, 2)) + + @prog_scope() + def test_static_data(self): + x1 = paddle.static.data(name="x1", shape=[]) + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + feed={ + "x1": np.array(1.0, dtype='float32'), + }, + fetch_list=[ + x1.name, + ], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], np.array(1.0)) + + x2 = paddle.static.data(name="x2", shape=[]) + x3 = paddle.static.data(name="x3", shape=[]) + y = x2 + x3 + prog = 
paddle.static.default_main_program() + res = self.exe.run( + prog, + feed={ + "x2": 100.5, + "x3": 200.5, + }, + fetch_list=[ + y.name, + ], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[0], 301.0) + + @test_with_pir_api + @prog_scope() + def test_prelu(self): + x1 = paddle.full([], 1.0, 'float32') + x1.stop_gradient = False + w1 = paddle.to_tensor([0.25], dtype='float32') + out1 = paddle.nn.functional.prelu(x1, w1) + (_, out1_grad), (_, x1_grad) = paddle.static.append_backward( + out1.sum(), parameter_list=[out1, x1] + ) + + x2 = paddle.full([], 1.0, 'float32') + x2.stop_gradient = False + w2 = paddle.full([], 0.25, dtype='float32') + out2 = paddle.nn.functional.prelu(x2, w2) + (_, out2_grad), (_, x2_grad) = paddle.static.append_backward( + out2.sum(), parameter_list=[out2, x2] + ) + + prog = paddle.static.default_main_program() + res = self.exe.run( + prog, + fetch_list=[ + out1, + out2, + x1_grad, + x2_grad, + out1_grad, + out2_grad, + ], + ) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + self.assertEqual(res[3].shape, ()) + self.assertEqual(res[4].shape, ()) + self.assertEqual(res[5].shape, ()) + + @prog_scope() + def test_static_nn_prelu(self): + x1 = paddle.full([], 1.0, 'float32') + x1.stop_gradient = False + out1 = paddle.static.nn.prelu(x1, 'all') + grad_list = paddle.static.append_backward( + out1.sum(), parameter_list=[x1, out1] + ) + (_, x1_grad), (_, out1_grad) = grad_list + + prog = paddle.static.default_main_program() + self.exe.run(paddle.static.default_startup_program()) + res = self.exe.run( + prog, + fetch_list=[ + out1, + x1_grad, + out1_grad, + ], + ) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, ()) + self.assertEqual(res[2].shape, ()) + np.testing.assert_allclose(res[0], np.array(1)) + np.testing.assert_allclose(res[1], np.array(1)) + + @test_with_pir_api + @prog_scope() + def test_while_loop(self): + def cond(i, x): + return paddle.less_than(i, eleven) + + def body(i, x): + x = x + i + i = i + 1 + return [i, x] + + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program, paddle.static.Program()): + i = paddle.static.data(name='i', shape=[], dtype='float32') + i.stop_gradient = False + i.persistable = True + eleven = paddle.full([], 11, 'float32') + x = paddle.static.data(name='x', shape=[], dtype='float32') + x.stop_gradient = False + x.persistable = True + out_i, out_x = paddle.static.nn.while_loop(cond, body, [i, x]) + grad_list = paddle.static.append_backward(out_x) + + feed = { + 'i': np.array(1.0, dtype='float32'), + 'x': np.array(0.0, dtype='float32'), + } + if paddle.framework.in_pir_mode(): + fetch_list = [out_i, out_x] + for _, g in grad_list: + fetch_list.append(g) + res = self.exe.run( + main_program, + feed=feed, + fetch_list=fetch_list, + ) + else: + res = self.exe.run( + main_program, + feed=feed, + fetch_list=[out_i.name, out_x.name, i.grad_name, x.grad_name], + ) + + self.assertEqual(res[0].shape, ()) + np.testing.assert_allclose(res[0], np.array(11)) + self.assertEqual(res[1].shape, ()) + np.testing.assert_allclose(res[1], np.array(55)) + self.assertEqual(res[2].shape, ()) + np.testing.assert_allclose(res[2], np.array(10)) + self.assertEqual(res[3].shape, ()) + np.testing.assert_allclose(res[3], np.array(1.0)) + + @test_with_pir_api + @prog_scope() + def test_numel(self): + # 1) x is 0D + x = paddle.full([], 0.5) + out = paddle.numel(x) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, 
fetch_list=[out])
+        self.assertEqual(res[0].shape, ())
+        np.testing.assert_array_equal(res[0], np.array(1))
+
+        # 2) x is ND
+        x = paddle.full([3, 5], 0.5)
+        out = paddle.numel(x)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out])
+        self.assertEqual(res[0].shape, ())
+        np.testing.assert_array_equal(res[0], np.array(15))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_rank(self):
+        # 1) x is 0D
+        x = paddle.full([], 0.5)
+        out = paddle.rank(x)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out])
+        self.assertEqual(res[0].shape, ())
+        np.testing.assert_array_equal(res[0], np.array(0))
+
+        # 2) x is ND
+        x = paddle.full([3, 5], 0.5)
+        out = paddle.rank(x)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out])
+        self.assertEqual(res[0].shape, ())
+        np.testing.assert_array_equal(res[0], np.array(2))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_shape(self):
+        x = paddle.full([], 0.5)
+        out = paddle.shape(x)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out])
+        np.testing.assert_array_equal(res[0], np.array([]))
+        self.assertEqual(res[0].shape, (0,))
+
+    @test_with_pir_api
+    def test_broadcast_tensors(self):
+        # 1) x is 0D, y is 0D
+        x1 = paddle.full([], 2.0)
+        x1.stop_gradient = False
+        x2 = paddle.full([], 2.0)
+        x2.stop_gradient = False
+        out1, out2 = paddle.broadcast_tensors([x1, x2])
+
+        self.assertShapeEqual(out1, [])
+        self.assertShapeEqual(out2, [])
+
+        # 2) x is ND, y is 0D
+        x1 = paddle.full([2, 3], 2.0)
+        x1.stop_gradient = False
+        x2 = paddle.full([], 2.0)
+        x2.stop_gradient = False
+        out1, out2 = paddle.broadcast_tensors([x1, x2])
+
+        self.assertShapeEqual(out1, [2, 3])
+        self.assertShapeEqual(out2, [2, 3])
+
+        # 3) x is 0D, y is ND
+        x1 = paddle.full([], 2.0)
+        x1.stop_gradient = False
+        x2 = paddle.full([2, 3], 2.0)
+        x2.stop_gradient = False
+        out1, out2 = paddle.broadcast_tensors([x1, x2])
+
+        self.assertShapeEqual(out1, [2, 3])
+        self.assertShapeEqual(out2, [2, 3])
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_to_tensor(self):
+        out1 = paddle.to_tensor(1)
+        out2 = paddle.to_tensor(2.5)
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out1, out2])
+
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[0], 1)
+        self.assertEqual(res[1].shape, ())
+        self.assertEqual(res[1], 2.5)
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_matmul(self):
+        # 1) no transpose
+        x = paddle.randn([10])
+        x.stop_gradient = False
+        y = paddle.randn([10])
+        y.stop_gradient = False
+        out = paddle.matmul(x, y)
+        grad_list = paddle.static.append_backward(out, parameter_list=[x, y])
+        (_, x_grad), (_, y_grad) = grad_list
+
+        self.assertShapeEqual(out, [])
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out, x_grad, y_grad])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (10,))
+        self.assertEqual(res[2].shape, (10,))
+
+        # 2) transpose x and y
+        x = paddle.randn([10])
+        x.stop_gradient = False
+        y = paddle.randn([10])
+        y.stop_gradient = False
+        out = paddle.matmul(x, y, True, True)
+        grad_list = paddle.static.append_backward(out, parameter_list=[x, y])
+        (_, x_grad), (_, y_grad) = grad_list
+
+        self.assertShapeEqual(out, [])
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out, x_grad, y_grad])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (10,))
+        self.assertEqual(res[2].shape, (10,))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_linalg_slogdet(self):
+        # 2-D input
+        x = paddle.randn([3, 3])
+        x.stop_gradient = False
+        out = paddle.linalg.slogdet(x)
+        _, x_grad = paddle.static.append_backward(
+            out.sum(), parameter_list=[x]
+        )[0]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out, x_grad])
+        self.assertEqual(res[0].shape, (2,))
+        self.assertEqual(res[1].shape, (3, 3))
+
+        # 3-D input
+        x1 = paddle.randn([3, 3, 3])
+        x1.stop_gradient = False
+        out1 = paddle.linalg.slogdet(x1)
+        _, x1_grad = paddle.static.append_backward(
+            out1.sum(), parameter_list=[x1]
+        )[0]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out1, x1_grad])
+        self.assertEqual(res[0].shape, (2, 3))
+        self.assertEqual(res[1].shape, (3, 3, 3))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_multi_dot(self):
+        a = paddle.randn([4])
+        a.stop_gradient = False
+        b = paddle.randn([4, 5])
+        b.stop_gradient = False
+        c = paddle.randn([5])
+        c.stop_gradient = False
+
+        out = paddle.linalg.multi_dot([a, b, c])
+        grad_list = paddle.static.append_backward(
+            out.sum(), parameter_list=[a, b, c]
+        )
+        (_, a_grad), (_, b_grad), (_, c_grad) = grad_list
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out, a_grad, b_grad, c_grad])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (4,))
+        self.assertEqual(res[2].shape, (4, 5))
+        self.assertEqual(res[3].shape, (5,))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_cov(self):
+        xt_1 = paddle.randn((12,))
+        xt_1.stop_gradient = False
+        out = paddle.linalg.cov(xt_1)
+        _, xt_1_grad = paddle.static.append_backward(
+            out, parameter_list=[xt_1]
+        )[0]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out, xt_1_grad])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (12,))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_corrcoef(self):
+        x = paddle.randn((12,))
+        x.stop_gradient = False
+        out = paddle.linalg.corrcoef(x)
+        _, x_grad = paddle.static.append_backward(out, parameter_list=[x])[0]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out, x_grad])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (12,))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_det(self):
+        xt_1 = paddle.randn((3, 3))
+        xt_1.stop_gradient = False
+
+        out = paddle.linalg.det(xt_1)
+        _, xt_1_grad = paddle.static.append_backward(
+            out.sum(), parameter_list=[xt_1]
+        )[0]
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out, xt_1_grad])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (3, 3))
+
+    @prog_scope()
+    def test_dist(self):
+        x = paddle.to_tensor([[3, 3], [3, 3]], dtype="float32")
+        y = paddle.to_tensor([[3, 3], [3, 1]], dtype="float32")
+        x.stop_gradient = False
+        y.stop_gradient = False
+        out = paddle.dist(x, y)
+        (_, x_grad), (_, y_grad) = paddle.static.append_backward(
+            out, parameter_list=[x, y]
+        )
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out, x_grad, y_grad])
+
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (2, 2))
+        self.assertEqual(res[2].shape, (2, 2))
+        np.testing.assert_array_equal(res[0], np.array(2).astype(np.float32))
+
+    @prog_scope()
+    def test_linalg_norm(self):
+        # 1D input, p = fro, axis = None, using reduceInferMeta
+        x_1 = paddle.arange(24, dtype="float32") - 12
+        x_1.stop_gradient = False
+        out_1 = paddle.linalg.norm(x_1)
+        grad_list = paddle.static.append_backward(out_1, parameter_list=[x_1])
+        ((_, x_1_grad),) = grad_list
+
+        prog = paddle.static.default_main_program()
+
+        res = self.exe.run(prog, fetch_list=[out_1, x_1_grad])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (24,))
+
+        # 1D input, p = 1, axis = None,
+        # using p_norm, as_vector = True
+        x_2 = paddle.arange(24, dtype="float32") - 12
+        x_2.stop_gradient = False
+        out_2 = paddle.linalg.norm(x_2, p=1)
+        paddle.static.append_backward(out_2.sum())
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out_2, x_2.grad_name])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (24,))
+
+        # 1D input, p = 1, axis = 0,
+        # using p_norm, as_vector = False
+        x_2_p = paddle.arange(24, dtype="float32") - 12
+        x_2_p.stop_gradient = False
+        out_2_p = paddle.linalg.norm(x_2_p, p=1, axis=0)
+        paddle.static.append_backward(out_2_p.sum())
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out_2_p, x_2_p.grad_name])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (24,))
+
+        # 1D input, p = fro, axis = 0,
+        # using p_norm, as_vector = False
+        x_2_fro = paddle.arange(24, dtype="float32") - 12
+        x_2_fro.stop_gradient = False
+        out_2_fro = paddle.linalg.norm(x_2_fro, p="fro", axis=0)
+        paddle.static.append_backward(out_2_fro.sum())
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out_2_fro, x_2_fro.grad_name])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (24,))
+
+        # 2D input, p = 1, axis = [0, 1]
+        # using p_matrix_norm, depends on paddle.sum
+        x_3 = paddle.arange(24, dtype="float32").reshape([4, 6])
+        x_3.stop_gradient = False
+        out_3 = paddle.linalg.norm(x_3, p=1, axis=[0, 1])
+        paddle.static.append_backward(out_3.sum())
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out_3, x_3.grad_name])
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (4, 6))
+
+        # 2D input, p = fro (default), axis = None
+        # using frobenius_norm, depends on paddle.sum
+        x_4 = paddle.arange(24, dtype="float32").reshape([4, 6])
+        x_4.stop_gradient = False
+        out_4 = paddle.linalg.norm(x_4)
+        paddle.static.append_backward(out_4.sum())
+
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out_4, x_4.grad_name])
+
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (4, 6))
+
+        # 2D input, p = inf, axis = None
+        x_5 = paddle.arange(24, dtype="float32").reshape([4, 6])
+        x_5.stop_gradient = False
+        out_5 = paddle.linalg.norm(x_5, p=float("inf"))
+        paddle.static.append_backward(out_5.sum())
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out_5, x_5.grad_name])
+
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (4, 6))
+
+        # 2D input, p = -inf, axis = [0, 1]
+        x_6 = paddle.arange(24, dtype="float32").reshape([4, 6])
+        x_6.stop_gradient = False
+        out_6 = paddle.linalg.norm(x_6, p=-float("inf"), axis=[0, 1])
+        paddle.static.append_backward(out_6.sum())
+        prog = paddle.static.default_main_program()
+        res = self.exe.run(prog, fetch_list=[out_6, x_6.grad_name])
+
+        self.assertEqual(res[0].shape, ())
+        self.assertEqual(res[1].shape, (4, 6))
+
+    @test_with_pir_api
+    @prog_scope()
+    def test_linalg_cond(self):
+        # use paddle.sum
+        x = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
+        
x.stop_gradient = False + out = paddle.linalg.cond(x) + _, x_grad = paddle.static.append_backward(out, parameter_list=[x])[0] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, x_grad]) + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (3, 3)) + + # p = fro : use paddle.sum + x2 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) + x2.stop_gradient = False + out_fro = paddle.linalg.cond(x2, p='fro') + grad_list = paddle.static.append_backward(out_fro, parameter_list=[x2]) + ((_, x2_grad),) = grad_list + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out_fro, x2_grad]) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (3, 3)) + + # p = nuc : use paddle.sum + x3 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) + x3.stop_gradient = False + out_nuc = paddle.linalg.cond(x3, p='nuc') + _, x3_grad = paddle.static.append_backward( + out_nuc, parameter_list=[x3] + )[0] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out_nuc, x3_grad]) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (3, 3)) + + # p in (-1, 1) : use paddle.sum + x4 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) + x4.stop_gradient = False + out_1 = paddle.linalg.cond(x4, p=1) + _, x4_grad = paddle.static.append_backward(out_1, parameter_list=[x4])[ + 0 + ] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out_1, x4_grad]) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (3, 3)) + + x5 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) + x5.stop_gradient = False + out_minus_1 = paddle.linalg.cond(x5, p=-1) + ((_, x5_grad),) = paddle.static.append_backward( + out_minus_1, parameter_list=[x5] + ) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out_minus_1, x5_grad]) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (3, 3)) + + # p in (-2, 2) depends on paddle.sum + x6 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) + x6.stop_gradient = False + out_2 = paddle.linalg.cond(x6, p=2) + ((_, x6_grad),) = paddle.static.append_backward( + out_2, parameter_list=[x6] + ) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out_2, x6_grad]) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (3, 3)) + + # p in (-inf, inf):use paddle.sum + x8 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) + x8.stop_gradient = False + out_inf = paddle.linalg.cond(x8, p=float("inf")) + ((_, x8_grad),) = paddle.static.append_backward( + out_inf, parameter_list=[x8] + ) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out_inf, x8_grad]) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (3, 3)) + + # depends on paddle.sum + a = paddle.randn([2, 4, 4]) + a.stop_gradient = False + a_cond_fro = paddle.linalg.cond(a, p='fro') + ((_, a_grad),) = paddle.static.append_backward( + a_cond_fro.sum(), parameter_list=[a] + ) + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[a_cond_fro, a_grad]) + + self.assertEqual(res[0].shape, (2,)) + self.assertEqual(res[1].shape, (2, 4, 4)) + + @prog_scope() + def test_trace(self): + x = paddle.to_tensor([[3, 2], [1, 9]], dtype="float32") + x.stop_gradient = False + out = paddle.trace(x) + _, x_grad = paddle.static.append_backward(out, 
parameter_list=[x])[0] + + prog = paddle.static.default_main_program() + res = self.exe.run(prog, fetch_list=[out, x_grad]) + + self.assertEqual(res[0].shape, ()) + self.assertEqual(res[1].shape, (2, 2)) + np.testing.assert_allclose(res[0], np.array(12)) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/legacy_test/test_zero_dim_tensor.py b/test/legacy_test/test_zero_dim_tensor.py deleted file mode 100644 index f4ad78d3f72fd..0000000000000 --- a/test/legacy_test/test_zero_dim_tensor.py +++ /dev/null @@ -1,6935 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Note: -# 0D Tensor indicates that the tensor's dimension is 0 -# 0D Tensor's shape is always [], numel is 1 -# which can be created by paddle.rand([]) - -import os -import unittest - -import numpy as np -from decorator_helper import prog_scope - -import paddle -import paddle.nn.functional as F -from paddle import base, core -from paddle.framework import in_dynamic_mode -from paddle.pir_utils import test_with_pir_api - -unary_api_list = [ - paddle.nn.functional.elu, - paddle.nn.functional.rrelu, - paddle.frac, - paddle.sgn, - paddle.nan_to_num, - paddle.i0, - paddle.i0e, - paddle.i1, - paddle.i1e, - paddle.nn.functional.gelu, - paddle.nn.functional.hardsigmoid, - paddle.nn.functional.hardswish, - paddle.nn.functional.hardshrink, - paddle.nn.functional.hardtanh, - paddle.nn.functional.leaky_relu, - paddle.nn.functional.log_sigmoid, - paddle.nn.functional.relu, - paddle.nn.functional.relu6, - paddle.nn.functional.sigmoid, - paddle.nn.functional.softplus, - paddle.nn.functional.softshrink, - paddle.nn.functional.softsign, - paddle.nn.functional.swish, - paddle.nn.functional.tanhshrink, - paddle.nn.functional.thresholded_relu, - paddle.stanh, - paddle.nn.functional.celu, - paddle.nn.functional.selu, - paddle.nn.functional.mish, - paddle.nn.functional.silu, - paddle.nn.functional.tanh, - paddle.nn.functional.dropout, - paddle.cosh, - paddle.sinh, - paddle.abs, - paddle.acos, - paddle.asin, - paddle.atan, - paddle.ceil, - paddle.cos, - paddle.exp, - paddle.floor, - paddle.log, - paddle.log1p, - paddle.reciprocal, - paddle.round, - paddle.sin, - paddle.sqrt, - paddle.square, - paddle.tanh, - paddle.acosh, - paddle.asinh, - paddle.atanh, - paddle.expm1, - paddle.log10, - paddle.log2, - paddle.tan, - paddle.erf, - paddle.erfinv, - paddle.rsqrt, - paddle.sign, - paddle.deg2rad, - paddle.rad2deg, - paddle.neg, - paddle.logit, - paddle.trunc, - paddle.digamma, - paddle.lgamma, - paddle.poisson, - paddle.bernoulli, - paddle.nn.functional.softmax, - paddle.nn.functional.log_softmax, - paddle.nn.functional.gumbel_softmax, - paddle.nn.functional.alpha_dropout, -] - -inplace_unary_api_list = [ - paddle.nn.functional.relu_, - paddle.nn.functional.tanh_, - paddle.tensor.sigmoid_, - paddle.tensor.ceil_, - paddle.tensor.floor_, - paddle.tensor.reciprocal_, - paddle.tensor.exp_, - paddle.tensor.sqrt_, -] - - -# Use to test zero-dim in unary API. 
-class TestUnaryAPI(unittest.TestCase): - def test_dygraph_unary(self): - paddle.disable_static() - for api in unary_api_list: - x = paddle.rand([]) - x.stop_gradient = False - out = api(x) - - out.retain_grads() - out.backward() - - self.assertEqual(x.shape, []) - self.assertEqual(out.shape, []) - if x.grad is not None: - self.assertEqual(x.grad.shape, []) - self.assertEqual(out.grad.shape, []) - - for api in inplace_unary_api_list: - x = paddle.rand([]) - out = api(x) - self.assertEqual(x.shape, []) - self.assertEqual(out.shape, []) - - paddle.enable_static() - - @test_with_pir_api - def test_static_unary(self): - paddle.enable_static() - - for api in unary_api_list: - main_prog = paddle.static.Program() - block = main_prog.global_block() - exe = paddle.static.Executor() - with paddle.static.program_guard( - main_prog, paddle.static.Program() - ): - x = paddle.rand([]) - x.stop_gradient = False - out = api(x) - fetch_list = [x, out] - grad_list = paddle.static.append_backward( - out, parameter_list=fetch_list - ) - fetch_list.extend( - [ - _grad - for _param, _grad in grad_list - if isinstance( - _grad, - (paddle.pir.Value, paddle.base.framework.Variable), - ) - ] - ) - - # 1) Test Program - res = exe.run(main_prog, fetch_list=fetch_list) - for item in res: - self.assertEqual(item.shape, ()) - - # 2) Test CompiledProgram Program - if not paddle.framework.in_pir_mode(): - compile_prog = paddle.static.CompiledProgram(main_prog) - res = exe.run(compile_prog, fetch_list=fetch_list) - for item in res: - self.assertEqual(item.shape, ()) - - paddle.disable_static() - - -reduce_api_list = [ - paddle.sum, - paddle.mean, - paddle.nansum, - paddle.nanmean, - paddle.median, - paddle.nanmedian, - paddle.min, - paddle.max, - paddle.amin, - paddle.amax, - paddle.prod, - paddle.logsumexp, - paddle.all, - paddle.any, - paddle.count_nonzero, -] - - -# Use to test zero-dim of reduce API -class TestReduceAPI(unittest.TestCase): - def assertShapeEqual(self, out, target_tuple): - if not paddle.framework.in_pir_mode(): - out_shape = list(out.shape) - else: - out_shape = out.shape - self.assertEqual(out_shape, target_tuple) - - def test_dygraph_reduce(self): - paddle.disable_static() - for api in reduce_api_list: - # 1) x is 0D - if api in [paddle.all, paddle.any]: - x = paddle.randint(0, 2, []).astype('bool') - else: - x = paddle.rand([]) - x.stop_gradient = False - out = api(x, axis=None) - - out.retain_grads() - out.backward() - - self.assertEqual(x.shape, []) - self.assertEqual(out.shape, []) - if api not in [paddle.count_nonzero]: - np.testing.assert_allclose(out.numpy(), x.numpy()) - - if api not in [paddle.median, paddle.nanmedian]: - out_empty_list = api(x, axis=[]) - self.assertEqual(out_empty_list, out) - self.assertEqual(out_empty_list.shape, []) - - if x.grad is not None: - self.assertEqual(x.grad.shape, []) - self.assertEqual(out.grad.shape, []) - np.testing.assert_allclose(x.grad.numpy(), np.array(1.0)) - np.testing.assert_allclose(out.grad.numpy(), np.array(1.0)) - - out1 = api(x, axis=0) - self.assertEqual(out1.shape, []) - self.assertEqual(out1, out) - out1.backward() - - out2 = api(x, axis=-1) - self.assertEqual(out2.shape, []) - self.assertEqual(out2, out) - out2.backward() - - if x.grad is not None: - self.assertEqual(x.grad.shape, []) - np.testing.assert_allclose(x.grad.numpy(), np.array(3.0)) - - # 2) x is 1D, axis=0, reduce to 0D - if api in [paddle.all, paddle.any]: - x = paddle.randint(0, 2, [5]).astype('bool') - else: - x = paddle.rand([5]) - x.stop_gradient = False - out = api(x, 
axis=0) - out.retain_grads() - out.backward() - - self.assertEqual(out.shape, []) - if x.grad is not None: - self.assertEqual(out.grad.shape, []) - self.assertEqual(x.grad.shape, [5]) - - # 3) x is ND, reduce to 0D - if api in [paddle.all, paddle.any]: - x = paddle.randint(0, 2, [3, 5]).astype('bool') - else: - x = paddle.rand([3, 5]) - x.stop_gradient = False - out = api(x, axis=None) - out.retain_grads() - out.backward() - - self.assertEqual(out.shape, []) - if x.grad is not None: - self.assertEqual(out.grad.shape, []) - self.assertEqual(x.grad.shape, [3, 5]) - - # 4) x is ND, reduce to 0D, keepdim=True - if api in [paddle.all, paddle.any]: - x = paddle.randint(0, 2, [3, 5]).astype('bool') - else: - x = paddle.rand([3, 5]) - x.stop_gradient = False - out = api(x, keepdim=True) - out.retain_grads() - out.backward() - - self.assertEqual(out.shape, [1, 1]) - if x.grad is not None: - self.assertEqual(out.grad.shape, [1, 1]) - self.assertEqual(x.grad.shape, [3, 5]) - - paddle.enable_static() - - # TODO(SigureMo): Temporarily disable this test case in due to hanging in mac CI. - # @test_with_pir_api - def test_static_reduce(self): - paddle.enable_static() - for api in reduce_api_list: - main_prog = paddle.static.Program() - block = main_prog.global_block() - exe = paddle.static.Executor() - with paddle.static.program_guard( - main_prog, paddle.static.Program() - ): - # 1) x is 0D - if api in [paddle.all, paddle.any]: - x = paddle.randint(0, 2, []).astype('bool') - else: - x = paddle.rand([]) - x.stop_gradient = False - out = api(x, axis=None) - grad_list = paddle.static.append_backward( - out, parameter_list=[x, out] - ) - - if api not in [paddle.median, paddle.nanmedian]: - out_empty_list = api(x, axis=[]) - self.assertShapeEqual(out_empty_list, []) - - out1 = api(x, axis=0) - self.assertShapeEqual(out1, []) - - out2 = api(x, axis=-1) - self.assertShapeEqual(out2, []) - - fetch_list = [x, out] - - fetch_list.extend( - [ - _grad - for _param, _grad in grad_list - if isinstance( - _grad, - (paddle.pir.Value, paddle.base.framework.Variable), - ) - ] - ) - res = exe.run(main_prog, fetch_list=fetch_list) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - if api not in [paddle.count_nonzero]: - np.testing.assert_allclose(res[0], res[1]) - - if len(res) > 2: - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - np.testing.assert_allclose(res[2], np.array(1.0)) - np.testing.assert_allclose(res[3], np.array(1.0)) - - # 2) x is ND, reduce to 0D - if api in [paddle.all, paddle.any]: - x = paddle.randint(0, 2, [3, 5]).astype('bool') - else: - x = paddle.rand([3, 5]) - x.stop_gradient = False - out = api(x, axis=None) - grad_list = paddle.static.append_backward( - out, parameter_list=[out, x] - ) - - fetch_list = [out] - fetch_list.extend( - [ - _grad - for _param, _grad in grad_list - if isinstance( - _grad, - (paddle.pir.Value, paddle.base.framework.Variable), - ) - ] - ) - - res = exe.run(main_prog, fetch_list=fetch_list) - self.assertEqual(res[0].shape, ()) - if len(res) > 1: - self.assertEqual(res[1].shape, ()) - if len(res) > 2: - self.assertEqual(res[2].shape, (3, 5)) - - # 3) x is 1D, axis=0, reduce to 0D - if api in [paddle.all, paddle.any]: - x = paddle.randint(0, 2, [5]).astype('bool') - else: - x = paddle.rand([5]) - x.stop_gradient = False - out = api(x, axis=0) - grad_list = paddle.static.append_backward( - out, parameter_list=[out, x] - ) - - fetch_list = [out] - fetch_list.extend( - [ - _grad - for _param, _grad in grad_list - if isinstance( 
- _grad, - (paddle.pir.Value, paddle.base.framework.Variable), - ) - ] - ) - - res = exe.run(main_prog, fetch_list=fetch_list) - self.assertEqual(res[0].shape, ()) - if len(res) > 1: - self.assertEqual(res[1].shape, ()) - if len(res) > 2: - self.assertEqual(res[2].shape, (5,)) - - paddle.disable_static() - - -binary_api_list = [ - {'func': paddle.add, 'cls_method': '__add__'}, - {'func': paddle.subtract, 'cls_method': '__sub__'}, - {'func': paddle.multiply, 'cls_method': '__mul__'}, - {'func': paddle.divide, 'cls_method': '__div__'}, - {'func': paddle.pow, 'cls_method': '__pow__'}, - {'func': paddle.equal, 'cls_method': '__eq__'}, - {'func': paddle.not_equal, 'cls_method': '__ne__'}, - {'func': paddle.greater_equal, 'cls_method': '__ge__'}, - {'func': paddle.greater_than, 'cls_method': '__gt__'}, - {'func': paddle.less_equal, 'cls_method': '__le__'}, - {'func': paddle.less_than, 'cls_method': '__lt__'}, - {'func': paddle.remainder, 'cls_method': '__mod__'}, - paddle.mod, - paddle.floor_mod, - paddle.logical_and, - paddle.logical_or, - paddle.logical_xor, - paddle.maximum, - paddle.minimum, - paddle.fmax, - paddle.fmin, - paddle.complex, - paddle.kron, - paddle.logaddexp, - paddle.nextafter, - paddle.ldexp, - paddle.polar, - paddle.heaviside, -] - -binary_int_api_list = [ - paddle.bitwise_and, - paddle.bitwise_or, - paddle.bitwise_xor, - paddle.gcd, - paddle.lcm, -] - - -inplace_binary_api_list = [ - paddle.tensor.add_, - paddle.tensor.subtract_, - paddle.tensor.multiply_, - paddle.tensor.remainder_, - paddle.tensor.remainder_, -] - - -# Use to test zero-dim of binary API -class TestBinaryAPI(unittest.TestCase): - def test_dygraph_binary(self): - paddle.disable_static() - for api in binary_api_list: - # 1) x is 0D, y is 0D - x = paddle.rand([]) - y = paddle.rand([]) - x.stop_gradient = False - y.stop_gradient = False - if isinstance(api, dict): - out = api['func'](x, y) - out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) - np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) - else: - out = api(x, y) - - out.retain_grads() - out.backward() - - self.assertEqual(x.shape, []) - self.assertEqual(y.shape, []) - self.assertEqual(out.shape, []) - if x.grad is not None: - self.assertEqual(x.grad.shape, []) - self.assertEqual(y.grad.shape, []) - self.assertEqual(out.grad.shape, []) - - # 2) x is ND, y is 0D - x = paddle.rand([2, 3, 4]) - y = paddle.rand([]) - x.stop_gradient = False - y.stop_gradient = False - if isinstance(api, dict): - out = api['func'](x, y) - out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) - np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) - else: - out = api(x, y) - - out.retain_grads() - out.backward() - - self.assertEqual(x.shape, [2, 3, 4]) - self.assertEqual(y.shape, []) - self.assertEqual(out.shape, [2, 3, 4]) - if x.grad is not None: - self.assertEqual(x.grad.shape, [2, 3, 4]) - self.assertEqual(y.grad.shape, []) - self.assertEqual(out.grad.shape, [2, 3, 4]) - - # 3) x is 0D , y is ND - x = paddle.rand([]) - y = paddle.rand([2, 3, 4]) - x.stop_gradient = False - y.stop_gradient = False - if isinstance(api, dict): - out = api['func'](x, y) - out_cls = getattr(paddle.Tensor, api['cls_method'])(x, y) - np.testing.assert_array_equal(out_cls.numpy(), out.numpy()) - else: - out = api(x, y) - - out.retain_grads() - out.backward() - - self.assertEqual(x.shape, []) - self.assertEqual(y.shape, [2, 3, 4]) - self.assertEqual(out.shape, [2, 3, 4]) - if x.grad is not None: - self.assertEqual(x.grad.shape, []) - self.assertEqual(y.grad.shape, [2, 3, 
4]) - self.assertEqual(out.grad.shape, [2, 3, 4]) - - # 4) x is 0D , y is scalar - x = paddle.rand([]) - x.stop_gradient = False - y = 0.5 - if isinstance(api, dict): - out = getattr(paddle.Tensor, api['cls_method'])(x, y) - - out.retain_grads() - out.backward() - - self.assertEqual(x.shape, []) - self.assertEqual(out.shape, []) - if x.grad is not None: - self.assertEqual(x.grad.shape, []) - self.assertEqual(out.grad.shape, []) - - for api in binary_int_api_list: - # 1) x is 0D, y is 0D - x_np = np.random.randint(-10, 10, []) - y_np = np.random.randint(-10, 10, []) - out_np = eval('np.%s(x_np, y_np)' % api.__name__) - - x = paddle.to_tensor(x_np) - y = paddle.to_tensor(y_np) - out = api(x, y) - - self.assertEqual(out.shape, []) - np.testing.assert_array_equal(out.numpy(), out_np) - - # 2) x is ND, y is 0D - x_np = np.random.randint(-10, 10, [3, 5]) - y_np = np.random.randint(-10, 10, []) - out_np = eval('np.%s(x_np, y_np)' % api.__name__) - - x = paddle.to_tensor(x_np) - y = paddle.to_tensor(y_np) - out = api(x, y) - - self.assertEqual(out.shape, [3, 5]) - np.testing.assert_array_equal(out.numpy(), out_np) - - # 3) x is 0D , y is ND - x_np = np.random.randint(-10, 10, []) - y_np = np.random.randint(-10, 10, [3, 5]) - out_np = eval('np.%s(x_np, y_np)' % api.__name__) - - x = paddle.to_tensor(x_np) - y = paddle.to_tensor(y_np) - out = api(x, y) - - self.assertEqual(out.shape, [3, 5]) - np.testing.assert_array_equal(out.numpy(), out_np) - - for api in inplace_binary_api_list: - with paddle.no_grad(): - x = paddle.rand([]) - y = paddle.rand([]) - out = api(x, y) - self.assertEqual(x.shape, []) - self.assertEqual(out.shape, []) - - x = paddle.rand([3, 5]) - y = paddle.rand([]) - out = api(x, y) - self.assertEqual(x.shape, [3, 5]) - self.assertEqual(out.shape, [3, 5]) - - paddle.enable_static() - - def test_static_binary(self): - paddle.enable_static() - for api in binary_api_list: - main_prog = paddle.static.Program() - block = main_prog.global_block() - with paddle.static.program_guard( - main_prog, paddle.static.Program() - ): - # 1) x is 0D, y is 0D - x = paddle.rand([]) - y = paddle.rand([]) - x.stop_gradient = False - y.stop_gradient = False - if isinstance(api, dict): - out = api['func'](x, y) - out_cls = getattr( - paddle.static.Variable, api['cls_method'] - )(x, y) - self.assertEqual(out.shape, out_cls.shape) - else: - out = api(x, y) - paddle.static.append_backward(out) - - self.assertEqual(x.shape, ()) - self.assertEqual(y.shape, ()) - self.assertEqual(out.shape, ()) - if block.has_var(x.grad_name): - out_grad = block.var(out.grad_name) - x_grad = block.var(x.grad_name) - y_grad = block.var(y.grad_name) - - self.assertEqual(x_grad.shape, ()) - self.assertEqual(y_grad.shape, ()) - self.assertEqual(out_grad.shape, ()) - - # 2) x is 0D, y is ND - x = paddle.rand([]) - y = paddle.rand([2, 3, 4]) - x.stop_gradient = False - y.stop_gradient = False - if isinstance(api, dict): - out = api['func'](x, y) - out_cls = getattr( - paddle.static.Variable, api['cls_method'] - )(x, y) - self.assertEqual(out.shape, out_cls.shape) - else: - out = api(x, y) - paddle.static.append_backward(out) - - self.assertEqual(x.shape, ()) - self.assertEqual(y.shape, (2, 3, 4)) - self.assertEqual(out.shape, (2, 3, 4)) - if block.has_var(x.grad_name): - out_grad = block.var(out.grad_name) - x_grad = block.var(x.grad_name) - y_grad = block.var(y.grad_name) - - self.assertEqual(x_grad.shape, ()) - self.assertEqual(y_grad.shape, (2, 3, 4)) - self.assertEqual(out_grad.shape, (2, 3, 4)) - - # 3) x is ND, y is 0d - x = 
paddle.rand([2, 3, 4]) - y = paddle.rand([]) - x.stop_gradient = False - y.stop_gradient = False - if isinstance(api, dict): - out = api['func'](x, y) - out_cls = getattr( - paddle.static.Variable, api['cls_method'] - )(x, y) - self.assertEqual(out.shape, out_cls.shape) - else: - out = api(x, y) - paddle.static.append_backward(out) - - self.assertEqual(x.shape, (2, 3, 4)) - self.assertEqual(y.shape, ()) - self.assertEqual(out.shape, (2, 3, 4)) - if block.has_var(x.grad_name): - out_grad = block.var(out.grad_name) - x_grad = block.var(x.grad_name) - y_grad = block.var(y.grad_name) - - self.assertEqual(x_grad.shape, (2, 3, 4)) - self.assertEqual(y_grad.shape, ()) - self.assertEqual(out_grad.shape, (2, 3, 4)) - - # 4) x is 0D , y is scalar - x = paddle.rand([]) - x.stop_gradient = False - y = 0.5 - if isinstance(api, dict): - out = getattr(paddle.static.Variable, api['cls_method'])( - x, y - ) - paddle.static.append_backward(out) - - self.assertEqual(x.shape, ()) - self.assertEqual(out.shape, ()) - if block.has_var(x.grad_name): - out_grad = block.var(out.grad_name) - x_grad = block.var(x.grad_name) - - self.assertEqual(out_grad.shape, ()) - self.assertEqual(x_grad.shape, ()) - - for api in binary_int_api_list: - main_prog = paddle.static.Program() - with paddle.static.program_guard( - main_prog, paddle.static.Program() - ): - # 1) x is 0D, y is 0D - x = paddle.randint(-10, 10, []) - y = paddle.randint(-10, 10, []) - out = api(x, y) - self.assertEqual(out.shape, ()) - - # 2) x is ND , y is 0D - x = paddle.randint(-10, 10, [3, 5]) - y = paddle.randint(-10, 10, []) - out = api(x, y) - self.assertEqual(out.shape, (3, 5)) - - # 3) x is 0D , y is ND - x = paddle.randint(-10, 10, []) - y = paddle.randint(-10, 10, [3, 5]) - out = api(x, y) - self.assertEqual(out.shape, (3, 5)) - - paddle.disable_static() - - -# Use to test zero-dim of Sundry API, which is unique and can not be classified -# with others. It can be implemented here flexibly. 
-class TestSundryAPI(unittest.TestCase): - def setUp(self): - paddle.disable_static() - self.x = paddle.rand([]) - - def test_polygamma(self): - x = paddle.rand([]) - x.stop_gradient = False - out = paddle.polygamma(x, 2) - out.backward() - - self.assertEqual(out.shape, []) - self.assertEqual(x.grad.shape, []) - - def test_frexp(self): - x = paddle.rand([]) - x.stop_gradient = False - out1, out2 = paddle.frexp(x) - out1.backward() - - self.assertEqual(out1.shape, []) - self.assertEqual(out2.shape, []) - self.assertEqual(x.grad.shape, []) - - def test_pairwise_distance(self): - x = paddle.rand([5]) - x.stop_gradient = False - y = paddle.rand([5]) - y.stop_gradient = False - - out = paddle.nn.functional.pairwise_distance(x, y) - out.backward() - self.assertEqual(out.shape, []) - self.assertEqual(x.grad.shape, [5]) - - def test_take(self): - x = paddle.rand([4, 5]) - x.stop_gradient = False - out = paddle.take(x, paddle.to_tensor(2)) - out.backward() - - self.assertEqual(out.shape, []) - self.assertEqual(x.grad.shape, [4, 5]) - np.testing.assert_allclose(x.grad[0, 2], 1.0) - - x = paddle.rand([]) - x.stop_gradient = False - out = paddle.take(x, paddle.to_tensor(0)) - out.backward() - - self.assertEqual(out.shape, []) - np.testing.assert_allclose(out, x) - self.assertEqual(x.grad.shape, []) - np.testing.assert_allclose(x.grad.numpy(), 1.0) - - def test_trapezoid(self): - y = paddle.rand([5]) - y.stop_gradient = False - out = paddle.trapezoid(y, dx=2.0) - out.backward() - - self.assertEqual(out.shape, []) - self.assertEqual(y.grad.shape, [5]) - - def test_create_parameter_var(self): - zero_dim_param = paddle.create_parameter(shape=[], dtype='float32') - self.assertEqual(zero_dim_param.shape, []) - - zero_dim_var = paddle.tensor.creation.create_global_var( - shape=[], value=0.5, dtype='float32' - ) - self.assertEqual(zero_dim_var.shape, []) - self.assertEqual(zero_dim_var.item(), 0.5) - - def test_getitem(self): - # case1: When all axis have a scalar indice, output should be a 0-d Tensor; - x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) - x.stop_gradient = False - out = x[1, 2, 3, 4] - out.retain_grads() - out.backward() - self.assertEqual(out.shape, []) - np.testing.assert_allclose(out, np.array(119)) - self.assertEqual(out.grad.shape, []) - np.testing.assert_allclose(out.grad, 1.0) - self.assertEqual(x.grad.shape, [2, 3, 4, 5]) - x_grad_expected = np.zeros((2, 3, 4, 5)) - x_grad_expected[1, 2, 3, 4] = 1.0 - np.testing.assert_allclose(x.grad, x_grad_expected) - - # case2: When one axis has a 0-d Tensor indice, the output should be same as int indice. - x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) - out1 = x[1, 2] - out2 = x[ - paddle.full([], 1, dtype='int32'), paddle.full([], 2, dtype='int32') - ] - np.testing.assert_allclose(out1, out2) - - # case3: When all axis have a scalar indice (i.e. case1) and has None indice, - # ndim of output should be same with numbers of None. - x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) - out1 = x[1, 2, None, 3, 4] - self.assertEqual(out1.shape, [1]) - np.testing.assert_allclose(out1, np.array([119])) - out2 = x[1, None, 2, None, 3, 4] - self.assertEqual(out2.shape, [1, 1]) - np.testing.assert_allclose(out2, np.array([[119]])) - - # case4: 1-D Tensor will be treated as vector, no axis decrease will happen. 
- x = paddle.ones((2, 3, 4)) - indice = paddle.ones([1], dtype='int32') - out1 = x[indice] - self.assertEqual(out1.shape, [1, 3, 4]) - np.testing.assert_allclose(out1, np.ones((1, 3, 4))) - out2 = x[indice, indice] - self.assertEqual(out2.shape, [1, 4]) - np.testing.assert_allclose(out2, np.ones((1, 4))) - - def test_setitem(self): - # case1: all axis have a scalar indice - x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5)) - x.stop_gradient = False - out = x * 2 - out[1, 2, 3, 4] = 10 - out.backward() - - self.assertEqual(out.shape, x.shape) - np.testing.assert_allclose(out[1, 2, 3, 4], np.array(10)) - self.assertEqual(x.grad.shape, [2, 3, 4, 5]) - x_grad_expected = np.ones((2, 3, 4, 5)) * 2 - x_grad_expected[1, 2, 3, 4] = 0 - np.testing.assert_allclose(x.grad, x_grad_expected) - - # case2: 0-D Tensor indice in some axis - # NOTE(zoooo0820): Now, int/slice with 0-D Tensor will still be - # treated as combined indexing, which is not support backward. - # There should have more test cases such as out[1, indice, :] = 0.5 when this - # problem is fixed. - x = paddle.randn((2, 3, 4, 5)) - x.stop_gradient = False - indice = paddle.full([], 1, dtype='int32') - out = x * 1 - out[indice, indice] = 0.5 - out.backward() - - self.assertEqual(out.shape, x.shape) - np.testing.assert_allclose(out[1, 1], np.ones((4, 5)) * 0.5) - x_grad_expected = np.ones((2, 3, 4, 5)) - x_grad_expected[1, 1] = 0 - np.testing.assert_allclose(x.grad, x_grad_expected) - - # case3:0-D Tensor indice in some axis, value is a Tensor - # and there is broadcast - x = paddle.randn((2, 3, 4, 5)) - x.stop_gradient = False - v = paddle.ones((4, 5), dtype='float32') * 5 - v.stop_gradient = False - indice = paddle.full([], 1, dtype='int32') - out = x * 1 - out[indice] = v - out.backward() - - self.assertEqual(out.shape, x.shape) - np.testing.assert_allclose(out[1], np.ones((3, 4, 5)) * 5) - x_grad_expected = np.ones((2, 3, 4, 5)) - x_grad_expected[1] = 0 - np.testing.assert_allclose(x.grad, x_grad_expected) - value_grad_expected = np.ones((4, 5)) * 3 - np.testing.assert_allclose(v.grad, value_grad_expected) - - # case4: value is a 0-D tensor and there is broadcast - x = paddle.randn((2, 3, 4, 5)) - x.stop_gradient = False - v = paddle.ones([], dtype='float32') * 5 - v.stop_gradient = False - out = x * 1 - indice = paddle.full([], 0, dtype='int32') - out[indice] = v - out.backward() - - self.assertEqual(out.shape, x.shape) - self.assertEqual(v.grad.shape, []) - np.testing.assert_allclose(out[0], np.ones((3, 4, 5)) * 5) - x_grad_expected = np.ones((2, 3, 4, 5)) - x_grad_expected[0] = 0 - np.testing.assert_allclose(x.grad, x_grad_expected) - value_grad_expected = np.ones(()) * 3 * 4 * 5 - np.testing.assert_allclose(v.grad, value_grad_expected) - - # case5: indice / value is 0-D Tensor, and there is no broadcast - x = paddle.randn((2, 3, 4, 5)) - x.stop_gradient = False - v = paddle.ones([], dtype='float32') * 2 - v.stop_gradient = False - out = x * 1 - indice = paddle.full([], 0, dtype='int32') - out[indice, indice, indice, indice] = v - out.backward() - - self.assertEqual(out.shape, x.shape) - self.assertEqual(v.grad.shape, []) - np.testing.assert_allclose(out[0, 0, 0, 0], np.ones(()) * 2) - x_grad_expected = np.ones((2, 3, 4, 5)) - x_grad_expected[0, 0, 0, 0] = 0 - np.testing.assert_allclose(x.grad, x_grad_expected) - value_grad_expected = np.ones(()) - np.testing.assert_allclose(v.grad, value_grad_expected) - - def test_expand(self): - # case1 - x = paddle.full([], 1, 'float32') - x.stop_gradient = False - out = paddle.expand(x, 
shape=[1]) - out.retain_grads() - out.backward() - - self.assertEqual(out.shape, [1]) - np.testing.assert_allclose(out, 1.0) - self.assertEqual(x.grad.shape, []) - np.testing.assert_allclose(x.grad, 1.0) - self.assertEqual(out.grad.shape, [1]) - np.testing.assert_allclose(out.grad, 1.0) - - # case2 - x1 = paddle.full([], 1, 'float32') - x1.stop_gradient = False - out1 = paddle.expand(x1, shape=[]) - out1.retain_grads() - out1.backward() - - self.assertEqual(out1.shape, []) - np.testing.assert_allclose(out1, 1.0) - self.assertEqual(x1.grad.shape, []) - np.testing.assert_allclose(x1.grad, 1.0) - self.assertEqual(out1.grad.shape, []) - np.testing.assert_allclose(out1.grad, 1.0) - - # case3 - x2 = paddle.full([], 1, 'float32') - x2.stop_gradient = False - out2 = paddle.expand(x2, shape=[1, 1]) - out2.retain_grads() - out2.backward() - - self.assertEqual(out2.shape, [1, 1]) - np.testing.assert_allclose(out2, 1.0) - self.assertEqual(x2.grad.shape, []) - np.testing.assert_allclose(x2.grad, 1.0) - self.assertEqual(out2.grad.shape, [1, 1]) - np.testing.assert_allclose(out2.grad, 1.0) - - # case4 - x3 = paddle.full([], 1, 'float32') - x3.stop_gradient = False - out3 = paddle.expand(x3, shape=[3, 3]) - out3.retain_grads() - out3.backward() - - self.assertEqual(out3.shape, [3, 3]) - np.testing.assert_allclose(out3, 1.0) - self.assertEqual(x3.grad.shape, []) - np.testing.assert_allclose(x3.grad, 9.0) - self.assertEqual(out3.grad.shape, [3, 3]) - np.testing.assert_allclose(out3.grad, 1.0) - - def test_expand_as(self): - x = paddle.full([], 1, 'float32') - x.stop_gradient = False - y = paddle.full([], 1, 'float32') - y.stop_gradient = False - out = paddle.expand_as(x, y) - out.backward() - self.assertEqual(x.shape, []) - self.assertEqual(x.item(), 1.0) - self.assertEqual(x.grad.shape, []) - self.assertEqual(x.grad.item(), 1.0) - self.assertEqual(out.shape, []) - self.assertEqual(out.item(), 1.0) - self.assertEqual(out.grad, None) - - x1 = paddle.full([], 1, 'float32') - x1.stop_gradient = False - y1 = paddle.full([1], 1, 'float32') - out1 = paddle.expand_as(x1, y1) - out1.backward() - self.assertEqual(x1.shape, []) - self.assertEqual(x1.item(), 1.0) - self.assertEqual(x1.grad.shape, []) - self.assertEqual(x1.grad.item(0), 1.0) - self.assertEqual(out1.shape, [1]) - self.assertEqual(out1.item(0), 1.0) - self.assertEqual(out1.grad, None) - - x2 = paddle.full([], 1, 'float32') - x2.stop_gradient = False - y2 = paddle.full([3, 3], 1, 'float32') - out2 = paddle.expand_as(x2, y2) - out2.backward() - self.assertEqual(x2.shape, []) - self.assertEqual(x2.item(), 1.0) - self.assertEqual(x2.grad.shape, []) - self.assertEqual(x2.grad.item(0), 9.0) - self.assertEqual(out2.shape, [3, 3]) - self.assertEqual(out2.item(0), 1.0) - self.assertEqual(out2.grad, None) - - def test_top_k(self): - x = paddle.full([], 1, 'float32') - x.stop_gradient = False - out, indices = paddle.topk(x, k=1, axis=0) - out.retain_grads() - out.backward() - self.assertEqual(indices.shape, []) - self.assertEqual(indices.item(), 0) - self.assertEqual(x.shape, []) - self.assertEqual(x.item(), 1.0) - self.assertEqual(x.grad.shape, []) - self.assertEqual(x.grad.item(0), 1.0) - self.assertEqual(out.shape, []) - self.assertEqual(out.item(), 1.0) - self.assertEqual(out.grad, 1.0) - - x1 = paddle.full([], 1, 'float32') - x1.stop_gradient = False - out1, indices1 = paddle.topk(x1, k=1, axis=-1) - out1.retain_grads() - out1.backward() - self.assertEqual(indices1.shape, []) - self.assertEqual(indices1.item(), 0) - self.assertEqual(x1.shape, []) - 
-        self.assertEqual(x1.item(), 1.0)
-        self.assertEqual(x1.grad.shape, [])
-        self.assertEqual(x1.grad.item(0), 1.0)
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out1.item(), 1.0)
-        self.assertEqual(out1.grad, 1.0)
-
-        with self.assertRaises(ValueError):
-            tmp = paddle.topk(x1, k=1, axis=2)
-
-    def test_broadcast_to(self):
-        x = paddle.full([], 1, 'float32')
-        x.stop_gradient = False
-        out = paddle.broadcast_to(x, shape=[1])
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [1])
-        np.testing.assert_allclose(out, 1.0)
-        self.assertEqual(x.grad.shape, [])
-        np.testing.assert_allclose(x.grad, 1.0)
-        self.assertEqual(out.grad.shape, [1])
-        np.testing.assert_allclose(out.grad, 1.0)
-
-        # case2
-        x1 = paddle.full([], 1, 'float32')
-        x1.stop_gradient = False
-        out1 = paddle.broadcast_to(x1, shape=[])
-        out1.retain_grads()
-        out1.backward()
-
-        self.assertEqual(out1.shape, [])
-        np.testing.assert_allclose(out1, 1.0)
-        self.assertEqual(x1.grad.shape, [])
-        np.testing.assert_allclose(x1.grad, 1.0)
-        self.assertEqual(out1.grad.shape, [])
-        np.testing.assert_allclose(out1.grad, 1.0)
-
-        # case3
-        x2 = paddle.full([], 1, 'float32')
-        x2.stop_gradient = False
-        out2 = paddle.broadcast_to(x2, shape=[1, 1])
-        out2.retain_grads()
-        out2.backward()
-
-        self.assertEqual(out2.shape, [1, 1])
-        np.testing.assert_allclose(out2, 1.0)
-        self.assertEqual(x2.grad.shape, [])
-        np.testing.assert_allclose(x2.grad, 1.0)
-        self.assertEqual(out2.grad.shape, [1, 1])
-        np.testing.assert_allclose(out2.grad, 1.0)
-
-        # case4
-        x3 = paddle.full([], 1, 'float32')
-        x3.stop_gradient = False
-        out3 = paddle.broadcast_to(x3, shape=[3, 3])
-        out3.retain_grads()
-        out3.backward()
-
-        self.assertEqual(out3.shape, [3, 3])
-        np.testing.assert_allclose(out3, 1.0)
-        self.assertEqual(x3.grad.shape, [])
-        np.testing.assert_allclose(x3.grad, 9.0)
-        self.assertEqual(out3.grad.shape, [3, 3])
-        np.testing.assert_allclose(out3.grad, 1.0)
-
-    def test_broadcast_tensors(self):
-        # 1) x is 0D, y is 0D
-        x1 = paddle.full([], 2.0)
-        x1.stop_gradient = False
-        x2 = paddle.full([], 2.0)
-        x2.stop_gradient = False
-        out1, out2 = paddle.broadcast_tensors([x1, x2])
-        # backward has a bug now
-        # out1.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out2.shape, [])
-        # self.assertEqual(x1.grad.shape, [])
-
-        # 2) x is ND, y is 0D
-        x1 = paddle.full([2, 3], 2.0)
-        x1.stop_gradient = False
-        x2 = paddle.full([], 2.0)
-        x2.stop_gradient = False
-        out1, out2 = paddle.broadcast_tensors([x1, x2])
-        # out1.backward()
-
-        self.assertEqual(out1.shape, [2, 3])
-        self.assertEqual(out2.shape, [2, 3])
-        # self.assertEqual(x1.grad.shape, [2, 3])
-
-        # 3) x is 0D, y is ND
-        x1 = paddle.full([], 2.0)
-        x1.stop_gradient = False
-        x2 = paddle.full([2, 3], 2.0)
-        x2.stop_gradient = False
-        out1, out2 = paddle.broadcast_tensors([x1, x2])
-        # out1.backward()
-
-        self.assertEqual(out1.shape, [2, 3])
-        self.assertEqual(out2.shape, [2, 3])
-        # self.assertEqual(x1.grad.shape, [2, 3])
-
-    def test_broadcast_shape(self):
-        x = []
-        y = [3, 5]
-        out = paddle.broadcast_shape(x, y)
-        self.assertEqual(out, [3, 5])
-
-        x = [3, 5]
-        y = []
-        out = paddle.broadcast_shape(x, y)
-        self.assertEqual(out, [3, 5])
-
-        x = []
-        y = []
-        out = paddle.broadcast_shape(x, y)
-        self.assertEqual(out, [])
-
-    def test_argmin(self):
-        # 1) x is 0D
-        x = paddle.rand([])
-        out1 = paddle.argmin(x, 0)
-        out2 = paddle.argmin(x, -1)
-        out3 = paddle.argmin(x, None)
-
-        self.assertEqual(out1.shape, [])
-        np.testing.assert_allclose(out1, 0)
-
-        self.assertEqual(out2.shape, [])
-        np.testing.assert_allclose(out2, 0)
-
-        self.assertEqual(out3.shape, [])
-        np.testing.assert_allclose(out3, 0)
-
-        # 2) x is 1D
-        x = paddle.rand([5])
-        x.stop_gradient = False
-        out = paddle.argmin(x, 0)
-        out.backward()
-        self.assertEqual(out.shape, [])
-
-        # 3) x is ND
-        x = paddle.rand([3, 5])
-        x.stop_gradient = False
-        out = paddle.argmin(x)
-        out.backward()
-        self.assertEqual(out.shape, [])
-
-        # 4) x is ND, keepdim=True
-        x = paddle.rand([3, 5])
-        x.stop_gradient = False
-        out = paddle.argmin(x, keepdim=True)
-        out.backward()
-        self.assertEqual(out.shape, [1, 1])
-
-    def test_argmax(self):
-        # 1) x is 0D
-        x = paddle.rand([])
-        out1 = paddle.argmax(x, 0)
-        out2 = paddle.argmax(x, -1)
-        out3 = paddle.argmax(x, None)
-
-        self.assertEqual(out1.shape, [])
-        np.testing.assert_allclose(out1, 0)
-
-        self.assertEqual(out2.shape, [])
-        np.testing.assert_allclose(out2, 0)
-
-        self.assertEqual(out3.shape, [])
-        np.testing.assert_allclose(out3, 0)
-
-        # 2) x is 1D
-        x = paddle.rand([5])
-        out = paddle.argmax(x, 0)
-        self.assertEqual(out.shape, [])
-
-        # 3) x is ND
-        x = paddle.rand([3, 5])
-        out = paddle.argmax(x)
-        self.assertEqual(out.shape, [])
-
-        # 4) x is ND, keepdim=True
-        x = paddle.rand([3, 5])
-        out = paddle.argmax(x, keepdim=True)
-        self.assertEqual(out.shape, [1, 1])
-
-    def test_kthvalue(self):
-        # 1) x is 0D
-        x = paddle.randn([])
-        x.stop_gradient = False
-        out, index = paddle.kthvalue(x, 1)
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out, x)
-        self.assertEqual(index.shape, [])
-        self.assertEqual(index, 0)
-
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(x.grad, 1.0)
-
-        # 2) x is 1D
-        x1 = paddle.randn([5])
-        x1.stop_gradient = False
-        out1, index1 = paddle.kthvalue(x1, 1)
-        out1.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(index1.shape, [])
-        self.assertEqual(x1.grad.shape, [5])
-
-    def test_mode(self):
-        x1 = paddle.randn([5])
-        x1.stop_gradient = False
-        out1, index1 = paddle.mode(x1)
-        out1.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(index1.shape, [])
-
-        self.assertEqual(x1.grad.shape, [5])
-
-    def test_is_empty(self):
-        # 1) x is 0D
-        x = paddle.rand([])
-        out = paddle.is_empty(x)
-        self.assertFalse(out)
-        self.assertEqual(out.shape, [])
-
-        # 2) x is 1D
-        x = paddle.rand([5])
-        out = paddle.is_empty(x)
-        self.assertFalse(out)
-        self.assertEqual(out.shape, [])
-
-        # 3) x is ND
-        x = paddle.rand([3, 5])
-        out = paddle.is_empty(x)
-        self.assertFalse(out)
-        self.assertEqual(out.shape, [])
-
-        x = paddle.rand([3, 0, 5])
-        out = paddle.is_empty(x)
-        self.assertTrue(out)
-        self.assertEqual(out.shape, [])
-
-    def test_squeeze_(self):
-        # 1) x is 0D
-        x = paddle.rand([])
-        x.squeeze_(0)
-        self.assertEqual(x.shape, [])
-
-        # 2) x is 1D
-        x = paddle.rand([1])
-        x.squeeze_(0)
-        self.assertEqual(x.shape, [])
-
-        # 3) x is ND
-        x = paddle.rand([2, 1])
-        x.squeeze_(1)
-        self.assertEqual(x.shape, [2])
-
-    def test_as_complex(self):
-        x = paddle.rand([2])
-        x.stop_gradient = False
-        out = paddle.as_complex(x)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(x.shape, [2])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(x.grad.shape, [2])
-        self.assertEqual(out.grad.shape, [])
-
-    def test_dot(self):
-        # 1) x is 1D
-        x = paddle.rand([2])
-        x.stop_gradient = False
-        y = paddle.rand([2])
-        y.stop_gradient = False
-        out = paddle.dot(x, y)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(x.grad.shape, [2])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-
-        # 2) x is 2D
-        x1 = paddle.rand([2, 2])
-        x1.stop_gradient = False
-        y1 = paddle.rand([2, 2])
-        y1.stop_gradient = False
-        out1 = paddle.dot(x1, y1)
-        out1.retain_grads()
-        out1.backward()
-
-        self.assertEqual(x1.grad.shape, [2, 2])
-        self.assertEqual(out1.shape, [2])
-        self.assertEqual(out1.grad.shape, [2])
-
-    def test_inner(self):
-        # 0) input is 0D
-        x = paddle.rand([])
-        x.stop_gradient = False
-        y = paddle.rand([])
-        y.stop_gradient = False
-        out = paddle.inner(x, y)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-
-        # 1) input is 1D
-        x = paddle.rand([2])
-        x.stop_gradient = False
-        y = paddle.rand([2])
-        y.stop_gradient = False
-        out = paddle.inner(x, y)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(x.grad.shape, [2])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-
-        # 2) input is 2D
-        x = paddle.rand([2, 3])
-        x.stop_gradient = False
-        y = paddle.rand([3, 3])
-        y.stop_gradient = False
-        out = paddle.inner(x, y)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(x.grad.shape, [2, 3])
-        self.assertEqual(out.shape, [2, 3])
-        self.assertEqual(out.grad.shape, [2, 3])
-
-    def test_tensordot(self):
-        # 1) input is 1D
-        x = paddle.arange(10, dtype='float64')
-        x.stop_gradient = False
-        y = paddle.arange(10, dtype='float64')
-        y.stop_gradient = False
-        out = paddle.tensordot(x, y, axes=1)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(x.grad.shape, [10])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-
-        # 2) input is 2D
-        x = paddle.arange(6, dtype='float64').reshape([2, 3])
-        y = paddle.arange(6, dtype='float64').reshape([2, 3])
-        x.stop_gradient = False
-        out = paddle.tensordot(x, y, axes=2)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(x.grad.shape, [2, 3])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-
-    def test_metric_accuracy(self):
-        x = paddle.full(shape=[2, 4], fill_value=0.25)
-        y = paddle.full(shape=[2, 1], fill_value=1, dtype="int64")
-        out = paddle.metric.accuracy(input=x, label=y, k=1)
-        self.assertEqual(out.shape, [])
-
-    def test_std(self):
-        # 1) x is 0D
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out1 = paddle.std(x)
-        out2 = paddle.std(x, [])
-        out1.backward()
-        out2.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(out1, 0)
-        self.assertEqual(out2, 0)
-
-        self.assertEqual(x.grad.shape, [])
-
-        # 2) x is ND
-        x = paddle.rand([3, 5])
-        x.stop_gradient = False
-        out = paddle.std(x)
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(x.grad.shape, [3, 5])
-
-    def test_var(self):
-        # 1) x is 0D
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out1 = paddle.var(x)
-        out2 = paddle.var(x, [])
-        out1.backward()
-        out2.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(out1, 0)
-        self.assertEqual(out2, 0)
-
-        self.assertEqual(x.grad.shape, [])
-        np.testing.assert_allclose(x.grad, 0)
-
-        # 2) x is ND
-        x = paddle.rand([3, 5])
-        x.stop_gradient = False
-        out = paddle.var(x)
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(x.grad.shape, [3, 5])
-
-    def test_quantile(self):
-        # 1) x is 0D
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.quantile(x, 0.5, axis=None)
-
-        out.retain_grads()
-        out.backward()
-
-        out_empty_list = paddle.quantile(x, 0.5, axis=[])
-        self.assertEqual(out_empty_list, out)
-
-        self.assertEqual(x.shape, [])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out, x)
-
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(x.grad, 1.0)
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(out.grad, 1.0)
-
-        # 2) x is ND
-        x = paddle.rand([2, 3])
-        x.stop_gradient = False
-        out = paddle.quantile(x, 0.5, axis=None)
-
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(out.grad, 1.0)
-        self.assertEqual(x.grad.shape, [2, 3])
-
-    def test_nanquantile(self):
-        # 1) x is 0D
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.nanquantile(x, 0.5, axis=None)
-
-        out.retain_grads()
-        out.backward()
-
-        out_empty_list = paddle.nanquantile(x, 0.5, axis=[])
-        self.assertEqual(out_empty_list, out)
-
-        self.assertEqual(x.shape, [])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out, x)
-
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(x.grad, 1.0)
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(out.grad, 1.0)
-
-        # 2) x is ND with 'nan'
-        x = paddle.to_tensor([[float('nan'), 2.0, 3.0], [0.0, 1.0, 2.0]])
-        x.stop_gradient = False
-        out = paddle.nanquantile(x, 0.5, axis=None)
-
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(out.grad, 1.0)
-        self.assertEqual(x.grad.shape, [2, 3])
-
-    def test_flip(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.flip(x, axis=[])
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(x.shape, [])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(out.grad.shape, [])
-
-    def test_linear(self):
-        x = paddle.randn([3, 2])
-        w = paddle.full(shape=[2, 4], fill_value=0.5)
-        b = paddle.zeros([])
-
-        np.testing.assert_array_equal(
-            F.linear(x, w, b).numpy(), F.linear(x, w).numpy()
-        )
-
-    def test_is_complex(self):
-        x = paddle.rand([]) + 1j * paddle.rand([])
-        self.assertTrue(paddle.is_complex(x))
-
-    def test_is_floating_point(self):
-        self.assertTrue(paddle.is_floating_point(self.x))
-
-    def test_is_integer(self):
-        x = paddle.randint(0, 10, [])
-        self.assertTrue(paddle.is_integer(x))
-
-    def test_is_tensor(self):
-        self.assertTrue(paddle.is_tensor(self.x))
-
-    def test_isfinite(self):
-        out = paddle.isfinite(self.x)
-        np.testing.assert_array_equal(out.numpy(), np.array(True))
-
-    def test_isinf(self):
-        x = paddle.to_tensor(np.array(float('-inf')))
-        out = paddle.isinf(x)
-        np.testing.assert_array_equal(out.numpy(), np.array(True))
-
-    def test_isnan(self):
-        x = paddle.to_tensor(np.array(float('nan')))
-        out = paddle.isnan(x)
-        np.testing.assert_array_equal(out.numpy(), np.array(True))
-
-    def test_isclose(self):
-        out = paddle.isclose(self.x, self.x)
-        np.testing.assert_array_equal(out.numpy(), np.array(True))
-
-    def test_clone(self):
-        out = paddle.clone(self.x)
-        np.testing.assert_array_equal(out.numpy(), self.x.numpy())
-
-    def test_assign(self):
-        out = paddle.assign(self.x)
-        np.testing.assert_array_equal(out.numpy(), self.x.numpy())
-
-    def test_item(self):
-        x = paddle.full([], 0.5)
-        self.assertEqual(x.item(), 0.5)
-
-    def test_tolist(self):
-        x = paddle.full([], 0.5)
-        self.assertEqual(x.tolist(), 0.5)
-
-    def test_numpy(self):
-        x = paddle.full([], 0.5)
-        x_np = x.numpy()
-        np.testing.assert_array_equal(x_np.shape, ())
-        np.testing.assert_array_equal(x_np, np.array(0.5))
-
-        x_np = x.numpy(False)
-        np.testing.assert_array_equal(x_np.shape, ())
-        np.testing.assert_array_equal(x_np, np.array(0.5))
-
-    def test_numel(self):
-        # 1) x is 0D
-        out = paddle.numel(self.x)
-        self.assertEqual(out.shape, [])
-        np.testing.assert_array_equal(out.numpy(), np.array(1))
-
-        # 2) x is ND
-        x = paddle.full([3, 5], 0.5)
-        out = paddle.numel(x)
-        self.assertEqual(out.shape, [])
-        np.testing.assert_array_equal(out.numpy(), np.array(15))
-
-    def test_rank(self):
-        # 1) x is 0D
-        x = paddle.rand([])
-        out = paddle.rank(x)
-        self.assertEqual(out.shape, [])
-        np.testing.assert_array_equal(out.numpy(), np.array(0))
-
-        # 2) x is ND
-        x = paddle.full([3, 5], 0.5)
-        out = paddle.rank(x)
-        self.assertEqual(out.shape, [])
-        np.testing.assert_array_equal(out.numpy(), np.array(2))
-
-    def test_shape(self):
-        out = paddle.shape(self.x)
-        np.testing.assert_array_equal(out.numpy(), np.array([]))
-        self.assertEqual(out.shape, [0])
-
-    def test_equal_scalar(self):
-        x = paddle.rand([])
-        out = paddle.equal(x, 2.0)
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out, False)
-
-        x1 = paddle.full([], 2.0)
-        out1 = paddle.equal(x1, 2.0)
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out1, True)
-
-    def test_pow_scalar(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.pow(x, 2.0)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x.grad.shape, [])
-
-    def test_cast(self):
-        x = paddle.full([], 1.0, 'float32')
-        x.stop_gradient = False
-        out = paddle.cast(x, 'int32')
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x.grad.shape, [])
-
-    def test_cumprod(self):
-        x = paddle.full([], 1.0, 'float32')
-        x.stop_gradient = False
-        out = paddle.cumprod(x, 0)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x.grad.shape, [])
-
-        with self.assertRaises(ValueError):
-            tmp = paddle.cumprod(x, 2)
-
-    def test_clip(self):
-        x = paddle.uniform([], None, -10, 10)
-        x.stop_gradient = False
-        out = paddle.clip(x, -5, 5)
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x.grad.shape, [])
-
-        x1 = paddle.uniform([], None, -10, 10)
-        x1.stop_gradient = False
-        out1 = paddle.clip(x1, paddle.full([], -5.0), paddle.full([], 5.0))
-        out1.retain_grads()
-        out1.backward()
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out1.grad.shape, [])
-        self.assertEqual(x1.grad.shape, [])
-
-    def test_increment(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.increment(x, 1.0)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x.grad.shape, [])
-
-    def test_bitwise_not(self):
-        x = paddle.randint(-1, 1, [])
-        out1 = ~x
-        out2 = paddle.bitwise_not(x)
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out2.shape, [])
-
-    def test_logical_not(self):
-        x = paddle.randint(0, 1, [])
-        out = paddle.logical_not(x)
-
-        self.assertEqual(out.shape, [])
-
-    def test_searchsorted(self):
-        # has no backward
-        x = paddle.to_tensor([1, 3, 5, 7, 9])
-        y = paddle.rand([])
-
-        out = paddle.searchsorted(x, y)
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.numpy(), 0)
-
-    def test_transpose(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.transpose(x, [])
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out, x)
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(x.grad, 1.0)
-
-        with self.assertRaises(ValueError):
-            x = paddle.transpose(x, [0])
-
-    def test_moveaxis(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.moveaxis(x, [], [])
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out, x)
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(x.grad, 1.0)
-
-        with self.assertRaises(AssertionError):
-            x = paddle.moveaxis(x, [1], [0])
-
-    def test_gather_1D(self):
-        x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False)
-        index = paddle.full([], 2, 'int64')
-        out = paddle.gather(x, index)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.numpy(), 5)
-        self.assertEqual(x.grad.shape, [5])
-        self.assertEqual(out.grad.shape, [])
-
-    def test_gather_xD_axis_0(self):
-        x = paddle.to_tensor(
-            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False
-        )
-        index = paddle.full([], 1, 'int64')
-        out = paddle.gather(x, index)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [3])
-        np.testing.assert_array_equal(out.numpy(), x.numpy()[1, :])
-        self.assertEqual(x.grad.shape, [2, 3])
-        self.assertEqual(out.grad.shape, [3])
-
-    def test_gather_xD_axis_1(self):
-        x = paddle.to_tensor(
-            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False
-        )
-        index = paddle.full([], 1, 'int64')
-        out = paddle.gather(x, index, axis=1)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [2])
-        np.testing.assert_array_equal(out.numpy(), [2.0, 5.0])
-        self.assertEqual(x.grad.shape, [2, 3])
-        self.assertEqual(out.grad.shape, [2])
-
-    def test_gather_nd(self):
-        x1 = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False)
-        x2 = paddle.to_tensor(
-            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False
-        )
-
-        index1 = paddle.full([1], 1, 'int64')
-        index2 = paddle.full([2], 1, 'int64')
-
-        out1 = paddle.gather_nd(x1, index1)
-        out2 = paddle.gather_nd(x2, index2)
-
-        out1.retain_grads()
-        out2.retain_grads()
-
-        out1.backward()
-        out2.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out2.shape, [])
-        np.testing.assert_array_equal(out1, np.array(3.0))
-        np.testing.assert_array_equal(out2, np.array(5.0))
-        self.assertEqual(x1.grad.shape, [5])
-        self.assertEqual(x2.grad.shape, [2, 3])
-        self.assertEqual(out1.grad.shape, [])
-        self.assertEqual(out2.grad.shape, [])
-
-    def test_einsum(self):
-        os.environ['FLAGS_new_einsum'] = "0"
-        x = paddle.rand([5])
-        # sum
-        out1 = paddle.einsum('i->', x)
-        expect1 = np.einsum('i->', x)
-        # dot
-        out2 = paddle.einsum('i,i->', x, x)
-        expect2 = np.einsum('i,i->', x, x)
-
-        out1.retain_grads()
-        out2.retain_grads()
-
-        out1.backward()
-        out2.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out2.shape, [])
-        np.testing.assert_allclose(out1, expect1, rtol=1e-03)
-        np.testing.assert_allclose(out2, expect2, rtol=1e-03)
-
-    def test_einsum_V2(self):
-        os.environ['FLAGS_new_einsum'] = "1"
-        x = paddle.rand([5])
-        # sum
-        out1 = paddle.einsum('i->', x)
-        expect1 = np.einsum('i->', x)
-        # dot
-        out2 = paddle.einsum('i,i->', x, x)
-        expect2 = np.einsum('i,i->', x, x)
-
-        out1.retain_grads()
-        out2.retain_grads()
-
-        out1.backward()
-        out2.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out2.shape, [])
-        np.testing.assert_allclose(out1, expect1, rtol=1e-03)
-        np.testing.assert_allclose(out2, expect2, rtol=1e-03)
-
-    def test_scatter_1D(self):
-        x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0], stop_gradient=False)
-        index = paddle.full([], 2, 'int64')
-        updates = paddle.full([], 4.0)
-        out = paddle.scatter(x, index, updates)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [5])
-        self.assertEqual(out.numpy()[2], 4)
-        self.assertEqual(out.grad.shape, [5])
-
-    def test_scatter_XD(self):
-        x = paddle.to_tensor(
-            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], stop_gradient=False
-        )
-        index = paddle.full([], 1, 'int64')
-        updates = paddle.to_tensor([1.0, 2.0, 3.0])
-        out = paddle.scatter(x, index, updates)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [2, 3])
-        np.testing.assert_array_equal(out.numpy()[1], [1.0, 2.0, 3.0])
-        self.assertEqual(out.grad.shape, [2, 3])
-
-    def test_scatter_shape_check(self):
-        x = paddle.to_tensor([1.0, 2.0, 3.0])
-        index = paddle.to_tensor(1)
-        updates = paddle.to_tensor([3.0])
-        with self.assertRaises(ValueError):
-            out = paddle.scatter(x, index, updates)
-
-        x = paddle.to_tensor([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
-        index = paddle.to_tensor(1)
-        updates = paddle.to_tensor([[5.0, 5.0]])
-        with self.assertRaises(ValueError):
-            out = paddle.scatter(x, index, updates)
-
-    def test_scatter_0D_index(self):
-        x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
-        index = paddle.to_tensor(1)
-        updates = paddle.to_tensor(3.0)
-        out = paddle.scatter(x, index, updates)
-        out.backward()
-        np.testing.assert_array_equal(x.grad.numpy()[1], 0.0)
-
-        x = paddle.to_tensor(
-            [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]], stop_gradient=False
-        )
-        index = paddle.to_tensor(1)
-        updates = paddle.to_tensor([5.0, 5.0])
-        out = paddle.scatter(x, index, updates)
-        out.backward()
-        np.testing.assert_array_equal(x.grad.numpy()[1], [0.0, 0.0])
-
-    def test_diagflat(self):
-        x1 = paddle.rand([])
-        x2 = paddle.rand([])
-        x3 = paddle.rand([])
-
-        x1.stop_gradient = False
-        x2.stop_gradient = False
-        x3.stop_gradient = False
-
-        x1.retain_grads()
-        x2.retain_grads()
-        x3.retain_grads()
-
-        out1 = paddle.diagflat(x1, 1)
-        out2 = paddle.diagflat(x2, -1)
-        out3 = paddle.diagflat(x3, 0)
-
-        out1.retain_grads()
-        out2.retain_grads()
-        out3.retain_grads()
-
-        out1.backward()
-        out2.backward()
-        out3.backward()
-
-        self.assertEqual(out1.shape, [2, 2])
-        self.assertEqual(out2.shape, [2, 2])
-        self.assertEqual(out3.shape, [1, 1])
-
-        self.assertEqual(out1.grad.shape, [2, 2])
-        self.assertEqual(out2.grad.shape, [2, 2])
-        self.assertEqual(out3.grad.shape, [1, 1])
-
-        self.assertEqual(x1.grad.shape, [])
-        self.assertEqual(x2.grad.shape, [])
-        self.assertEqual(x3.grad.shape, [])
-
-    def test_scatter__1D(self):
-        x = paddle.to_tensor([1.0, 3.0, 5.0, 7.0, 9.0])
-        index = paddle.full([], 2, 'int64')
-        updates = paddle.full([], 4.0)
-        out = paddle.scatter_(x, index, updates)
-
-        self.assertEqual(out.numpy()[2], 4)
-
-    def test_scatter__XD(self):
-        x = paddle.to_tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
-        index = paddle.full([], 1, 'int64')
-        updates = paddle.to_tensor([1.0, 2.0, 3.0])
-        out = paddle.scatter_(x, index, updates)
-        np.testing.assert_array_equal(out.numpy()[1], [1.0, 2.0, 3.0])
-
-    def test_scatter_nd(self):
-        index = paddle.to_tensor([3], dtype="int64")
-        updates = paddle.full([], 2, dtype='float32')
-        updates.retain_grads()
-        updates.stop_gradient = False
-
-        out = paddle.scatter_nd(index, updates, [5])
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [5])
-        self.assertEqual(out.numpy()[3], 2)
-        self.assertEqual(out.grad.shape, [5])
-        self.assertEqual(updates.grad.shape, [])
-
-    def test_flatten(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-
-        start_axis = 0
-        stop_axis = -1
-
-        out = paddle.flatten(x, start_axis=start_axis, stop_axis=stop_axis)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [1])
-        self.assertEqual(out.grad.shape, [1])
-        self.assertEqual(x.grad.shape, [])
-
-    def test_histogram(self):
-        x = paddle.rand([])
-        out = paddle.histogram(x, bins=5, min=1, max=5)
-        self.assertEqual(out.shape, [5])
-
-    def test_scale(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.scale(x, scale=2.0, bias=1.0)
-
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x.grad.shape, [])
-
-    def test_scale_(self):
-        x = paddle.rand([])
-        out = x.scale_(scale=2.0, bias=1.0)
-        self.assertEqual(out.shape, [])
-
-    def test_floor_divide(self):
-        # 1-d // 0-d
-        x = paddle.to_tensor([1, -2, 3], dtype="int64")
-        y = paddle.full([], 2, dtype='int64')
-        out1_1 = paddle.floor_divide(x, y)
-        out1_2 = paddle.Tensor.__floordiv__(x, y)
-
-        np.testing.assert_array_equal(out1_1.numpy(), out1_2.numpy())
-        np.testing.assert_array_equal(out1_1.numpy(), np.asarray([0, -1, 1]))
-
-        # 0-d // 1-d
-        out2_1 = paddle.floor_divide(y, x)
-        out2_2 = paddle.Tensor.__floordiv__(y, x)
-
-        np.testing.assert_array_equal(out2_1.numpy(), out2_2.numpy())
-        np.testing.assert_array_equal(out2_2.numpy(), np.asarray([2, -1, 0]))
-
-        # 0-d // 0-d
-        x = paddle.full([], 3, dtype='int64')
-        out3_1 = paddle.floor_divide(x, y)
-        out3_2 = paddle.Tensor.__floordiv__(x, y)
-
-        np.testing.assert_array_equal(out3_1.numpy(), out3_2.numpy())
-        np.testing.assert_array_equal(out3_2.numpy(), np.asarray(1))
-
-    def test_cumsum(self):
-        x1 = paddle.rand([])
-        x1.stop_gradient = False
-
-        out1 = paddle.cumsum(x1)
-        out2 = paddle.cumsum(x1, axis=0)
-        out3 = paddle.cumsum(x1, axis=-1)
-
-        out1.retain_grads()
-        out2.retain_grads()
-        out3.retain_grads()
-
-        out1.backward()
-        out2.backward()
-        out3.backward()
-
-        self.assertEqual(x1.grad.shape, [])
-        self.assertTrue(x1.grad.numpy() == 3)
-        self.assertEqual(out1.shape, [1])
-        self.assertEqual(out1.grad.shape, [1])
-        self.assertTrue(out1.grad.numpy() == 1)
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(out2.grad.shape, [])
-        self.assertTrue(out2.grad.numpy() == 1)
-        self.assertEqual(out3.shape, [])
-        self.assertEqual(out3.grad.shape, [])
-        self.assertTrue(out3.grad.numpy() == 1)
-
-    def test_logcumsumexp(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-
-        out1 = paddle.logcumsumexp(x)
-        out2 = paddle.logcumsumexp(x, axis=0)
-        out3 = paddle.logcumsumexp(x, axis=-1)
-
-        out1.backward()
-        out2.backward()
-        out3.backward()
-
-        self.assertEqual(out1.shape, [1])
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(out3.shape, [])
-
-        self.assertEqual(x.grad.shape, [])
-        self.assertTrue(x.grad.numpy() == 3)
-
-    def test_add_n(self):
-        x1 = paddle.rand([])
-        x1.stop_gradient = False
-        x2 = paddle.rand([])
-        x2.stop_gradient = False
-        x3 = paddle.rand([])
-        x3.stop_gradient = False
-
-        out1 = paddle.add_n(x1)
-        out2 = paddle.add_n([x2, x3])
-
-        out1.retain_grads()
-        out2.retain_grads()
-
-        out1.backward()
-        out2.backward()
-
-        self.assertEqual(x1.grad.shape, [])
-        self.assertTrue(x1.grad.numpy() == 1)
-        self.assertEqual(x2.grad.shape, [])
-        self.assertTrue(x2.grad.numpy() == 1)
-        self.assertEqual(x3.grad.shape, [])
-        self.assertTrue(x3.grad.numpy() == 1)
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out1.grad.shape, [])
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(out2.grad.shape, [])
-
-    def test_reshape_list(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.reshape(x, [])
-
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-
-        out = paddle.reshape(x, [1])
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(out.shape, [1])
-        self.assertEqual(out.grad.shape, [1])
-
-        out = paddle.reshape(x, [-1])
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(out.shape, [1])
-        self.assertEqual(out.grad.shape, [1])
-
-        out = paddle.reshape(x, [-1, 1])
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(out.shape, [1, 1])
-        self.assertEqual(out.grad.shape, [1, 1])
-
-    def test_reshape_tensor(self):
-        x = paddle.rand([1, 1])
-        x.stop_gradient = False
-        out = paddle.reshape(x, [])
-
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(x.grad.shape, [1, 1])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-
-        new_shape = paddle.to_tensor([1, 1, 1], "int32")
-        out = paddle.reshape(x, new_shape)
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(x.grad.shape, [1, 1])
-        self.assertEqual(out.shape, [1, 1, 1])
-        self.assertEqual(out.grad.shape, [1, 1, 1])
-
-        new_shape = paddle.to_tensor([-1], "int32")
-        out = paddle.reshape(x, new_shape)
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(x.grad.shape, [1, 1])
-        self.assertEqual(out.shape, [1])
-        self.assertEqual(out.grad.shape, [1])
-
-        new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")]
-        out = paddle.reshape(x, new_shape)
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(x.grad.shape, [1, 1])
-        self.assertEqual(out.shape, [1, 1])
-        self.assertEqual(out.grad.shape, [1, 1])
-
-    def test_reshape__list(self):
-        x = paddle.rand([])
-        out = paddle.reshape_(x, [])
-        self.assertEqual(out.shape, [])
-
-        out = paddle.reshape_(x, [1])
-        self.assertEqual(out.shape, [1])
-
-        out = paddle.reshape_(x, [-1])
-        self.assertEqual(out.shape, [1])
-
-        out = paddle.reshape_(x, [-1, 1])
-        self.assertEqual(out.shape, [1, 1])
-
-    def test_reshape__tensor(self):
-        x = paddle.rand([1, 1])
-        out = paddle.reshape_(x, [])
-        self.assertEqual(out.shape, [])
-
-        new_shape = paddle.full([1], 1, "int32")
-        out = paddle.reshape_(x, new_shape)
-        self.assertEqual(out.shape, [1])
-
-        new_shape = paddle.full([1], -1, "int32")
-        out = paddle.reshape_(x, new_shape)
-        self.assertEqual(out.shape, [1])
-
-        new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")]
-        out = paddle.reshape_(x, new_shape)
-        self.assertEqual(out.shape, [1, 1])
-
-    def test_reverse(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.reverse(x, axis=[])
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(x.shape, [])
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-
-    def test_sort(self):
-        x1 = paddle.rand([])
-        x2 = paddle.rand([])
-        x1.stop_gradient = False
-        x2.stop_gradient = False
-        x1.retain_grads()
-        x2.retain_grads()
-        out1 = paddle.sort(x1, axis=-1)
-        out2 = paddle.sort(x2, axis=0)
-
-        out1.retain_grads()
-        out2.retain_grads()
-
-        out1.backward()
-        out2.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(out1.numpy(), x1.numpy())
-        self.assertEqual(out2.numpy(), x2.numpy())
-        self.assertEqual(out1.grad.shape, [])
-        self.assertEqual(out2.grad.shape, [])
-        self.assertEqual(x1.grad.shape, [])
-        self.assertEqual(x2.grad.shape, [])
-        self.assertEqual(x1.grad.numpy(), 1)
-        self.assertEqual(x2.grad.numpy(), 1)
-
-    def test_argsort(self):
-        x1 = paddle.rand([])
-        x2 = paddle.rand([])
-        x1.stop_gradient = False
-        x2.stop_gradient = False
-        x1.retain_grads()
-        x2.retain_grads()
-
-        out1 = paddle.argsort(x1, axis=-1)
-        out2 = paddle.argsort(x2, axis=0)
-
-        out1.retain_grads()
-        out2.retain_grads()
-
-        out1.backward()
-        out2.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(out1.numpy(), 0)
-        self.assertEqual(out2.numpy(), 0)
-        self.assertEqual(out1.grad.shape, [])
-        self.assertEqual(out2.grad.shape, [])
-        self.assertEqual(x1.grad.shape, [])
-        self.assertEqual(x2.grad.shape, [])
-        self.assertEqual(x1.grad.numpy(), 0)
-        self.assertEqual(x2.grad.numpy(), 0)
-
-    def test_lerp(self):
-        # 0D + 0D, weight is a float scalar
-        x = paddle.rand([])
-        y = paddle.rand([])
-        x.stop_gradient = False
-        y.stop_gradient = False
-        out = paddle.lerp(x, y, 0.5)
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(y.grad.shape, [])
-
-        # 0D + 0D, weight is 0D
-        x0 = paddle.rand([])
-        y0 = paddle.rand([])
-        w0 = paddle.rand([])
-        x0.stop_gradient = False
-        y0.stop_gradient = False
-        y0.retain_grads()
-
-        out0 = paddle.lerp(x0, y0, w0)
-        out0.backward()
-
-        self.assertEqual(out0.shape, [])
-        self.assertEqual(x0.grad.shape, [])
-        self.assertEqual(y0.grad.shape, [])
-
-        # 0D + ND
-        x1 = paddle.rand([])
-        y1 = paddle.rand([64, 64])
-        w1 = paddle.rand([])
-        x1.stop_gradient = False
-        y1.stop_gradient = False
-        x1.retain_grads()
-        y1.retain_grads()
-
-        out1 = paddle.lerp(x1, y1, w1)
-        out1.backward()
-
-        self.assertEqual(out1.shape, [64, 64])
-        self.assertEqual(x1.grad.shape, [])
-        self.assertEqual(y1.grad.shape, [64, 64])
-
-        # ND + 0D
-        x2 = paddle.rand([64, 64])
-        y2 = paddle.rand([])
-        w2 = paddle.rand([])
-        x2.stop_gradient = False
-        y2.stop_gradient = False
-        x2.retain_grads()
-        y2.retain_grads()
-
-        out2 = paddle.lerp(x2, y2, w2)
-        out2.backward()
-
-        self.assertEqual(out2.shape, [64, 64])
-        self.assertEqual(x2.grad.shape, [64, 64])
-        self.assertEqual(y2.grad.shape, [])
-
-    def test_repeat_interleave(self):
-        places = ['cpu']
-        if paddle.is_compiled_with_cuda():
-            places.append('gpu')
-        for place in places:
-            paddle.set_device(place)
-
-            x = paddle.randn(())
-            x.stop_gradient = False
-
-            out = paddle.repeat_interleave(x, 2, None)
-            out.backward()
-
-            # check shape of output
-            self.assertEqual(out.shape, [2])
-
-            # check grad shape
-            self.assertEqual(x.grad.shape, [])
-
-            repeats = paddle.to_tensor([3], dtype='int32')
-            out = paddle.repeat_interleave(x, repeats, None)
-
-            # check shape of output with 1D repeats
-            self.assertEqual(out.shape, [3])
-
-            # check grad shape with 1D repeats
-            self.assertEqual(x.grad.shape, [])
-
-    def test_allclose(self):
-        # 1) x is 0D
-        x = paddle.full([], 0.5)
-        y = paddle.full([], 0.6)
-        out = paddle.allclose(x, y)
-        self.assertEqual(out.shape, [])
-        self.assertFalse(out)
-
-        # 2) x is ND
-        x = paddle.full([2, 3], 0.5)
-        y = paddle.full([2, 3], 0.6)
-        out = paddle.allclose(x, y)
-        self.assertEqual(out.shape, [])
-        self.assertFalse(out)
-
-    def test_equal_all(self):
-        # 1) x is 0D
-        x = paddle.full([], 0.5)
-        y = paddle.full([], 0.6)
-        out = paddle.equal_all(x, y)
-        self.assertEqual(out.shape, [])
-        self.assertFalse(out)
-
-        # 2) x is ND
-        x = paddle.full([2, 3], 0.5)
-        y = paddle.full([2, 3], 0.6)
-        out = paddle.equal_all(x, y)
-        self.assertEqual(out.shape, [])
-        self.assertFalse(out)
-
-    def test_where(self):
-        x1 = paddle.full([], 1)
-        x2 = paddle.full([], 2)
-        x1.stop_gradient = False
-        x2.stop_gradient = False
-        x1.retain_grads()
-        x2.retain_grads()
-        out = paddle.where(x1 > x2, x1, x2)
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.numpy(), 2)
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x1.grad.shape, [])
-        self.assertEqual(x2.grad.shape, [])
-        self.assertEqual(x1.grad.numpy(), 0)
-        self.assertEqual(x2.grad.numpy(), 1)
-
-    def test_atan2(self):
-        x1 = paddle.full([], 0)
-        x2 = paddle.full([], 2)
-        x1.retain_grads()
-        x2.retain_grads()
-        x1.stop_gradient = False
-        x2.stop_gradient = False
-        out = paddle.atan2(x1, x2)
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.numpy(), 0)
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x1.grad.shape, [])
-        self.assertEqual(x2.grad.shape, [])
-        self.assertEqual(x1.grad.numpy(), 0.5)
-        self.assertEqual(x2.grad.numpy(), 0)
-
-    def test_interpolate(self):
-        from paddle.nn.functional import interpolate
-
-        input_x = paddle.rand([2, 3, 6, 6])
-        input_x.stop_gradient = False
-        origin_result = interpolate(
-            x=input_x, size=[12, 12], mode="bilinear", align_corners=False
-        )
-
-        output_size = [
-            paddle.full([], 12, dtype="int32"),
-            paddle.full([], 12, dtype="int32"),
-        ]
-        out1 = interpolate(
-            x=input_x, size=output_size, mode="bilinear", align_corners=False
-        )
-        out1.backward()
-
-        self.assertEqual(out1.shape, [2, 3, 12, 12])
-        self.assertEqual(input_x.grad.shape, [2, 3, 6, 6])
-
-        scale_1 = [paddle.full([], 2), paddle.full([], 2)]
-        out2 = interpolate(
-            x=input_x,
-            scale_factor=scale_1,
-            mode="bilinear",
-            align_corners=False,
-        )
-        out2.backward()
-
-        self.assertEqual(out2.shape, [2, 3, 12, 12])
-        self.assertEqual(input_x.grad.shape, [2, 3, 6, 6])
-
-        scale_2 = paddle.full([], 2)
-        out3 = interpolate(
-            x=input_x,
-            scale_factor=scale_2,
-            mode="bilinear",
-            align_corners=False,
-        )
-        out3.backward()
-
-        # for coverage
-        scale_3 = paddle.full([1], 2)
-        input_3d = paddle.rand([2, 3, 6])
-        out4 = interpolate(
-            x=input_3d,
-            scale_factor=scale_3,
-            mode="LINEAR",
-            align_corners=False,
-            data_format="NCW",
-        )
-
-        self.assertEqual(out3.shape, [2, 3, 12, 12])
-        self.assertEqual(input_x.grad.shape, [2, 3, 6, 6])
-
-        np.testing.assert_allclose(
-            origin_result.numpy(), out1.numpy(), rtol=1e-05
-        )
-        np.testing.assert_allclose(
-            origin_result.numpy(), out2.numpy(), rtol=1e-05
-        )
-        np.testing.assert_allclose(
-            origin_result.numpy(), out3.numpy(), rtol=1e-05
-        )
-
-    def test_upsample(self):
-        from paddle.nn.functional import upsample
-
-        input_x = paddle.rand([2, 3, 6, 6])
-        input_x.stop_gradient = False
-
-        output_size = [
-            paddle.full([], 12, dtype="int32"),
-            paddle.full([], 12, dtype="int32"),
-        ]
-        out1 = upsample(
-            x=input_x, size=output_size, mode="bilinear", align_corners=False
-        )
-        out1.backward()
-
-        self.assertEqual(out1.shape, [2, 3, 12, 12])
-        self.assertEqual(input_x.grad.shape, [2, 3, 6, 6])
-
-    def test_unstack(self):
-        x1 = paddle.full([1], 0)
-        x2 = paddle.full([2], 2)
-        x1.retain_grads()
-        x2.retain_grads()
-        x1.stop_gradient = False
-        x2.stop_gradient = False
-
-        [out1] = paddle.unstack(x1, 0)
-        out1.retain_grads()
-        out1.backward()
-        [out2_1, out2_2] = paddle.unstack(x2, 0)
-        out2 = paddle.add_n([out2_1, out2_2])
-        out2.retain_grads()
-        out2.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out1.numpy(), 0)
-
-        self.assertEqual(out2_1.shape, [])
-        self.assertEqual(out2_1.numpy(), 2)
-        self.assertEqual(out2_2.shape, [])
-        self.assertEqual(out2_2.numpy(), 2)
-        self.assertEqual(x2.grad.shape, [2])
-
-    def test_unbind(self):
-        x1 = paddle.full([1], 0)
-        x2 = paddle.full([2], 2)
-        x1.retain_grads()
-        x2.retain_grads()
-        x1.stop_gradient = False
-        x2.stop_gradient = False
-
-        [out1] = paddle.unbind(x1, 0)
-        out1.retain_grads()
-        out1.backward()
-        [out2_1, out2_2] = paddle.unbind(x2, 0)
-        out2 = paddle.add_n([out2_1, out2_2])
-        out2.retain_grads()
-        out2.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out1.numpy(), 0)
-
-        self.assertEqual(out2_1.shape, [])
-        self.assertEqual(out2_1.numpy(), 2)
-        self.assertEqual(out2_2.shape, [])
-        self.assertEqual(out2_2.numpy(), 2)
-        self.assertEqual(x2.grad.shape, [2])
-
-    def test_masked_select(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        mask = paddle.full([], True, dtype='bool')
-        y = paddle.masked_select(x, mask)
-
-        y.retain_grads()
-        y.backward()
-        self.assertEqual(y.shape, [1])
-        self.assertEqual(y.numpy(), x.numpy())
-        self.assertEqual(y.grad.shape, [1])
-        self.assertEqual(x.grad.shape, [])
-        self.assertEqual(x.grad.numpy(), 1)
-
-    def test_squeeze(self):
-        x1 = paddle.full([], 2)
-        x1.stop_gradient = False
-        x1.retain_grads()
-        out1 = paddle.squeeze(x1, axis=0)
-        out1.retain_grads()
-        out1.backward()
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(x1.grad.shape, [])
-
-        x2 = paddle.full([], 3)
-        x3 = paddle.full([1], 0, dtype='int32')
-        x2.stop_gradient = False
-        x2.retain_grads()
-        out2 = paddle.squeeze(x2, axis=x3)
-        out2.retain_grads()
-        out2.backward()
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(x2.grad.shape, [])
-
-    def test_unsqueeze(self):
-        x1 = paddle.full([], 2)
-        x1.stop_gradient = False
-        x1.retain_grads()
-        out1 = paddle.unsqueeze(x1, axis=0)
-        out1.retain_grads()
-        out1.backward()
-        self.assertEqual(out1.shape, [1])
-        self.assertEqual(x1.grad.shape, [])
-
-        x2 = paddle.full([], 0, dtype='int32')
-        out2 = paddle.unsqueeze(x1, axis=x2)
-        out2.retain_grads()
-        out2.backward()
-        self.assertEqual(out2.shape, [1])
-        self.assertEqual(x1.grad.shape, [])
-
-    def test_t(self):
-        x = paddle.full([], 2.0)
-        x.stop_gradient = False
-        x.retain_grads()
-        out = paddle.t(x)
-        out.retain_grads()
-        out.backward()
-        self.assertEqual(out.shape, [])
-        self.assertEqual(out.grad.shape, [])
-        self.assertEqual(x.grad.shape, [])
-
-    def test_prelu(self):
-        x1 = paddle.full([], 1.0, 'float32')
-        x1.stop_gradient = False
-        w1 = paddle.full([], 0.25, dtype='float32')
-        out1 = paddle.nn.functional.prelu(x1, w1)
-        out1.retain_grads()
-        out1.backward()
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out1.numpy(), 1.0)
-        self.assertEqual(out1.grad.shape, [])
-        self.assertEqual(x1.grad.shape, [])
-        self.assertEqual(x1.grad.numpy(), 1.0)
-
-        x2 = paddle.full([], -1.0, 'float32')
-        x2.stop_gradient = False
-        w2 = paddle.full([], 0.25, dtype='float32')
-        out2 = paddle.nn.functional.prelu(x2, w2)
-        out2.retain_grads()
-        out2.backward()
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(out2.numpy(), -0.25)
-        self.assertEqual(out2.grad.shape, [])
-        self.assertEqual(x2.grad.shape, [])
-        self.assertEqual(x2.grad.numpy(), 0.25)
-
-    @test_with_pir_api
-    def test_while_loop(self):
-        def cond(i, x):
-            return paddle.less_than(i, eleven)
-
-        def body(i, x):
-            x = x + i
-            i = i + 1
-            return [i, x]
-
-        i = paddle.full([], 1.0, dtype='float32')
-        i.stop_gradient = False
-        i.persistable = True
-        eleven = paddle.full([], 11, dtype='float32')
-        x = paddle.full([], 0.0, dtype='float32')
-        x.stop_gradient = False
-        x.persistable = True
-        out_i, out_x = paddle.static.nn.while_loop(cond, body, [i, x])
-
-        if in_dynamic_mode():
-            out_x.backward()
-            di = i.grad
-            dx = x.grad
-        else:
-            grad_list = paddle.static.append_backward(out_x)
-            for p, g in grad_list:
-                if p.is_same(i):
-                    di = g
-                elif p.is_same(x):
-                    dx = g
-            place = (
-                base.CUDAPlace(0)
-                if core.is_compiled_with_cuda()
-                else base.CPUPlace()
-            )
-            exe = base.Executor(place)
-            main_program = paddle.static.default_main_program()
-            out_i, out_x, di, dx = exe.run(
-                main_program, feed={}, fetch_list=[out_i, out_x, di, dx]
-            )
-
-        self.assertEqual(np.asarray(out_i).shape, ())
-        np.testing.assert_allclose(out_i, np.array(11))
-        self.assertEqual(np.asarray(out_x).shape, ())
-        np.testing.assert_allclose(out_x, np.array(55))
-        self.assertEqual(np.asarray(di).shape, ())
-        np.testing.assert_allclose(di, np.array(10))
-        self.assertEqual(np.asarray(dx).shape, ())
-        np.testing.assert_allclose(dx, np.array(1.0))
-
-    def test_to_tensor(self):
-        out1 = paddle.to_tensor(1)
-        out2 = paddle.to_tensor(2.5)
-
-        out1.retain_grads()
-        out1.backward()
-        out2.retain_grads()
-        out2.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out1, 1)
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(out2, 2.5)
-
-    def test_matmul(self):
-        # 1) no transpose
-        x = paddle.randn([10])
-        x.stop_gradient = False
-        y = paddle.randn([10])
-        y.stop_gradient = False
-        out1 = paddle.matmul(x, y)
-        out1.retain_grads()
-        out1.backward()
-
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(x.grad.shape, [10])
-        self.assertEqual(y.grad.shape, [10])
-
-        # 2) transpose x and y
-        x = paddle.randn([10])
-        x.stop_gradient = False
-        y = paddle.randn([10])
-        y.stop_gradient = False
-        out2 = paddle.matmul(x, y, True, True)
-        out2.retain_grads()
-        out2.backward()
-
-        self.assertEqual(out2.shape, [])
-        self.assertEqual(x.grad.shape, [10])
-        self.assertEqual(y.grad.shape, [10])
-
-    def test_linalg_slogdet(self):
-        # 2-D input
-        x = paddle.randn([3, 3])
-        x.stop_gradient = False
-        out = paddle.linalg.slogdet(x)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [2])
-        self.assertEqual(x.grad.shape, [3, 3])
-
-        # 3-D input
-        x1 = paddle.randn([3, 3, 3])
-        x1.stop_gradient = False
-        out1 = paddle.linalg.slogdet(x1)
-        out1.retain_grads()
-        out1.backward()
-
-        self.assertEqual(out1.shape, [2, 3])
-        self.assertEqual(x1.grad.shape, [3, 3, 3])
-
-    def test_multi_dot(self):
-        a = paddle.randn([4])
-        a.stop_gradient = False
-        b = paddle.randn([4, 5])
-        b.stop_gradient = False
-        c = paddle.randn([5])
-        c.stop_gradient = False
-
-        out = paddle.linalg.multi_dot([a, b, c])
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(a.grad.shape, [4])
-        self.assertEqual(b.grad.shape, [4, 5])
-        self.assertEqual(c.grad.shape, [5])
-
-    def test_cov(self):
-        xt = paddle.randn((3, 4))
-        xt.stop_gradient = False
-        xt_1 = paddle.randn((12,))
-        xt_1.stop_gradient = False
-
-        xt_out = paddle.linalg.cov(xt)
-        xt_out.retain_grads()
-        xt_out.backward()
-        self.assertEqual(xt_out.shape, [3, 3])
-        self.assertEqual(xt.grad.shape, [3, 4])
-
-        xt_1_out = paddle.linalg.cov(xt_1)
-        xt_1.retain_grads()
-        xt_1_out.backward()
-        self.assertEqual(xt_1_out.shape, [])
-        self.assertEqual(xt_1.grad.shape, [12])
-
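-    # An illustrative sketch only (it assumes the 1-D reduction rule that
-    # test_cov above and test_corrcoef below exercise): cov/corrcoef treat
-    # a 1-D input as one variable with N observations, so the result
-    # degenerates from an (M, M) matrix to a 0-D scalar estimate.
-    #
-    #   x = paddle.randn((12,))          # one variable, 12 observations
-    #   paddle.linalg.cov(x).shape       # [] -- 0-D, as test_cov checks
-    #   paddle.linalg.corrcoef(x).shape  # [] -- 0-D, as test_corrcoef checks
-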
-    def test_corrcoef(self):
-        x = paddle.randn((12,))
-        x.stop_gradient = False
-        out = paddle.linalg.corrcoef(x)
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        self.assertEqual(x.grad.shape, [12])
-
-    def test_det(self):
-        xt = paddle.randn([3, 3, 3])
-        xt.stop_gradient = False
-        xt_1 = paddle.randn([3, 3])
-        xt_1.stop_gradient = False
-
-        xt_out = paddle.linalg.det(xt)
-        xt.retain_grads()
-        xt_out.backward()
-        self.assertEqual(xt_out.shape, [3])
-        self.assertEqual(xt.grad.shape, [3, 3, 3])
-
-        xt_1_out = paddle.linalg.det(xt_1)
-        xt_1.retain_grads()
-        xt_1_out.backward()
-        self.assertEqual(xt_1_out.shape, [])
-        self.assertEqual(xt_1.grad.shape, [3, 3])
-
-    def test_dist(self):
-        x = paddle.to_tensor([[3, 3], [3, 3]], dtype="float32")
-        y = paddle.to_tensor([[3, 3], [3, 1]], dtype="float32")
-        x.stop_gradient = False
-        y.stop_gradient = False
-        out = paddle.dist(x, y, 0)
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        np.testing.assert_allclose(out, np.array(1))
-        self.assertEqual(x.grad.shape, [2, 2])
-        self.assertEqual(y.grad.shape, [2, 2])
-
-    def test_linalg_norm(self):
-        # 1D input, p = fro, axis = None, using reduceInferMeta
-        x_1 = paddle.arange(24, dtype="float32") - 12
-        x_1.stop_gradient = False
-        out_1 = paddle.linalg.norm(x_1)
-        out_1.retain_grads()
-        out_1.backward()
-
-        self.assertEqual(out_1.shape, [])
-        self.assertEqual(x_1.grad.shape, [24])
-
-        # 1D input, p = 1, axis = None,
-        # using p_norm, as_vector = True
-        x_2 = paddle.arange(24, dtype="float32") - 12
-        x_2.stop_gradient = False
-        out_2 = paddle.linalg.norm(x_2, p=1)
-        out_2.retain_grads()
-        out_2.backward()
-
-        self.assertEqual(out_2.shape, [])
-        self.assertEqual(x_2.grad.shape, [24])
-
-        # 1D input, p = 1, axis = 0,
-        # using p_norm, as_vector = False
-        x_2_p = paddle.arange(24, dtype="float32") - 12
-        x_2_p.stop_gradient = False
-        out_2_p = paddle.linalg.norm(x_2_p, p=1, axis=0)
-        out_2_p.retain_grads()
-        out_2_p.backward()
-
-        self.assertEqual(out_2_p.shape, [])
-        self.assertEqual(x_2_p.grad.shape, [24])
-
-        # 1D input, p = fro, axis = 0,
-        # using p_norm, as_vector = False
-        x_2_fro = paddle.arange(24, dtype="float32") - 12
-        x_2_fro.stop_gradient = False
-        out_2_fro = paddle.linalg.norm(x_2_fro, p="fro", axis=0)
-        out_2_fro.retain_grads()
-        out_2_fro.backward()
-
-        self.assertEqual(out_2_fro.shape, [])
-        self.assertEqual(x_2_fro.grad.shape, [24])
-
-        # 2D input, p = 1, axis = [0, 1]
-        # using p_matrix_norm, depends on paddle.sum
-        x_3 = paddle.arange(24, dtype="float32").reshape([4, 6])
-        x_3.stop_gradient = False
-        out_3 = paddle.linalg.norm(x_3, p=1, axis=[0, 1])
-        out_3.retain_grads()
-        out_3.backward()
-        self.assertEqual(out_3.shape, [])
-        self.assertEqual(x_3.grad.shape, [4, 6])
-
-        # 2D input, p = fro, axis = None
-        # using p_matrix_norm, depends on paddle.sum
-        x_4 = paddle.arange(24, dtype="float32").reshape([4, 6])
-        x_4.stop_gradient = False
-        out_4 = paddle.linalg.norm(x_4)
-        out_4.retain_grads()
-        out_4.backward()
-        self.assertEqual(out_4.shape, [])
-        self.assertEqual(x_4.grad.shape, [4, 6])
-
-        # 2D input, p = 2, axis = [0, 1]
-        # using p_matrix_norm, depends on paddle.sum
-        x_5 = paddle.arange(24, dtype="float32").reshape([4, 6])
-        x_5.stop_gradient = False
-        out_5 = paddle.linalg.norm(x_5, p=2, axis=[0, 1])
-        out_5.retain_grads()
-        out_5.backward()
-
-        self.assertEqual(out_5.shape, [])
-        self.assertEqual(x_5.grad.shape, [4, 6])
-
-        # 2D input, p = -inf, axis = [0, 1]
-        x_6 = paddle.arange(24, dtype="float32").reshape([4, 6])
-        x_6.stop_gradient = False
-        out_6 = paddle.linalg.norm(x_6, p=-float("inf"), axis=[0, 1])
-        out_6.retain_grads()
-        out_6.backward()
-
-        self.assertEqual(out_6.shape, [])
-        self.assertEqual(x_6.grad.shape, [4, 6])
-
-    def test_linalg_cond(self):
-        def assert_shape(out):
-            self.assertEqual(out.shape, [])
-
-        x1 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
-        x1.stop_gradient = False
-        # p = 2 : use paddle.sum
-        out = paddle.linalg.cond(x1)
-        out.backward()
-        assert_shape(out)
-        self.assertEqual(x1.grad.shape, [3, 3])
-
-        # p = fro : use paddle.sum
-        x2 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
-        x2.stop_gradient = False
-        out_fro = paddle.linalg.cond(x2, p='fro')
-        out_fro.backward()
-        assert_shape(out_fro)
-        self.assertEqual(x2.grad.shape, [3, 3])
-
-        # p = nuc : use paddle.sum
-        x3 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
-        x3.stop_gradient = False
-        out_nuc = paddle.linalg.cond(x3, p='nuc')
-        out_nuc.backward()
-        assert_shape(out_nuc)
-        self.assertEqual(x3.grad.shape, [3, 3])
-
-        # p in (-1, 1) : use paddle.sum
-        x4 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
-        x4.stop_gradient = False
-        out_1 = paddle.linalg.cond(x4, p=1)
-        out_1.backward()
-        assert_shape(out_1)
-        self.assertEqual(x4.grad.shape, [3, 3])
-
-        x5 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
-        x5.stop_gradient = False
-        out_minus_1 = paddle.linalg.cond(x5, p=-1)
-        out_minus_1.backward()
-        assert_shape(out_minus_1)
-        self.assertEqual(x5.grad.shape, [3, 3])
-
-        # p in (-2, 2) : depends on paddle.sum
-        x6 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
-        x6.stop_gradient = False
-        out_2 = paddle.linalg.cond(x6, p=2)
-        out_2.backward()
-        assert_shape(out_2)
-        self.assertEqual(x6.grad.shape, [3, 3])
-
-        # p in (-inf, inf) : use paddle.sum
-        x8 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]])
-        x8.stop_gradient = False
-        out_inf = paddle.linalg.cond(x8, p=float("inf"))
-        out_inf.backward()
-        assert_shape(out_inf)
-        self.assertEqual(x8.grad.shape, [3, 3])
-
-        # batched input
-        a = paddle.randn([2, 4, 4])
-        a.stop_gradient = False
-        a_cond_fro = paddle.linalg.cond(a, p='fro')
-        a_cond_fro.backward()
-        self.assertEqual(len(a_cond_fro.shape), 1)
-        self.assertEqual(a.grad.shape, [2, 4, 4])
-
-    def test_trace(self):
-        x = paddle.to_tensor([[3, 2], [1, 9]], dtype="float32")
-        x.stop_gradient = False
-        out = paddle.trace(x)
-        out.backward()
-
-        self.assertEqual(out.shape, [])
-        np.testing.assert_allclose(out, np.array(12))
-        self.assertEqual(x.grad.shape, [2, 2])
-
-
-class TestSundryAPIStatic(unittest.TestCase):
-    def setUp(self):
-        paddle.enable_static()
-        self.exe = paddle.static.Executor()
-
-    def assertShapeEqual(self, out, target_tuple):
-        if not paddle.framework.in_pir_mode():
-            out_shape = list(out.shape)
-        else:
-            out_shape = out.shape
-        self.assertEqual(out_shape, target_tuple)
-
-    @test_with_pir_api
-    @prog_scope()
-    def test_polygamma(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out = paddle.polygamma(x, 2)
-        grad_list = paddle.static.append_backward(out, parameter_list=[x])
-        x_grad = grad_list[0][1]
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[out, x_grad])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, ())
-
-    @test_with_pir_api
-    @prog_scope()
-    def test_frexp(self):
-        x = paddle.rand([])
-        x.stop_gradient = False
-        out1, out2 = paddle.frexp(x)
-        grad_list = paddle.static.append_backward(out1, parameter_list=[x])
-        x_grad = grad_list[0][1]
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[out1, out2, x_grad])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, ())
-        self.assertEqual(res[2].shape, ())
-
-    @test_with_pir_api
-    @prog_scope()
-    def test_pairwise_distance(self):
-        x = paddle.rand([5])
-        x.stop_gradient = False
-        y = paddle.rand([5])
-        y.stop_gradient = False
-
-        out = paddle.nn.functional.pairwise_distance(x, y)
-        grad_list = paddle.static.append_backward(out, parameter_list=[x, y])
-        x_grad, y_grad = (_grad for _param, _grad in grad_list)
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[out, x_grad, y_grad])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (5,))
-        self.assertEqual(res[2].shape, (5,))
-
-    @test_with_pir_api
-    @prog_scope()
-    def test_take(self):
-        x1 = paddle.rand([4, 5])
-        x1.stop_gradient = False
-        out1 = paddle.take(x1, paddle.to_tensor(2))
-        x1_grad = paddle.static.append_backward(out1, parameter_list=[x1])
-        x1_grad = x1_grad[0][1]
-
-        x2 = paddle.rand([])
-        x2.stop_gradient = False
-        out2 = paddle.take(x2, paddle.to_tensor(0))
-        x2_grad = paddle.static.append_backward(out2, parameter_list=[x2])
-        x2_grad = x2_grad[0][1]
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[out1, x1_grad, out2, x2_grad])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (4, 5))
-        self.assertEqual(res[2].shape, ())
-        self.assertEqual(res[3].shape, ())
-        np.testing.assert_allclose(res[3], 1.0)
-
-    @test_with_pir_api
-    @prog_scope()
-    def test_trapezoid(self):
-        y = paddle.rand([5])
-        y.stop_gradient = False
-        out = paddle.trapezoid(y, dx=2.0)
-        grad_list = paddle.static.append_backward(out, parameter_list=[y])
-        y_grad = grad_list[0][1]
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[out, y_grad])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (5,))
-
-    @prog_scope()
-    def test_create_parameter_var(self):
-        zero_dim_param = paddle.create_parameter(shape=[], dtype='float32')
-        self.assertShapeEqual(zero_dim_param, [])
-        prog = paddle.static.default_startup_program()
-        res = self.exe.run(prog, fetch_list=[zero_dim_param])
-        self.assertEqual(res[0].shape, ())
-
-        zero_dim_var = paddle.static.create_global_var(
-            shape=[], value=0.5, dtype='float32'
-        )
-        self.assertEqual(zero_dim_var.shape, ())
-        prog = paddle.static.default_startup_program()
-        res = self.exe.run(prog, fetch_list=[zero_dim_var])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[0], 0.5)
-
-    @prog_scope()
-    def test_getitem(self):
-        # case1: When all axes have a scalar index, the output should be
-        # a 0-D Tensor
-        x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))
-        x.stop_gradient = False
-        out = x[1, 2, 3, 4]
-        grad_list = paddle.static.append_backward(
-            out.sum(), parameter_list=[x, out]
-        )
-        x_out_grad = [_grad for _param, _grad in grad_list]
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[out] + x_out_grad)
-
-        self.assertEqual(res[0].shape, ())
-        np.testing.assert_allclose(res[0], np.array(119))
-        self.assertEqual(res[2].shape, ())
-        np.testing.assert_allclose(res[2], 1.0)
-        self.assertEqual(res[1].shape, (2, 3, 4, 5))
-        x_grad_expected = np.zeros((2, 3, 4, 5))
-        x_grad_expected[1, 2, 3, 4] = 1.0
-        np.testing.assert_allclose(res[1], x_grad_expected)
-
-        # case2: When one axis has a 0-D Tensor index, the output should be
-        # the same as with an int index.
-        x2 = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))
-        out1 = x2[1, 2]
-        out2 = x2[
-            paddle.full([], 1, dtype='int32'), paddle.full([], 2, dtype='int32')
-        ]
-        res = self.exe.run(prog, fetch_list=[out1, out2])
-        np.testing.assert_allclose(res[0], res[1])
-
-        # case3: When all axes have a scalar index (i.e. case1) plus some None
-        # indices, the ndim of the output should equal the number of Nones.
-        x3 = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))
-        out3 = x3[1, 2, None, 3, 4]
-        out4 = x3[1, None, 2, None, 3, 4]
-        res = self.exe.run(prog, fetch_list=[out3, out4])
-        self.assertEqual(res[0].shape, (1,))
-        np.testing.assert_allclose(res[0], np.array([119]))
-        self.assertEqual(res[1].shape, (1, 1))
-        np.testing.assert_allclose(res[1], np.array([[119]]))
-
-        # case4: a 1-D Tensor will be treated as a vector; no axis decrease
-        # will happen.
-        x4 = paddle.ones((2, 3, 4))
-        indice = paddle.ones([1], dtype='int32')
-        out5 = x4[indice]
-        out6 = x4[indice, indice]
-        res = self.exe.run(prog, fetch_list=[out5, out6])
-
-        self.assertEqual(res[0].shape, (1, 3, 4))
-        np.testing.assert_allclose(res[0], np.ones((1, 3, 4)))
-        self.assertEqual(res[1].shape, (1, 4))
-        np.testing.assert_allclose(res[1], np.ones((1, 4)))
-
-    @prog_scope()
-    def test_setitem(self):
-        # NOTE(zoooo0820): __setitem__ has a gradient problem in static graph.
-        # To solve this, we may stop supporting __setitem__ in static graph.
-        # These unit tests will be deleted soon.
-
-        # case1: all axes have a scalar index
-        x = paddle.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))
-        x.stop_gradient = False
-        out = x * 2
-        out = paddle.static.setitem(out, (1, 2, 3, 4), 10)
-        paddle.static.append_backward(out.sum())
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[out, x.grad_name])
-
-        self.assertEqual(out.shape, x.shape)
-        np.testing.assert_allclose(res[0][1, 2, 3, 4], np.array(10))
-        self.assertEqual(res[1].shape, (2, 3, 4, 5))
-        x_grad_expected = np.ones((2, 3, 4, 5)) * 2
-        x_grad_expected[1, 2, 3, 4] = 0
-        np.testing.assert_allclose(res[1], x_grad_expected)
-
-        # case2: 0-D Tensor index in some axis
-        # NOTE(zoooo0820): Now, int/slice with a 0-D Tensor will still be
-        # treated as combined indexing, which does not support backward.
-        # There should be more test cases such as out[1, indice, :] = 0.5
-        # when this problem is fixed.
- x = paddle.randn((2, 3, 4, 5))
- x.stop_gradient = False
- indice = paddle.full([], 1, dtype='int32')
- out = x * 1
- out = paddle.static.setitem(out, (indice, indice), 0.5)
- paddle.static.append_backward(out.sum())
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[out, x.grad_name])
-
- self.assertEqual(out.shape, x.shape)
- np.testing.assert_allclose(res[0][1, 1], np.ones((4, 5)) * 0.5)
- x_grad_expected = np.ones((2, 3, 4, 5))
- x_grad_expected[1, 1] = 0
- np.testing.assert_allclose(res[1], x_grad_expected)
-
- # case3: 0-D Tensor index on some axis, the value is a Tensor,
- # and broadcasting occurs
- x = paddle.randn((2, 3, 4, 5))
- x.stop_gradient = False
- v = paddle.ones((4, 5), dtype='float32') * 5
- v.stop_gradient = False
- indice = paddle.full([], 1, dtype='int32')
- out = x * 1
- out = paddle.static.setitem(out, indice, v)
- paddle.static.append_backward(out.sum())
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[out, x.grad_name, v.grad_name])
-
- self.assertEqual(out.shape, x.shape)
- np.testing.assert_allclose(res[0][1], np.ones((3, 4, 5)) * 5)
- x_grad_expected = np.ones((2, 3, 4, 5))
- x_grad_expected[1] = 0
- np.testing.assert_allclose(res[1], x_grad_expected)
-
- @test_with_pir_api
- @prog_scope()
- def test_expand(self):
- x = paddle.full([], 1, 'float32')
- x.stop_gradient = False
- out = paddle.expand(x, shape=[1])
- grad_list = paddle.static.append_backward(
- out.sum(), parameter_list=[x, out]
- )
- grad_list = [_grad for _param, _grad in grad_list]
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[x, out] + grad_list)
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[0], 1.0)
- self.assertEqual(res[1].shape, (1,))
- self.assertEqual(res[1], 1.0)
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[2], 1.0)
- self.assertEqual(res[3].shape, (1,))
- self.assertEqual(res[3], 1.0)
-
- x1 = paddle.full([], 1, 'float32')
- x1.stop_gradient = False
- out1 = paddle.expand(x1, shape=[])
- grad_list = paddle.static.append_backward(
- out1.sum(), parameter_list=[x1, out1]
- )
- grad_list = [_grad for _param, _grad in grad_list]
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[x1, out1] + grad_list)
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[0], 1.0)
- self.assertEqual(res[1].shape, ())
- self.assertEqual(res[1], 1.0)
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[2], 1.0)
- self.assertEqual(res[3].shape, ())
- self.assertEqual(res[3], 1.0)
-
- x2 = paddle.full([], 1, 'float32')
- x2.stop_gradient = False
- out2 = paddle.expand(x2, shape=[3, 3])
- grad_list = paddle.static.append_backward(
- out2.sum(), parameter_list=[x2, out2]
- )
- grad_list = [_grad for _param, _grad in grad_list]
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[x2, out2] + grad_list)
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[0], 1.0)
- self.assertEqual(res[1].shape, (3, 3))
- self.assertEqual(res[1].any(), 1.0)
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[2], 9)
- self.assertEqual(res[3].shape, (3, 3))
- self.assertEqual(res[3].any(), 1.0)
-
- @test_with_pir_api
- @prog_scope()
- def test_expand_as(self):
- x = paddle.full([], 1, 'float32')
- x.stop_gradient = False
- y = paddle.full([], 1, 'float32')
- y.stop_gradient = False
- out = paddle.expand_as(x, y)
- grad_list = paddle.static.append_backward(
- out.sum(), parameter_list=[x, out]
- )
- grad_list
= [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x, out] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], 1.0) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[1], 1.0) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[2], 1.0) - self.assertEqual(res[3].shape, ()) - self.assertEqual(res[3], 1.0) - - x1 = paddle.full([], 1, 'float32') - x1.stop_gradient = False - y1 = paddle.full([1], 1, 'float32') - y1.stop_gradient = False - out1 = paddle.expand_as(x1, y1) - grad_list = paddle.static.append_backward( - out1.sum(), parameter_list=[x1, out1] - ) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x1, out1] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], 1.0) - self.assertEqual(res[1].shape, (1,)) - self.assertEqual(res[1], 1.0) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[2], 1.0) - self.assertEqual(res[3].shape, (1,)) - self.assertEqual(res[3], 1.0) - - x2 = paddle.full([], 1, 'float32') - x2.stop_gradient = False - y2 = paddle.full([3, 3], 1, 'float32') - y2.stop_gradient = False - out2 = paddle.expand_as(x2, y2) - grad_list = paddle.static.append_backward( - out2.sum(), parameter_list=[x2, out2] - ) - grad_list = [_grad for _param, _grad in grad_list] - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x2, out2] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], 1.0) - self.assertEqual(res[1].shape, (3, 3)) - self.assertEqual(res[1].any(), 1.0) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[2], 9) - self.assertEqual(res[3].shape, (3, 3)) - self.assertEqual(res[3].any(), 1.0) - - @test_with_pir_api - @prog_scope() - def test_top_k(self): - x = paddle.full([], 1, 'float32') - x.stop_gradient = False - out, indices = paddle.topk(x, k=1, axis=0) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - grad_list = [_grad for _param, _grad in grad_list] - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x, out, indices] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], 1.0) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[1], 1.0) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[2], 0.0) - self.assertEqual(res[3].shape, ()) - self.assertEqual(res[3], 1.0) - self.assertEqual(res[4].shape, ()) - self.assertEqual(res[4], 1.0) - - x1 = paddle.full([], 1, 'float32') - x1.stop_gradient = False - out1, indices1 = paddle.topk(x1, k=1, axis=-1) - grad_list = paddle.static.append_backward( - out1.sum(), parameter_list=[x1, out1] - ) - grad_list = [_grad for _param, _grad in grad_list] - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x1, out1, indices1] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], 1.0) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[1], 1.0) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[2], 0.0) - self.assertEqual(res[3].shape, ()) - self.assertEqual(res[3], 1.0) - self.assertEqual(res[4].shape, ()) - self.assertEqual(res[4], 1.0) - - with self.assertRaises(ValueError): - tmp = paddle.topk(x1, k=1, axis=2) - - @test_with_pir_api - @prog_scope() - def test_broadcast_to(self): - x = paddle.full([], 1, 'float32') - x.stop_gradient = False - out = 
paddle.broadcast_to(x, shape=[1])
- grad_list = paddle.static.append_backward(
- out.sum(), parameter_list=[x, out]
- )
- grad_list = [_grad for _param, _grad in grad_list]
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[x, out] + grad_list)
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[0], 1.0)
- self.assertEqual(res[1].shape, (1,))
- self.assertEqual(res[1], 1.0)
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[2], 1.0)
- self.assertEqual(res[3].shape, (1,))
- self.assertEqual(res[3], 1.0)
-
- x1 = paddle.full([], 1, 'float32')
- x1.stop_gradient = False
- out1 = paddle.broadcast_to(x1, shape=[])
- grad_list = paddle.static.append_backward(
- out1.sum(), parameter_list=[x1, out1]
- )
- grad_list = [_grad for _param, _grad in grad_list]
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[x1, out1] + grad_list)
-
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[0], 1.0)
- self.assertEqual(res[1].shape, ())
- self.assertEqual(res[1], 1.0)
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[2], 1.0)
- self.assertEqual(res[3].shape, ())
- self.assertEqual(res[3], 1.0)
-
- @test_with_pir_api
- @prog_scope()
- def test_argmin(self):
- # 1) x is 0D
- x = paddle.rand([])
- out1 = paddle.argmin(x, 0)
- out2 = paddle.argmin(x, -1)
- out3 = paddle.argmin(x, None)
-
- # 2) x is ND
- x4 = paddle.rand([3, 5])
- out4 = paddle.argmin(x4, None)
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(
- prog,
- fetch_list=[
- out1,
- out2,
- out3,
- out4,
- ],
- )
- self.assertEqual(res[0].shape, ())
- np.testing.assert_allclose(res[0], 0.0)
- self.assertEqual(res[1].shape, ())
- np.testing.assert_allclose(res[1], 0.0)
- self.assertEqual(res[2].shape, ())
- np.testing.assert_allclose(res[2], 0.0)
- self.assertEqual(res[3].shape, ())
-
- @test_with_pir_api
- @prog_scope()
- def test_argmax(self):
- # 1) x is 0D
- x = paddle.rand([])
- out1 = paddle.argmax(x, 0)
- out2 = paddle.argmax(x, -1)
- out3 = paddle.argmax(x, None)
-
- # 2) x is ND
- x4 = paddle.rand([3, 5])
- out4 = paddle.argmax(x4, None)
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(
- prog,
- fetch_list=[
- out1,
- out2,
- out3,
- out4,
- ],
- )
- self.assertEqual(res[0].shape, ())
- np.testing.assert_allclose(res[0], 0.0)
- self.assertEqual(res[1].shape, ())
- np.testing.assert_allclose(res[1], 0.0)
- self.assertEqual(res[2].shape, ())
- np.testing.assert_allclose(res[2], 0.0)
- self.assertEqual(res[3].shape, ())
-
- @test_with_pir_api
- @prog_scope()
- def test_kthvalue(self):
- # 1) x is 0D
- x = paddle.rand([])
- x.stop_gradient = False
- out, index = paddle.kthvalue(x, 1)
- grad_list = paddle.static.append_backward(out, parameter_list=[x])
- grad_list = [_grad for _param, _grad in grad_list]
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[x, out, index] + grad_list)
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[1].shape, ())
- self.assertTrue(res[1] == res[0])
- self.assertEqual(res[2].shape, ())
- self.assertTrue(res[2] == 0)
-
- self.assertEqual(res[3].shape, ())
- self.assertTrue(res[3] == 1.0)
-
- # 2) x is 1D
- x1 = paddle.rand([5])
- x1.stop_gradient = False
- out1, index1 = paddle.kthvalue(x1, 1)
- grad_list = paddle.static.append_backward(out1, parameter_list=[x1])
- grad_list = [_grad for _param, _grad in grad_list]
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[out1, index1] + grad_list)
-
self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, (5,)) - - @test_with_pir_api - @prog_scope() - def test_mode(self): - # 1) x is 0D - x = paddle.rand([]) - x.stop_gradient = False - out, index = paddle.mode(x) - grad_list = paddle.static.append_backward(out, parameter_list=[x]) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, index] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertTrue(res[2] == 1.0) - - # 2) x is 1D - x1 = paddle.rand([5]) - x1.stop_gradient = False - out1, index1 = paddle.mode(x1) - grad_list = paddle.static.append_backward(out1, parameter_list=[x1]) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out1, index1] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, (5,)) - - @test_with_pir_api - @prog_scope() - def test_is_empty(self): - # 1) x is 0D - x1 = paddle.rand([]) - out1 = paddle.is_empty(x1) - - # 2) x is 1D - x2 = paddle.rand([5]) - out2 = paddle.is_empty(x2) - - # 3) x is ND - x3 = paddle.rand([3, 5]) - out3 = paddle.is_empty(x3) - - x4 = paddle.rand([3, 0, 5]) - out4 = paddle.is_empty(x4) - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[out1, out2, out3, out4], - ) - - self.assertEqual(res[0].shape, ()) - self.assertFalse(bool(res[0])) - self.assertEqual(res[1].shape, ()) - self.assertFalse(bool(res[1])) - self.assertEqual(res[2].shape, ()) - self.assertFalse(bool(res[2])) - self.assertEqual(res[3].shape, ()) - self.assertTrue(bool(res[3])) - - @test_with_pir_api - @prog_scope() - def test_as_complex(self): - x = paddle.rand([2]) - x.stop_gradient = False - out = paddle.as_complex(x) - self.assertShapeEqual( - x, - [ - 2, - ], - ) - self.assertShapeEqual(out, []) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[x, out] + grad_list, - ) - - self.assertEqual(res[0].shape, (2,)) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, (2,)) - self.assertEqual(res[3].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_dot(self): - # 1) x is 1d - x = paddle.rand([2]) - x.stop_gradient = False - y = paddle.rand([2]) - y.stop_gradient = False - out = paddle.dot(x, y) - - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - x_grad = grad_list[0][1] - out_grad = grad_list[1][1] - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[x, x_grad, out, out_grad], - ) - - self.assertEqual(res[0].shape, (2,)) - self.assertEqual(res[1].shape, (2,)) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - - # 2) x is 2D - x1 = paddle.rand([2, 2]) - x1.stop_gradient = False - y1 = paddle.rand([2, 2]) - y1.stop_gradient = False - out1 = paddle.dot(x1, y1) - - grad_list = paddle.static.append_backward( - out1.sum(), parameter_list=[x1, out1] - ) - x1_grad = grad_list[0][1] - out1_grad = grad_list[1][1] - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[x1, x1_grad, out1, out1_grad], - ) - - 
self.assertEqual(res[0].shape, (2, 2)) - self.assertEqual(res[1].shape, (2, 2)) - self.assertEqual(res[2].shape, (2,)) - self.assertEqual(res[3].shape, (2,)) - - @test_with_pir_api - @prog_scope() - def test_inner(self): - # 1) input is 1D - x1 = paddle.rand([2]) - x1.stop_gradient = False - y1 = paddle.rand([2]) - y1.stop_gradient = False - out1 = paddle.inner(x1, y1) - grad_list = paddle.static.append_backward( - out1.sum(), parameter_list=[x1, out1] - ) - x1_grad = grad_list[0][1] - out1_grad = grad_list[1][1] - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - x1, - x1_grad, - out1, - out1_grad, - ], - ) - self.assertEqual(res[0].shape, (2,)) - self.assertEqual(res[1].shape, (2,)) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - - # 2) input is 2D - x = paddle.rand([2, 3]) - x.stop_gradient = False - y = paddle.rand([2, 3]) - y.stop_gradient = False - out = paddle.inner(x, y) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - x_grad = grad_list[0][1] - out_grad = grad_list[1][1] - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - x, - x_grad, - out, - out_grad, - ], - ) - - self.assertEqual(res[0].shape, (2, 3)) - self.assertEqual(res[1].shape, (2, 3)) - self.assertEqual(res[2].shape, (2, 2)) - self.assertEqual(res[3].shape, (2, 2)) - - @prog_scope() - def test_tensordot(self): - x = paddle.full(shape=[10], fill_value=0.25, dtype='float64') - x.stop_gradient = False - y = paddle.full(shape=[10], fill_value=0.25, dtype='float64') - y.stop_gradient = False - out = paddle.tensordot(x, y, axes=1) - - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - x_grad = grad_list[0][1] - out_grad = grad_list[1][1] - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[x, x_grad, out, out_grad], - ) - - self.assertEqual(res[0].shape, (10,)) - self.assertEqual(res[1].shape, (10,)) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - - x = paddle.arange(6, dtype='float64').reshape([2, 3]) - y = paddle.arange(6, dtype='float64').reshape([2, 3]) - x.stop_gradient = False - out = paddle.tensordot(x, y, axes=2) - - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - x_grad = grad_list[0][1] - out_grad = grad_list[1][1] - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[x, x_grad, out, out_grad], - ) - - self.assertEqual(res[0].shape, (2, 3)) - self.assertEqual(res[1].shape, (2, 3)) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_metric_accuracy(self): - x = paddle.full(shape=[2, 4], fill_value=0.25) - y = paddle.full(shape=[2, 1], fill_value=1, dtype="int64") - out = paddle.metric.accuracy(input=x, label=y, k=1) - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[out], - ) - - self.assertEqual(res[0].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_static_accuracy(self): - x = paddle.full(shape=[2, 4], fill_value=0.25) - y = paddle.full(shape=[2, 1], fill_value=1, dtype="int64") - out = paddle.static.accuracy(input=x, label=y, k=1) - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[out], - ) - - self.assertEqual(res[0].shape, ()) - - @prog_scope() - def test_static_auc(self): - x = paddle.full(shape=[3, 2], fill_value=0.25) - y 
= paddle.full(shape=[3], fill_value=1, dtype="int64")
- out = paddle.static.auc(input=x, label=y)[0]
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(
- prog,
- fetch_list=[out],
- )
-
- self.assertEqual(res[0].shape, ())
-
- @test_with_pir_api
- @prog_scope()
- def test_std(self):
- x = paddle.rand([])
- x.stop_gradient = False
- out1 = paddle.std(x)
- out2 = paddle.std(x, [])
- grad_list = paddle.static.append_backward(
- out1, parameter_list=[x, out1]
- )
- grad_list = [_grad for _param, _grad in grad_list]
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(
- prog,
- fetch_list=[
- x,
- out1,
- out2,
- ]
- + grad_list,
- )
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[1].shape, ())
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[3].shape, ())
- self.assertEqual(res[4].shape, ())
-
- @test_with_pir_api
- @prog_scope()
- def test_var(self):
- x = paddle.rand([])
- x.stop_gradient = False
- out1 = paddle.var(x)
- out2 = paddle.var(x, [])
- grad_list = paddle.static.append_backward(
- out1, parameter_list=[x, out1]
- )
- grad_list = [_grad for _param, _grad in grad_list]
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(
- prog,
- fetch_list=[
- x,
- out1,
- out2,
- ]
- + grad_list,
- )
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[1].shape, ())
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[3].shape, ())
- self.assertEqual(res[4].shape, ())
-
- @test_with_pir_api
- @prog_scope()
- def test_quantile(self):
- x1 = paddle.rand([])
- x1.stop_gradient = False
- out1 = paddle.quantile(x1, 0.5, axis=None)
- grad_list1 = paddle.static.append_backward(
- out1, parameter_list=[x1, out1]
- )
- grad_list1 = [_grad for _param, _grad in grad_list1]
-
- x2 = paddle.rand([2, 3])
- x2.stop_gradient = False
- out2 = paddle.quantile(x2, 0.5, axis=None)
- grad_list2 = paddle.static.append_backward(
- out2, parameter_list=[x2, out2]
- )
- grad_list2 = [_grad for _param, _grad in grad_list2]
-
- out_empty_list = paddle.quantile(x1, 0.5, axis=[])
- self.assertShapeEqual(out_empty_list, [])
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(
- prog,
- fetch_list=[
- out1,
- out2,
- ]
- + grad_list1
- + grad_list2,
- )
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[1].shape, ())
-
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[3].shape, ())
- self.assertEqual(res[3], 1.0)
-
- self.assertEqual(res[4].shape, (2, 3))
- self.assertEqual(res[5].shape, ())
- self.assertEqual(res[5], 1.0)
-
- @test_with_pir_api
- @prog_scope()
- def test_nanquantile(self):
- # 1) x is 0D
- x1 = paddle.rand([])
- x1.stop_gradient = False
- out1 = paddle.nanquantile(x1, 0.5, axis=None)
- grad_list = paddle.static.append_backward(out1, parameter_list=[x1])
- x1_grad = grad_list[0][1]
-
- # 2) x is ND with 'nan'
- x2 = paddle.to_tensor([[float('nan'), 2.0, 3.0], [0.0, 1.0, 2.0]])
- x2.stop_gradient = False
- out2 = paddle.nanquantile(x2, 0.5, axis=None)
- grad_list = paddle.static.append_backward(out2, parameter_list=[x2])
- x2_grad = grad_list[0][1]
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(
- prog,
- fetch_list=[
- out1,
- x1_grad,
- out2,
- x2_grad,
- ],
- )
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[1].shape, ())
-
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[3].shape, (2, 3))
-
- @test_with_pir_api
- @prog_scope()
- def test_flip(self):
- x = paddle.rand([])
- x.stop_gradient = False
- out = paddle.flip(x,
axis=[]) - grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x, out] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_equal_scalar(self): - x = paddle.rand([]) - out = paddle.equal(x, 2.0) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], False) - - @test_with_pir_api - @prog_scope() - def test_pow_scalar(self): - x = paddle.rand([]) - x.stop_gradient = False - out = paddle.pow(x, 2.0) - grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x, out] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_cast(self): - x = paddle.full([], 1.0, 'float32') - x.stop_gradient = False - out = paddle.cast(x, 'int32') - grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x, out] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_cumprod(self): - x = paddle.full([], 1.0, 'float32') - x.stop_gradient = False - out = paddle.cumprod(x, 0) - grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - - with self.assertRaises(ValueError): - tmp = paddle.cumprod(x, 2) - - @test_with_pir_api - @prog_scope() - def test_clip(self): - x = paddle.uniform([], None, -10, 10) - x.stop_gradient = False - out = paddle.clip(x, -5, 5) - grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) - x_grad, out_grad = (_grad for _param, _grad in grad_list) - - x1 = paddle.uniform([], None, -10, 10) - x1.stop_gradient = False - out1 = paddle.clip(x1, paddle.full([], -5.0), paddle.full([], 5.0)) - grad_list = paddle.static.append_backward( - out1, parameter_list=[x1, out1] - ) - x1_grad, out1_grad = (_grad for _param, _grad in grad_list) - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - x, - out, - x_grad, - out_grad, - x1, - out1, - x1_grad, - out1_grad, - ], - ) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - self.assertEqual(res[4].shape, ()) - self.assertEqual(res[5].shape, ()) - self.assertEqual(res[6].shape, ()) - self.assertEqual(res[7].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_increment(self): - x = paddle.rand([]) - x.stop_gradient = False - out = paddle.increment(x, 1.0) - grad_list = 
paddle.static.append_backward(out, parameter_list=[x, out]) - - prog = paddle.static.default_main_program() - if paddle.framework.in_pir_mode(): - grad_list = [_grad for _param, _grad in grad_list if _grad] - res = self.exe.run(prog, fetch_list=[x, out] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - if len(grad_list) > 0: - self.assertEqual(res[2].shape, ()) - if len(grad_list) > 1: - self.assertEqual(res[3].shape, ()) - else: - res = self.exe.run( - prog, fetch_list=[x, out, x.grad_name, out.grad_name] - ) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_bitwise_not(self): - # have no backward - x = paddle.randint(-1, 1, []) - out = paddle.bitwise_not(x) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x, out]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_logical_not(self): - # have no backward - x = paddle.randint(0, 1, []) - out = paddle.logical_not(x) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x, out]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_searchsorted(self): - # have no backward - x = paddle.full([10], 1.0, 'float32') - y = paddle.full([], 1.0, 'float32') - out = paddle.searchsorted(x, y) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], 0) - - @test_with_pir_api - @prog_scope() - def test_transpose(self): - x = paddle.rand([]) - x.stop_gradient = False - out = paddle.transpose(x, []) - grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[1], 1.0) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[2], 1.0) - - with self.assertRaises(ValueError): - x = paddle.transpose(x, [0]) - - @test_with_pir_api - @prog_scope() - def test_moveaxis(self): - x = paddle.rand([]) - x.stop_gradient = False - out = paddle.moveaxis(x, [], []) - grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[1], 1.0) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[2], 1.0) - - with self.assertRaises(AssertionError): - x = paddle.moveaxis(x, [0], [1]) - - @test_with_pir_api - @prog_scope() - def test_gather_1D(self): - x = paddle.full([10], 1.0, 'float32') - x.stop_gradient = False - index = paddle.full([], 2, 'int64') - out = paddle.gather(x, index) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], 1) - self.assertEqual(res[1].shape, (10,)) - 
self.assertEqual(res[2].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_gather_XD_axis_0(self): - x = paddle.full([2, 3], 1.0, 'float32') - x.stop_gradient = False - index = paddle.full([], 1, 'int64') - out = paddle.gather(x, index) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out] + grad_list) - self.assertEqual(res[0].shape, (3,)) - np.testing.assert_array_equal(res[0], [1.0, 1.0, 1.0]) - self.assertEqual(res[1].shape, (2, 3)) - self.assertEqual(res[2].shape, (3,)) - - @test_with_pir_api - @prog_scope() - def test_gather_XD_axis_1(self): - x = paddle.full([2, 3], 1.0, 'float32') - x.stop_gradient = False - index = paddle.full([], 1, 'int64') - out = paddle.gather(x, index, axis=1) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out] + grad_list) - self.assertEqual(res[0].shape, (2,)) - np.testing.assert_array_equal(res[0], [1.0, 1.0]) - self.assertEqual(res[1].shape, (2, 3)) - self.assertEqual(res[2].shape, (2,)) - - @test_with_pir_api - @prog_scope() - def test_gather_nd(self): - x1 = paddle.full([10], 1.0, 'float32') - x1.stop_gradient = False - x2 = paddle.full([2, 3], 1.0, 'float32') - x2.stop_gradient = False - - index1 = paddle.full([1], 1, 'int64') - index2 = paddle.full([2], 1, 'int64') - - out1 = paddle.gather_nd(x1, index1) - out2 = paddle.gather_nd(x2, index2) - grad_list1 = paddle.static.append_backward( - out1.sum(), parameter_list=[x1, out1] - ) - grad_list2 = paddle.static.append_backward( - out2.sum(), parameter_list=[x2, out2] - ) - - (_, x1_grad), (_, out1_grad) = grad_list1 - (_, x2_grad), (_, out2_grad) = grad_list2 - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - out1, - out2, - x1_grad, - x2_grad, - out1_grad, - out2_grad, - ], - ) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - np.testing.assert_array_equal(res[0], 1.0) - np.testing.assert_array_equal(res[1], 1.0) - self.assertEqual(res[2].shape, (10,)) - self.assertEqual(res[3].shape, (2, 3)) - self.assertEqual(res[4].shape, ()) - self.assertEqual(res[5].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_scatter_1D(self): - x = paddle.full([10], 1.0, 'float32') - x.stop_gradient = False - index = paddle.full([], 2, 'int64') - updates = paddle.full([], 4, 'float32') - out = paddle.scatter(x, index, updates) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out] + grad_list) - self.assertEqual(res[0].shape, (10,)) - self.assertEqual(res[0][2], 4.0) - self.assertEqual(res[1].shape, (10,)) - self.assertEqual(res[2].shape, (10,)) - - @test_with_pir_api - @prog_scope() - def test_scatter_XD(self): - x = paddle.full([2, 3], 1.0, 'float32') - x.stop_gradient = False - index = paddle.full([], 1, 'int64') - updates = paddle.full([3], 4, 'float32') - out = paddle.scatter(x, index, updates) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, 
fetch_list=[out] + grad_list) - self.assertEqual(res[0].shape, (2, 3)) - np.testing.assert_array_equal(res[0][1], [4.0, 4.0, 4.0]) - self.assertEqual(res[1].shape, (2, 3)) - self.assertEqual(res[2].shape, (2, 3)) - - @test_with_pir_api - @prog_scope() - def test_diagflat(self): - # have no backward - x1 = paddle.rand([]) - out1 = paddle.diagflat(x1, 1) - - x2 = paddle.rand([]) - out2 = paddle.diagflat(x2, -1) - - x3 = paddle.rand([]) - out3 = paddle.diagflat(x3) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out1, out2, out3]) - self.assertEqual(res[0].shape, (2, 2)) - self.assertEqual(res[1].shape, (2, 2)) - self.assertEqual(res[2].shape, (1, 1)) - - @test_with_pir_api - @prog_scope() - def test_scatter__1D(self): - x = paddle.full([10], 1.0, 'float32') - index = paddle.full([], 2, 'int64') - updates = paddle.full([], 4, 'float32') - out = paddle.scatter_(x, index, updates) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out]) - self.assertEqual(res[0][2], 4) - - @test_with_pir_api - @prog_scope() - def test_scatter__XD(self): - x = paddle.full([2, 3], 1.0, 'float32') - index = paddle.full([], 1, 'int64') - updates = paddle.full([3], 4, 'float32') - out = paddle.scatter_(x, index, updates) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out]) - np.testing.assert_array_equal(res[0][1], [4.0, 4.0, 4.0]) - - @test_with_pir_api - @prog_scope() - def test_scatter_nd(self): - index = paddle.full([1], 3, dtype='int64') - updates = paddle.full([], 2, 'float32') - updates.stop_gradient = False - out = paddle.scatter_nd(index, updates, [5]) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[out, updates] - ) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out] + grad_list) - self.assertEqual(res[0].shape, (5,)) - self.assertEqual(res[0][3], 2) - self.assertEqual(res[1].shape, (5,)) - self.assertEqual(res[2].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_flatten(self): - x = paddle.full([], 1, 'float32') - x.stop_gradient = False - - start_axis = 0 - stop_axis = -1 - - out = paddle.flatten(x, start_axis=start_axis, stop_axis=stop_axis) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[x, out] - ) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, feed={}, fetch_list=[out] + grad_list) - - self.assertEqual(res[0].shape, (1,)) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, (1,)) - - @test_with_pir_api - @prog_scope() - def test_histogram(self): - x = paddle.full([], 1, 'float32') - out = paddle.histogram(x, bins=5, min=1, max=5) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, feed={}, fetch_list=[out]) - - self.assertEqual(res[0].shape, (5,)) - - @test_with_pir_api - @prog_scope() - def test_scale(self): - x = paddle.rand([]) - x.stop_gradient = False - out = paddle.scale(x, scale=2.0, bias=1.0) - grad_list = paddle.static.append_backward(out, parameter_list=[x, out]) - grad_list = [_grad for _param, _grad in grad_list] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out] + grad_list) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_floor_divide(self): - # 1-d // 
0-d - x = paddle.to_tensor([1, -2, 3], dtype="int64") - y = paddle.full([], 2, dtype='int64') - out1_1 = paddle.floor_divide(x, y) - out1_2 = x // y - - # 0-d // 1-d - out2_1 = paddle.floor_divide(y, x) - out2_2 = y // x - - # 0-d // 0-d - x = paddle.full([], 3, dtype='int64') - out3_1 = paddle.floor_divide(x, y) - out3_2 = x // y - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, fetch_list=[out1_1, out1_2, out2_1, out2_2, out3_1, out3_2] - ) - out1_1, out1_2, out2_1, out2_2, out3_1, out3_2 = res - - np.testing.assert_array_equal(out1_1, out1_2) - np.testing.assert_array_equal(out1_1, np.asarray([0, -1, 1])) - np.testing.assert_array_equal(out2_1, out2_2) - np.testing.assert_array_equal(out2_2, np.asarray([2, -1, 0])) - np.testing.assert_array_equal(out3_1, out3_2) - np.testing.assert_array_equal(out3_2, np.asarray(1)) - - @test_with_pir_api - @prog_scope() - def test_cumsum(self): - x1 = paddle.rand([]) - x1.stop_gradient = False - - out1 = paddle.cumsum(x1) - out2 = paddle.cumsum(x1, axis=0) - out3 = paddle.cumsum(x1, axis=-1) - - (_, x1_grad), (_, out1_grad) = paddle.static.append_backward( - out1.sum(), parameter_list=[x1, out1] - ) - (_, x1_grad), (_, out2_grad) = paddle.static.append_backward( - out2.sum(), parameter_list=[x1, out2] - ) - (_, x1_grad), (_, out3_grad) = paddle.static.append_backward( - out3.sum(), parameter_list=[x1, out3] - ) - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - out1, - out2, - out3, - x1_grad, - out1_grad, - out2_grad, - out3_grad, - ], - ) - self.assertEqual(res[0].shape, (1,)) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - self.assertEqual(res[3], 1.0) - self.assertEqual(res[4].shape, (1,)) - self.assertEqual(res[4], 1.0) - self.assertEqual(res[5].shape, ()) - self.assertEqual(res[5], 1.0) - self.assertEqual(res[6].shape, ()) - self.assertEqual(res[6], 1.0) - self.assertShapeEqual(out2, []) - self.assertShapeEqual(out3, []) - - @test_with_pir_api - @prog_scope() - def test_logcumsumexp(self): - x = paddle.rand([]) - x.stop_gradient = False - - out1 = paddle.logcumsumexp(x) - out2 = paddle.logcumsumexp(x, axis=0) - out3 = paddle.logcumsumexp(x, axis=-1) - - grad_list1 = paddle.static.append_backward(out1, parameter_list=[x]) - grad_list2 = paddle.static.append_backward(out2, parameter_list=[x]) - grad_list3 = paddle.static.append_backward(out3, parameter_list=[x]) - - x_grad = grad_list3[0][1] - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - out1, - out2, - out3, - x_grad, - ], - ) - self.assertEqual(res[0].shape, (1,)) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - self.assertEqual(res[3], 1.0) - - @test_with_pir_api - @prog_scope() - def test_add_n(self): - x1 = paddle.rand([]) - x1.stop_gradient = False - x2 = paddle.rand([]) - x2.stop_gradient = False - x3 = paddle.rand([]) - x3.stop_gradient = False - - out1 = paddle.add_n(x1) - out2 = paddle.add_n([x2, x3]) - - grad_list1 = paddle.static.append_backward( - out1.sum(), parameter_list=[x1, out1] - ) - grad_list23 = paddle.static.append_backward( - out2.sum(), parameter_list=[x2, x3, out2] - ) - - (_, x1_grad), (_, out1_grad) = grad_list1 - (_, x2_grad), (_, x3_grad), (_, out2_grad) = grad_list23 - - prog = paddle.static.default_main_program() - block = prog.global_block() - res = self.exe.run( - prog, - fetch_list=[ - out1, - out2, - x1_grad, - 
x2_grad, - x3_grad, - out1_grad, - out2_grad, - ], - ) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[2], 1) - self.assertEqual(res[3].shape, ()) - self.assertEqual(res[3], 1) - self.assertEqual(res[4].shape, ()) - self.assertEqual(res[4], 1) - self.assertEqual(res[5].shape, ()) - self.assertEqual(res[6].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_reshape_list(self): - x1 = paddle.rand([]) - x2 = paddle.rand([]) - x3 = paddle.rand([]) - x4 = paddle.rand([]) - x1.stop_gradient = False - x2.stop_gradient = False - x3.stop_gradient = False - x4.stop_gradient = False - - out1 = paddle.reshape(x1, []) - grad_list1 = paddle.static.append_backward( - out1.sum(), parameter_list=[x1, out1] - ) - (_, x1_grad), (_, out1_grad) = grad_list1 - - out2 = paddle.reshape(x2, [1]) - grad_list2 = paddle.static.append_backward( - out2.sum(), parameter_list=[x2, out2] - ) - (_, x2_grad), (_, out2_grad) = grad_list2 - - out3 = paddle.reshape(x3, [-1]) - grad_list3 = paddle.static.append_backward( - out3.sum(), parameter_list=[x3, out3] - ) - (_, x3_grad), (_, out3_grad) = grad_list3 - - out4 = paddle.reshape(x4, [-1, 1]) - grad_list4 = paddle.static.append_backward( - out4.sum(), parameter_list=[x4, out4] - ) - (_, x4_grad), (_, out4_grad) = grad_list4 - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - out1, - out2, - out3, - out4, - x1_grad, - x2_grad, - x3_grad, - x4_grad, - out1_grad, - out2_grad, - out3_grad, - out4_grad, - ], - ) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (1,)) - self.assertEqual(res[2].shape, (1,)) - self.assertEqual(res[3].shape, (1, 1)) - - self.assertEqual(res[4].shape, ()) - self.assertEqual(res[5].shape, ()) - self.assertEqual(res[6].shape, ()) - self.assertEqual(res[7].shape, ()) - - self.assertEqual(res[8].shape, ()) - self.assertEqual(res[9].shape, (1,)) - self.assertEqual(res[10].shape, (1,)) - self.assertEqual(res[11].shape, (1, 1)) - - @test_with_pir_api - @prog_scope() - def test_reshape_tensor(self): - x1 = paddle.rand([1, 1]) - x1.stop_gradient = False - new_shape = paddle.full([3], 1, "int32") - out1 = paddle.reshape(x1, new_shape) - grad_list = paddle.static.append_backward( - out1.sum(), parameter_list=[x1, out1] - ) - (_, x1_grad), (_, out1_grad) = grad_list - - x2 = paddle.rand([1, 1]) - x2.stop_gradient = False - new_shape = paddle.full([1], -1, "int32") - out2 = paddle.reshape(x2, new_shape) - grad_list = paddle.static.append_backward( - out2.sum(), parameter_list=[x2, out2] - ) - (_, x2_grad), (_, out2_grad) = grad_list - - x3 = paddle.rand([1, 1]) - x3.stop_gradient = False - new_shape = [paddle.full([], -1, "int32"), paddle.full([], 1, "int32")] - out3 = paddle.reshape(x3, new_shape) - grad_list = paddle.static.append_backward( - out3.sum(), parameter_list=[x3, out3] - ) - (_, x3_grad), (_, out3_grad) = grad_list - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - out1, - out2, - out3, - x1_grad, - x2_grad, - x3_grad, - out1_grad, - out2_grad, - out3_grad, - ], - ) - self.assertEqual(res[0].shape, (1, 1, 1)) - self.assertEqual(res[1].shape, (1,)) - self.assertEqual(res[2].shape, (1, 1)) - - self.assertEqual(res[3].shape, (1, 1)) - self.assertEqual(res[4].shape, (1, 1)) - self.assertEqual(res[5].shape, (1, 1)) - - self.assertEqual(res[6].shape, (1, 1, 1)) - self.assertEqual(res[7].shape, (1,)) - self.assertEqual(res[8].shape, (1, 1)) - - 
@test_with_pir_api
- @prog_scope()
- def test_reverse(self):
- x = paddle.rand([])
- x.stop_gradient = False
-
- out = paddle.reverse(x, axis=[])
- grad_list = paddle.static.append_backward(out, parameter_list=[x, out])
- (_, x_grad), (_, out_grad) = grad_list
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[x, out, x_grad, out_grad])
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[1].shape, ())
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[3].shape, ())
-
- @test_with_pir_api
- @prog_scope()
- def test_sort(self):
- x1 = paddle.rand([])
- x1.stop_gradient = False
- out1 = paddle.sort(x1, axis=-1)
- grad_list = paddle.static.append_backward(
- out1.sum(), parameter_list=[x1, out1]
- )
- (_, x1_grad), (_, out1_grad) = grad_list
-
- x2 = paddle.rand([])
- x2.stop_gradient = False
- out2 = paddle.sort(x2, axis=0)
- grad_list = paddle.static.append_backward(
- out2.sum(), parameter_list=[x2, out2]
- )
- (_, x2_grad), (_, out2_grad) = grad_list
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(
- prog,
- fetch_list=[
- out1,
- out2,
- out1_grad,
- out2_grad,
- x1_grad,
- x2_grad,
- ],
- )
-
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[1].shape, ())
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[3].shape, ())
- self.assertEqual(res[4].shape, ())
- self.assertEqual(res[5].shape, ())
- self.assertEqual(res[4], 1.0)
- self.assertEqual(res[5], 1.0)
-
- @test_with_pir_api
- @prog_scope()
- def test_argsort(self):
- with paddle.static.program_guard(
- paddle.static.Program(), paddle.static.Program()
- ):
- # have no backward
- x1 = paddle.rand([])
- out1 = paddle.argsort(x1, axis=-1)
-
- x2 = paddle.rand([])
- x2.stop_gradient = False
- out2 = paddle.argsort(x2, axis=0)
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[out1, out2])
-
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[1].shape, ())
- self.assertEqual(res[0], 0.0)
- self.assertEqual(res[1], 0.0)
-
- @test_with_pir_api
- @prog_scope()
- def test_lerp(self):
- shapes = [
- [(), (), (), ()],
- [(), (64, 64), (), (64, 64)],
- [(64, 64), (), (), (64, 64)],
- [(64, 64), (), 0.5, (64, 64)],
- ]
- for shape in shapes:
- x = paddle.rand(shape[0])
- y = paddle.rand(shape[1])
- if isinstance(shape[2], float):
- w = shape[2]
- else:
- w = paddle.rand(shape[2])
-
- x.stop_gradient = False
- y.stop_gradient = False
- out = paddle.lerp(x, y, w)
- grad_list = paddle.static.append_backward(
- out.sum(), parameter_list=[out, y, x]
- )
- (_, out_grad), (_, y_grad), (_, x_grad) = grad_list
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[out, out_grad, y_grad, x_grad])
- self.assertEqual(res[0].shape, shape[3])
- self.assertEqual(res[1].shape, shape[3])
- self.assertEqual(res[2].shape, shape[1])
- self.assertEqual(res[3].shape, shape[0])
-
- @test_with_pir_api
- @prog_scope()
- def test_repeat_interleave(self):
- x1 = paddle.full([], 1.0, 'float32')
- x1.stop_gradient = False
- out1 = paddle.repeat_interleave(x1, 2, None)
- grad_list1 = paddle.static.append_backward(
- out1.sum(), parameter_list=[x1, out1]
- )
- (_, x1_grad), (_, out1_grad) = grad_list1
-
- x2 = paddle.full([], 1.0, 'float32')
- x2.stop_gradient = False
- repeats = paddle.to_tensor([3], dtype='int32')
- out2 = paddle.repeat_interleave(x2, repeats, None)
- grad_list2 = paddle.static.append_backward(
- out2.sum(), parameter_list=[x2, out2]
- )
- (_, x2_grad), (_, out2_grad) = grad_list2
-
- prog =
paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - out1, - out2, - x1_grad, - x2_grad, - out1_grad, - out2_grad, - ], - ) - self.assertEqual(res[0].shape, (2,)) - self.assertEqual(res[1].shape, (3,)) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - self.assertEqual(res[4].shape, (2,)) - self.assertEqual(res[5].shape, (3,)) - - @test_with_pir_api - @prog_scope() - def test_allclose(self): - # 1) x is 0D - x = paddle.full([], 0.5) - y = paddle.full([], 0.6) - out = paddle.allclose(x, y) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out]) - self.assertEqual(res[0].shape, ()) - self.assertFalse(res[0]) - - # 2) x is ND - x = paddle.full([2, 3], 0.5) - y = paddle.full([2, 3], 0.6) - out = paddle.allclose(x, y) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out]) - self.assertEqual(res[0].shape, ()) - self.assertFalse(res[0]) - - @test_with_pir_api - @prog_scope() - def test_equal_all(self): - # 1) x is 0D - x = paddle.full([], 0.5) - y = paddle.full([], 0.6) - out = paddle.equal_all(x, y) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out]) - self.assertEqual(res[0].shape, ()) - self.assertFalse(res[0]) - - # 2) x is ND - x = paddle.full([2, 3], 0.5) - y = paddle.full([2, 3], 0.6) - out = paddle.equal_all(x, y) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out]) - self.assertEqual(res[0].shape, ()) - self.assertFalse(res[0]) - - @test_with_pir_api - @prog_scope() - def test_where(self): - x1 = paddle.full([], 1, 'float32') - x2 = paddle.full([], 2, 'float32') - x1.stop_gradient = False - x2.stop_gradient = False - out = paddle.where(x1 > x2, x1, x2) - loss = paddle.mean(out) - grad_list = paddle.static.append_backward( - loss, parameter_list=[out, x1, x2] - ) - (_, out_grad), (_, x1_grad), (_, x2_grad) = grad_list - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - feed={}, - fetch_list=[out, out_grad, x1_grad, x2_grad], - ) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], 2) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[2], 0) - self.assertEqual(res[3].shape, ()) - self.assertEqual(res[3], 1) - - @test_with_pir_api - @prog_scope() - def test_atan2(self): - x1 = paddle.full([], 0, 'float32') - x2 = paddle.full([], 2, 'float32') - x1.stop_gradient = False - x2.stop_gradient = False - out = paddle.atan2(x1, x2) - paddle.static.append_backward(out) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, feed={}, fetch_list=[out]) - - self.assertEqual(res[0].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_interpolate(self): - from paddle.nn.functional import interpolate - - input_x = paddle.rand([2, 3, 6, 6]) - input_x.stop_gradient = False - - output_size = [ - paddle.full([], 12, dtype="int32"), - paddle.full([], 12, dtype="int32"), - ] - - out1 = interpolate( - x=input_x, size=output_size, mode="bilinear", align_corners=False - ) - _, input_x_grad = paddle.static.append_backward( - out1.sum(), parameter_list=[input_x] - )[0] - prog = paddle.static.default_main_program() - res1 = self.exe.run(prog, feed={}, fetch_list=[out1, input_x_grad]) - - scale_1 = paddle.full([], 2) - out2 = interpolate( - x=input_x, - scale_factor=scale_1, - mode="bilinear", - align_corners=False, - ) - _, input_x_grad = paddle.static.append_backward( - out2.sum(), 
parameter_list=[input_x] - )[0] - prog = paddle.static.default_main_program() - res2 = self.exe.run(prog, feed={}, fetch_list=[out2, input_x_grad]) - - self.assertEqual(res1[0].shape, (2, 3, 12, 12)) - self.assertEqual(res1[1].shape, (2, 3, 6, 6)) - self.assertEqual(res2[0].shape, (2, 3, 12, 12)) - self.assertEqual(res2[1].shape, (2, 3, 6, 6)) - - @test_with_pir_api - @prog_scope() - def test_upsample(self): - from paddle.nn.functional import upsample - - input_x = paddle.rand([2, 3, 6, 6]) - input_x.stop_gradient = False - - output_size = [ - paddle.full([], 12, dtype="int32"), - paddle.full([], 12, dtype="int32"), - ] - - out1 = upsample( - x=input_x, size=output_size, mode="bilinear", align_corners=False - ) - _, input_x_grad = paddle.static.append_backward( - out1.sum(), parameter_list=[input_x] - )[0] - prog = paddle.static.default_main_program() - res1 = self.exe.run(prog, feed={}, fetch_list=[out1, input_x_grad]) - - self.assertEqual(res1[0].shape, (2, 3, 12, 12)) - self.assertEqual(res1[1].shape, (2, 3, 6, 6)) - - @test_with_pir_api - @prog_scope() - def test_unstack(self): - x1 = paddle.full([1], 0, 'float32') - x1.stop_gradient = False - out1 = paddle.unstack(x1, 0) - out1 = paddle.add_n(out1) - _, x1_grad = paddle.static.append_backward(out1, parameter_list=[x1])[0] - prog = paddle.static.default_main_program() - res = self.exe.run(prog, feed={}, fetch_list=[out1, x1_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (1,)) - - x2 = paddle.full([2], 2, 'float32') - x2.stop_gradient = False - out2 = paddle.unstack(x2, 0) - out2_sum = paddle.add_n(out2) - _, x2_grad = paddle.static.append_backward( - out2_sum, parameter_list=[x2] - )[0] - prog = paddle.static.default_main_program() - res = self.exe.run(prog, feed={}, fetch_list=[out2_sum, x2_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (2,)) - - @test_with_pir_api - @prog_scope() - def test_unbind(self): - x1 = paddle.full([1], 0, 'float32') - x1.stop_gradient = False - out1 = paddle.unbind(x1, 0) - out1 = paddle.add_n(out1) - _, x1_grad = paddle.static.append_backward(out1, parameter_list=[x1])[0] - prog = paddle.static.default_main_program() - res = self.exe.run(prog, feed={}, fetch_list=[out1, x1_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (1,)) - - x2 = paddle.full([2], 2, 'float32') - x2.stop_gradient = False - out2 = paddle.unbind(x2, 0) - out2_sum = paddle.add_n(out2) - _, x2_grad = paddle.static.append_backward( - out2_sum, parameter_list=[x2] - )[0] - prog = paddle.static.default_main_program() - res = self.exe.run(prog, feed={}, fetch_list=[out2_sum, x2_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (2,)) - - @test_with_pir_api - @prog_scope() - def test_masked_select(self): - x = paddle.rand([]) - x.stop_gradient = False - mask = paddle.full([], True, dtype='bool') - y = paddle.masked_select(x, mask) - grad_list = paddle.static.append_backward( - y.sum(), parameter_list=[y, x] - ) - (_, y_grad), (_, x_grad) = grad_list - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[x, y, y_grad, x_grad]) - self.assertEqual(res[1].shape, (1,)) - self.assertEqual(res[1], res[0]) - self.assertEqual(res[2].shape, (1,)) - self.assertEqual(res[3].shape, ()) - self.assertEqual(res[3], 1) - - @test_with_pir_api - @prog_scope() - def test_squeeze(self): - x1 = paddle.full([], 2) - x1.stop_gradient = False - out1 = paddle.squeeze(x1, axis=0) - _, x1_grad = paddle.static.append_backward( - 
out1.sum(), parameter_list=[x1] - )[0] - - x2 = paddle.full([], 3) - x3 = paddle.full([], 0, dtype='int32') - x2.stop_gradient = False - out2 = paddle.squeeze(x2, axis=x3) - _, x2_grad = paddle.static.append_backward( - out2.sum(), parameter_list=[x2] - )[0] - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - out1, - out2, - x1_grad, - x2_grad, - ], - ) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - - @test_with_pir_api - @prog_scope() - def test_unsqueeze(self): - x1 = paddle.full([], 2) - x1.stop_gradient = False - out1 = paddle.unsqueeze(x1, axis=0) - _, x1_grad = paddle.static.append_backward( - out1.sum(), parameter_list=[x1] - )[0] - - x2 = paddle.full([], 3) - x3 = paddle.full([], 0, dtype='int32') - x2.stop_gradient = False - out2 = paddle.unsqueeze(x2, axis=x3) - _, x2_grad = paddle.static.append_backward( - out2.sum(), parameter_list=[x2] - )[0] - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - fetch_list=[ - out1, - out2, - x1_grad, - x2_grad, - ], - ) - self.assertEqual(res[0].shape, (1,)) - self.assertEqual(res[1].shape, (1,)) - self.assertEqual(res[2].shape, ()) - self.assertEqual(res[3].shape, ()) - - @prog_scope() - def test_t(self): - x = paddle.full([], 2.0) - x.stop_gradient = False - out = paddle.t(x) - grad_list = paddle.static.append_backward(out, parameter_list=[out, x]) - - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, feed={}, fetch_list=[out, out.grad_name, x.grad_name] - ) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[2].shape, ()) - - @prog_scope() - def test_sequence_pad(self): - x = paddle.static.data("x", [-1, 2], dtype=paddle.int64, lod_level=1) - value = paddle.to_tensor(1000, dtype=paddle.int64).squeeze() - out = paddle.static.nn.sequence_pad(x, value) - - x_tensor = paddle.base.create_lod_tensor( - np.arange(20).astype(np.int64).reshape(-1, 2), - [[3, 3, 4]], - place=self.exe.place, - ) - prog = paddle.static.default_main_program() - res = self.exe.run(prog, feed={"x": x_tensor}, fetch_list=[out]) - self.assertEqual(res[0].shape, (3, 4, 2)) - - @prog_scope() - def test_static_data(self): - x1 = paddle.static.data(name="x1", shape=[]) - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - feed={ - "x1": np.array(1.0, dtype='float32'), - }, - fetch_list=[ - x1.name, - ], - ) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], np.array(1.0)) - - x2 = paddle.static.data(name="x2", shape=[]) - x3 = paddle.static.data(name="x3", shape=[]) - y = x2 + x3 - prog = paddle.static.default_main_program() - res = self.exe.run( - prog, - feed={ - "x2": 100.5, - "x3": 200.5, - }, - fetch_list=[ - y.name, - ], - ) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], 301.0) - - @test_with_pir_api - @prog_scope() - def test_prelu(self): - x1 = paddle.full([], 1.0, 'float32') - x1.stop_gradient = False - w1 = paddle.to_tensor([0.25], dtype='float32') - out1 = paddle.nn.functional.prelu(x1, w1) - (_, out1_grad), (_, x1_grad) = paddle.static.append_backward( - out1.sum(), parameter_list=[out1, x1] - ) - - x2 = paddle.full([], 1.0, 'float32') - x2.stop_gradient = False - w2 = paddle.full([], 0.25, dtype='float32') - out2 = paddle.nn.functional.prelu(x2, w2) - (_, out2_grad), (_, x2_grad) = paddle.static.append_backward( - out2.sum(), parameter_list=[out2, x2] - ) - - prog = 
paddle.static.default_main_program()
- res = self.exe.run(
- prog,
- fetch_list=[
- out1,
- out2,
- x1_grad,
- x2_grad,
- out1_grad,
- out2_grad,
- ],
- )
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[1].shape, ())
- self.assertEqual(res[2].shape, ())
- self.assertEqual(res[3].shape, ())
- self.assertEqual(res[4].shape, ())
- self.assertEqual(res[5].shape, ())
-
- @prog_scope()
- def test_static_nn_prelu(self):
- x1 = paddle.full([], 1.0, 'float32')
- x1.stop_gradient = False
- out1 = paddle.static.nn.prelu(x1, 'all')
- grad_list = paddle.static.append_backward(
- out1.sum(), parameter_list=[x1, out1]
- )
- (_, x1_grad), (_, out1_grad) = grad_list
-
- prog = paddle.static.default_main_program()
- self.exe.run(paddle.static.default_startup_program())
- res = self.exe.run(
- prog,
- fetch_list=[
- out1,
- x1_grad,
- out1_grad,
- ],
- )
-
- self.assertEqual(res[0].shape, ())
- self.assertEqual(res[1].shape, ())
- self.assertEqual(res[2].shape, ())
- np.testing.assert_allclose(res[0], np.array(1))
- np.testing.assert_allclose(res[1], np.array(1))
-
- @test_with_pir_api
- @prog_scope()
- def test_while_loop(self):
- def cond(i, x):
- return paddle.less_than(i, eleven)
-
- def body(i, x):
- x = x + i
- i = i + 1
- return [i, x]
-
- main_program = paddle.static.Program()
- with paddle.static.program_guard(main_program, paddle.static.Program()):
- i = paddle.static.data(name='i', shape=[], dtype='float32')
- i.stop_gradient = False
- i.persistable = True
- eleven = paddle.full([], 11, 'float32')
- x = paddle.static.data(name='x', shape=[], dtype='float32')
- x.stop_gradient = False
- x.persistable = True
- out_i, out_x = paddle.static.nn.while_loop(cond, body, [i, x])
- grad_list = paddle.static.append_backward(out_x)
-
- feed = {
- 'i': np.array(1.0, dtype='float32'),
- 'x': np.array(0.0, dtype='float32'),
- }
- if paddle.framework.in_pir_mode():
- fetch_list = [out_i, out_x]
- for _, g in grad_list:
- fetch_list.append(g)
- res = self.exe.run(
- main_program,
- feed=feed,
- fetch_list=fetch_list,
- )
- else:
- res = self.exe.run(
- main_program,
- feed=feed,
- fetch_list=[out_i.name, out_x.name, i.grad_name, x.grad_name],
- )
-
- self.assertEqual(res[0].shape, ())
- np.testing.assert_allclose(res[0], np.array(11))
- self.assertEqual(res[1].shape, ())
- np.testing.assert_allclose(res[1], np.array(55))
- self.assertEqual(res[2].shape, ())
- np.testing.assert_allclose(res[2], np.array(10))
- self.assertEqual(res[3].shape, ())
- np.testing.assert_allclose(res[3], np.array(1.0))
-
- @test_with_pir_api
- @prog_scope()
- def test_numel(self):
- # 1) x is 0D
- x = paddle.full([], 0.5)
- out = paddle.numel(x)
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[out])
- self.assertEqual(res[0].shape, ())
- np.testing.assert_array_equal(res[0], np.array(1))
-
- # 2) x is ND
- x = paddle.full([3, 5], 0.5)
- out = paddle.numel(x)
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[out])
- self.assertEqual(res[0].shape, ())
- np.testing.assert_array_equal(res[0], np.array(15))
-
- @test_with_pir_api
- @prog_scope()
- def test_rank(self):
- # 1) x is 0D
- x = paddle.full([], 0.5)
- out = paddle.rank(x)
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog, fetch_list=[out])
- self.assertEqual(res[0].shape, ())
- np.testing.assert_array_equal(res[0], np.array(0))
-
- # 2) x is ND
- x = paddle.full([3, 5], 0.5)
- out = paddle.rank(x)
-
- prog = paddle.static.default_main_program()
- res = self.exe.run(prog,
fetch_list=[out]) - self.assertEqual(res[0].shape, ()) - np.testing.assert_array_equal(res[0], np.array(2)) - - @test_with_pir_api - @prog_scope() - def test_shape(self): - x = paddle.full([], 0.5) - out = paddle.shape(x) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out]) - np.testing.assert_array_equal(res[0], np.array([])) - self.assertEqual(res[0].shape, (0,)) - - @test_with_pir_api - def test_broadcast_tensors(self): - # 1) x is 0D, y is 0D - x1 = paddle.full([], 2.0) - x1.stop_gradient = False - x2 = paddle.full([], 2.0) - x2.stop_gradient = False - out1, out2 = paddle.broadcast_tensors([x1, x2]) - - self.assertShapeEqual(out1, []) - self.assertShapeEqual(out2, []) - - # 2) x is ND , y is 0D - x1 = paddle.full([2, 3], 2.0) - x1.stop_gradient = False - x2 = paddle.full([], 2.0) - x2.stop_gradient = False - out1, out2 = paddle.broadcast_tensors([x1, x2]) - - self.assertShapeEqual(out1, [2, 3]) - self.assertShapeEqual(out2, [2, 3]) - - # 3) x is 0D , y is ND - x1 = paddle.full([], 2.0) - x1.stop_gradient = False - x2 = paddle.full([2, 3], 2.0) - x2.stop_gradient = False - out1, out2 = paddle.broadcast_tensors([x1, x2]) - - self.assertShapeEqual(out1, [2, 3]) - self.assertShapeEqual(out2, [2, 3]) - - @test_with_pir_api - @prog_scope() - def test_to_tensor(self): - out1 = paddle.to_tensor(1) - out2 = paddle.to_tensor(2.5) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out1, out2]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[0], 1) - self.assertEqual(res[1].shape, ()) - self.assertEqual(res[1], 2.5) - - @test_with_pir_api - @prog_scope() - def test_matmul(self): - # 1) no transpose - x = paddle.randn([10]) - x.stop_gradient = False - y = paddle.randn([10]) - y.stop_gradient = False - out = paddle.matmul(x, y) - grad_list = paddle.static.append_backward(out, parameter_list=[x, y]) - (_, x_grad), (_, y_grad) = grad_list - - self.assertShapeEqual(out, []) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, x_grad, y_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (10,)) - self.assertEqual(res[2].shape, (10,)) - - # 2) transpose x and y - x = paddle.randn([10]) - x.stop_gradient = False - y = paddle.randn([10]) - y.stop_gradient = False - out = paddle.matmul(x, y, True, True) - grad_list = paddle.static.append_backward(out, parameter_list=[x, y]) - (_, x_grad), (_, y_grad) = grad_list - - self.assertShapeEqual(out, []) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, x_grad, y_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (10,)) - self.assertEqual(res[2].shape, (10,)) - - @test_with_pir_api - @prog_scope() - def test_linalg_slogdet(self): - # 2-D input - x = paddle.randn([3, 3]) - x.stop_gradient = False - out = paddle.linalg.slogdet(x) - _, x_grad = paddle.static.append_backward( - out.sum(), parameter_list=[x] - )[0] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, x_grad]) - self.assertEqual(res[0].shape, (2,)) - self.assertEqual(res[1].shape, (3, 3)) - - # 3-D input - x1 = paddle.randn([3, 3, 3]) - x1.stop_gradient = False - out1 = paddle.linalg.slogdet(x1) - _, x1_grad = paddle.static.append_backward( - out1.sum(), parameter_list=[x1] - )[0] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out1, x1_grad]) - self.assertEqual(res[0].shape, (2, 3)) - 
self.assertEqual(res[1].shape, (3, 3, 3)) - - @test_with_pir_api - @prog_scope() - def test_multi_dot(self): - a = paddle.randn([4]) - a.stop_gradient = False - b = paddle.randn([4, 5]) - b.stop_gradient = False - c = paddle.randn([5]) - c.stop_gradient = False - - out = paddle.linalg.multi_dot([a, b, c]) - grad_list = paddle.static.append_backward( - out.sum(), parameter_list=[a, b, c] - ) - (_, a_grad), (_, b_grad), (_, c_grad) = grad_list - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, a_grad, b_grad, c_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (4,)) - self.assertEqual(res[2].shape, (4, 5)) - self.assertEqual(res[3].shape, (5,)) - - @test_with_pir_api - @prog_scope() - def test_cov(self): - xt_1 = paddle.randn((12,)) - xt_1.stop_gradient = False - out = paddle.linalg.cov(xt_1) - _, xt_1_grad = paddle.static.append_backward( - out, parameter_list=[xt_1] - )[0] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, xt_1_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (12,)) - - @test_with_pir_api - @prog_scope() - def test_corrcoef(self): - x = paddle.randn((12,)) - x.stop_gradient = False - out = paddle.linalg.corrcoef(x) - _, x_grad = paddle.static.append_backward(out, parameter_list=[x])[0] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, x_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (12,)) - - @test_with_pir_api - @prog_scope() - def test_det(self): - xt_1 = paddle.randn((3, 3)) - xt_1.stop_gradient = False - - out = paddle.linalg.det(xt_1) - _, xt_1_grad = paddle.static.append_backward( - out.sum(), parameter_list=[xt_1] - )[0] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, xt_1_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (3, 3)) - - @prog_scope() - def test_dist(self): - x = paddle.to_tensor([[3, 3], [3, 3]], dtype="float32") - y = paddle.to_tensor([[3, 3], [3, 1]], dtype="float32") - x.stop_gradient = False - y.stop_gradient = False - out = paddle.dist(x, y) - (_, x_grad), (_, y_grad) = paddle.static.append_backward( - out, parameter_list=[x, y] - ) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, x_grad, y_grad]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (2, 2)) - self.assertEqual(res[1].shape, (2, 2)) - np.testing.assert_array_equal(res[0], np.array(2).astype(np.float32)) - - @prog_scope() - def test_linalg_norm(self): - # 1D input, p = fro ,axis = None, using reduceInferMeta - x_1 = paddle.arange(24, dtype="float32") - 12 - x_1.stop_gradient = False - out_1 = paddle.linalg.norm(x_1) - grad_list = paddle.static.append_backward(out_1, parameter_list=[x_1]) - ((_, x_1_grad),) = grad_list - - prog = paddle.static.default_main_program() - - res = self.exe.run(prog, fetch_list=[out_1, x_1_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (24,)) - - # 1D input, p = 1 ,axis = None, - # using p_norm, as_vector = True - x_2 = paddle.arange(24, dtype="float32") - 12 - x_2.stop_gradient = False - out_2 = paddle.linalg.norm(x_2, p=1) - paddle.static.append_backward(out_2.sum()) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_2, x_2.grad_name]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (24,)) - - # 1D input, p = 1 ,axis = 0, - # 
using p_norm, as_vector = False - x_2_p = paddle.arange(24, dtype="float32") - 12 - x_2_p.stop_gradient = False - out_2_p = paddle.linalg.norm(x_2_p, p=1, axis=0) - paddle.static.append_backward(out_2_p.sum()) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_2_p, x_2_p.grad_name]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (24,)) - - # 1D input, p = fro ,axis = 0, - # using p_norm, as_vector = False - x_2_fro = paddle.arange(24, dtype="float32") - 12 - x_2_fro.stop_gradient = False - out_2_fro = paddle.linalg.norm(x_2_fro, p="fro", axis=0) - paddle.static.append_backward(out_2_fro.sum()) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_2_fro, x_2_fro.grad_name]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (24,)) - - # 2D input, p = 1, axis = [0, 1] - # using p_matrix_norm, depends on paddle.sum - x_3 = paddle.arange(24, dtype="float32").reshape([4, 6]) - x_3.stop_gradient = False - out_3 = paddle.linalg.norm(x_3, p=1, axis=[0, 1]) - paddle.static.append_backward(out_3.sum()) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_3, x_3.grad_name]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (4, 6)) - - # 2D input, p = 1, axis = None - # using p_matrix_norm, depends on paddle.sum - x_4 = paddle.arange(24, dtype="float32").reshape([4, 6]) - x_4.stop_gradient = False - out_4 = paddle.linalg.norm(x_4) - paddle.static.append_backward(out_4.sum()) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_4, x_4.grad_name]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (4, 6)) - - # 2D input, p = inf, axis = None - x_5 = paddle.arange(24, dtype="float32").reshape([4, 6]) - x_5.stop_gradient = False - out_5 = paddle.linalg.norm(x_5) - paddle.static.append_backward(out_5.sum()) - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_5, x_5.grad_name]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (4, 6)) - - # 2D input, p = -inf, axis = [0, 1] - x_6 = paddle.arange(24, dtype="float32").reshape([4, 6]) - x_6.stop_gradient = False - out_6 = paddle.linalg.norm(x_6, p=-float("inf"), axis=[0, 1]) - paddle.static.append_backward(out_6.sum()) - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_6, x_6.grad_name]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (4, 6)) - - @test_with_pir_api - @prog_scope() - def test_linalg_cond(self): - # use paddle.sum - x = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) - x.stop_gradient = False - out = paddle.linalg.cond(x) - _, x_grad = paddle.static.append_backward(out, parameter_list=[x])[0] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, x_grad]) - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (3, 3)) - - # p = fro : use paddle.sum - x2 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) - x2.stop_gradient = False - out_fro = paddle.linalg.cond(x2, p='fro') - grad_list = paddle.static.append_backward(out_fro, parameter_list=[x2]) - ((_, x2_grad),) = grad_list - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_fro, x2_grad]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (3, 3)) - - # p = nuc : use paddle.sum - x3 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 
0], [1, 0, 1]]) - x3.stop_gradient = False - out_nuc = paddle.linalg.cond(x3, p='nuc') - _, x3_grad = paddle.static.append_backward( - out_nuc, parameter_list=[x3] - )[0] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_nuc, x3_grad]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (3, 3)) - - # p in (-1, 1) : use paddle.sum - x4 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) - x4.stop_gradient = False - out_1 = paddle.linalg.cond(x4, p=1) - _, x4_grad = paddle.static.append_backward(out_1, parameter_list=[x4])[ - 0 - ] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_1, x4_grad]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (3, 3)) - - x5 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) - x5.stop_gradient = False - out_minus_1 = paddle.linalg.cond(x5, p=-1) - ((_, x5_grad),) = paddle.static.append_backward( - out_minus_1, parameter_list=[x5] - ) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_minus_1, x5_grad]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (3, 3)) - - # p in (-2, 2) depends on paddle.sum - x6 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) - x6.stop_gradient = False - out_2 = paddle.linalg.cond(x6, p=2) - ((_, x6_grad),) = paddle.static.append_backward( - out_2, parameter_list=[x6] - ) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_2, x6_grad]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (3, 3)) - - # p in (-inf, inf):use paddle.sum - x8 = paddle.to_tensor([[1.0, 0, -1], [0, 1, 0], [1, 0, 1]]) - x8.stop_gradient = False - out_inf = paddle.linalg.cond(x8, p=float("inf")) - ((_, x8_grad),) = paddle.static.append_backward( - out_inf, parameter_list=[x8] - ) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out_inf, x8_grad]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (3, 3)) - - # depends on paddle.sum - a = paddle.randn([2, 4, 4]) - a.stop_gradient = False - a_cond_fro = paddle.linalg.cond(a, p='fro') - ((_, a_grad),) = paddle.static.append_backward( - a_cond_fro.sum(), parameter_list=[a] - ) - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[a_cond_fro, a_grad]) - - self.assertEqual(res[0].shape, (2,)) - self.assertEqual(res[1].shape, (2, 4, 4)) - - @prog_scope() - def test_trace(self): - x = paddle.to_tensor([[3, 2], [1, 9]], dtype="float32") - x.stop_gradient = False - out = paddle.trace(x) - _, x_grad = paddle.static.append_backward(out, parameter_list=[x])[0] - - prog = paddle.static.default_main_program() - res = self.exe.run(prog, fetch_list=[out, x_grad]) - - self.assertEqual(res[0].shape, ()) - self.assertEqual(res[1].shape, (2, 2)) - np.testing.assert_allclose(res[0], np.array(12)) - - -# Use to test API whose zero-dim input tensors don't have grad and not need to test backward in OpTest. 
-class TestNoBackwardAPI(unittest.TestCase):
-    def setUp(self):
-        paddle.disable_static()
-        self.shape = [
-            paddle.full([], 2, 'int32'),
-            paddle.full([], 3, 'int32'),
-            paddle.full([], 4, 'int32'),
-        ]
-
-    def test_slice(self):
-        starts = [paddle.full([], 1, 'int32'), paddle.full([], 1, 'int32')]
-        ends = [paddle.full([], 3, 'int32'), paddle.full([], 3, 'int32')]
-        x = paddle.rand([5, 3, 3])
-        out = paddle.slice(x, [1, 2], starts, ends)
-        self.assertEqual(out.shape, [5, 2, 2])
-
-    def test_strided_slice(self):
-        starts = [paddle.full([], 0, 'int32'), paddle.full([], 0, 'int32')]
-        ends = [paddle.full([], 4, 'int32'), paddle.full([], 4, 'int32')]
-        strides = [paddle.full([], 2, 'int32'), paddle.full([], 2, 'int32')]
-        x = paddle.rand([5, 5, 5])
-        out = paddle.strided_slice(x, [1, 2], starts, ends, strides)
-        self.assertEqual(out.shape, [5, 2, 2])
-
-    def test_linspace(self):
-        start = paddle.full([], 1.0)
-        stop = paddle.full([], 5.0)
-        num = paddle.full([], 5, 'int32')
-        out = paddle.linspace(start, stop, num)
-        np.testing.assert_array_equal(out.numpy(), [1.0, 2.0, 3.0, 4.0, 5.0])
-
-    def test_logspace(self):
-        start = paddle.full([], 1.0)
-        stop = paddle.full([], 3.0)
-        num = paddle.full([], 5, 'int32')
-        base = paddle.full([], 2.0)
-        out = paddle.logspace(start, stop, num, base)
-        self.assertEqual(out.shape, [5])
-
-    def test_arange(self):
-        start = paddle.full([], 1.0)
-        stop = paddle.full([], 6.0)
-        step = paddle.full([], 1.0)
-        out = paddle.arange(start, stop, step)
-        np.testing.assert_array_equal(out.numpy(), [1.0, 2.0, 3.0, 4.0, 5.0])
-
-    def test_normal(self):
-        mean = paddle.full([], 0.0)
-        std = paddle.full([], 0.0)
-        out = paddle.normal(mean, std)
-        self.assertEqual(out.shape, [])
-
-        out = paddle.normal(0.0, 1.0, [])
-        self.assertEqual(out.shape, [])
-
-        out = paddle.normal(0.0, 1.0, self.shape)
-        self.assertEqual(out.shape, [2, 3, 4])
-
-    def test_rand(self):
-        out = paddle.rand([])
-        self.assertEqual(out.shape, [])
-
-        out = paddle.rand(self.shape)
-        self.assertEqual(out.shape, [2, 3, 4])
-
-    def test_randn(self):
-        out = paddle.randn([])
-        self.assertEqual(out.shape, [])
-
-        out = paddle.randn(self.shape)
-        self.assertEqual(out.shape, [2, 3, 4])
-
-    def test_randint_and_randint_like(self):
-        out = paddle.randint(-10, 10, [])
-        self.assertEqual(out.shape, [])
-
-        out = paddle.randint_like(out, -10, 10)
-        self.assertEqual(out.shape, [])
-
-        out = paddle.randint(-10, 10, self.shape)
-        self.assertEqual(out.shape, [2, 3, 4])
-
-    def test_standard_normal(self):
-        out = paddle.standard_normal([])
-        self.assertEqual(out.shape, [])
-
-        out = paddle.standard_normal(self.shape)
-        self.assertEqual(out.shape, [2, 3, 4])
-
-    def test_uniform(self):
-        out = paddle.uniform([])
-        self.assertEqual(out.shape, [])
-
-        out = paddle.uniform(self.shape)
-        self.assertEqual(out.shape, [2, 3, 4])
-
-    def test_empty_and_empty_like(self):
-        out = paddle.empty([])
-        self.assertEqual(out.shape, [])
-
-        out = paddle.empty_like(out)
-        self.assertEqual(out.shape, [])
-
-        out = paddle.empty(self.shape)
-        self.assertEqual(out.shape, [2, 3, 4])
-
-    def test_full_and_full_like(self):
-        out = paddle.full([], 0.5)
-        self.assertEqual(out.shape, [])
-
-        out = paddle.full_like(out, 0.5)
-        self.assertEqual(out.shape, [])
-
-        out = paddle.full(self.shape, 0.5)
-        self.assertEqual(out.shape, [2, 3, 4])
-
-    def test_ones_and_ones_like(self):
-        out = paddle.ones([])
-        self.assertEqual(out.shape, [])
-
-        out = paddle.ones_like(out)
-        self.assertEqual(out.shape, [])
-
-        out = paddle.ones(self.shape)
-        self.assertEqual(out.shape, [2, 3, 4])
-
-    def test_zeros_and_zeros_like(self):
-        out = paddle.zeros([])
-        self.assertEqual(out.shape, [])
-
-        out = paddle.zeros_like(out)
-        self.assertEqual(out.shape, [])
-
-        out = paddle.zeros(self.shape)
-        self.assertEqual(out.shape, [2, 3, 4])
-
-    def test_embedding(self):
-        ids = paddle.full(shape=[], fill_value=1, dtype='int64')
-        w0 = paddle.arange(3, 9).reshape((3, 2)).astype(paddle.float32)
-        w = paddle.to_tensor(w0, stop_gradient=False)
-        emb = paddle.nn.functional.embedding(
-            x=ids, weight=w, sparse=True, name="embedding"
-        )
-        self.assertEqual(emb.shape, [2])
-        res = [5.0, 6.0]
-        for i in range(len(res)):
-            self.assertEqual(emb.numpy()[i], res[i])
-
-    def test_one_hot_label(self):
-        label = paddle.full(shape=[], fill_value=2, dtype='int64')
-        one_hot_label = paddle.nn.functional.one_hot(label, num_classes=4)
-        self.assertEqual(one_hot_label.shape, [4])
-        self.assertEqual(one_hot_label.numpy()[2], 1)
-
-    def test_unique_consecutive(self):
-        places = ['cpu']
-        if paddle.is_compiled_with_cuda():
-            places.append('gpu')
-        for place in places:
-            paddle.set_device(place)
-            x = paddle.rand([])
-            y, inverse, counts = paddle.unique_consecutive(
-                x,
-                return_inverse=True,
-                return_counts=True,
-            )
-
-            self.assertEqual(y, x)
-            self.assertEqual(inverse, 0)
-            self.assertEqual(counts, 1)
-            self.assertEqual(y.shape, [1])
-            self.assertEqual(inverse.shape, [1])
-            self.assertEqual(counts.shape, [1])
-
-    def test_unique(self):
-        places = ['cpu']
-        if paddle.is_compiled_with_cuda():
-            places.append('gpu')
-        for place in places:
-            paddle.set_device(place)
-            x = paddle.rand([])
-            y, index, inverse, counts = paddle.unique(
-                x,
-                return_index=True,
-                return_inverse=True,
-                return_counts=True,
-            )
-
-            self.assertEqual(y, x)
-            self.assertEqual(index, 0)
-            self.assertEqual(inverse, 0)
-            self.assertEqual(counts, 1)
-            self.assertEqual(y.shape, [1])
-            self.assertEqual(index.shape, [1])
-            self.assertEqual(inverse.shape, [1])
-            self.assertEqual(counts.shape, [1])
-
-    def test_matrix_rank(self):
-        x = paddle.eye(10)
-        x.stop_gradient = False
-        out = paddle.linalg.matrix_rank(x)
-
-        self.assertEqual(out.shape, [])
-        np.testing.assert_equal(out, np.array(10))
-
-        c = paddle.ones(shape=[3, 4, 5])
-        c.stop_gradient = False
-        out_c = paddle.linalg.matrix_rank(c)
-        self.assertEqual(out_c.shape, [3])
-        np.testing.assert_equal(out_c, np.array([1, 1, 1]))
-
-        # 2D, tol->float : OUTPUT 0D
-        x_tol = paddle.eye(10)
-        x_tol.stop_gradient = False
-        out_tol = paddle.linalg.matrix_rank(x_tol, tol=0.1)
-        self.assertEqual(out_tol.shape, [])
-
-        # 3D, tol->float : OUTPUT 1D
-        c_tol = paddle.ones(shape=[3, 4, 5])
-        c_tol.stop_gradient = False
-        out_c_tol = paddle.linalg.matrix_rank(c_tol, tol=0.1)
-        self.assertEqual(out_c_tol.shape, [3])
-
-        tol_2 = paddle.randn([2])
-        # 2D, tol->Tensor[1,2] : OUTPUT 1D
-        d = paddle.eye(10)
-        out_d = paddle.linalg.matrix_rank(d, tol=tol_2)
-        self.assertEqual(out_d.shape, [2])
-
-
-class TestNoBackwardAPIStatic(unittest.TestCase):
-    def setUp(self):
-        paddle.enable_static()
-        self.exe = paddle.static.Executor()
-        self.shape = [
-            paddle.full([], 2, 'int32'),
-            paddle.full([], 3, 'int32'),
-            paddle.full([], 4, 'int32'),
-        ]
-
-    def test_slice(self):
-        starts = [paddle.full([], 1, 'int32'), paddle.full([], 1, 'int32')]
-        ends = [paddle.full([], 3, 'int32'), paddle.full([], 3, 'int32')]
-        x = paddle.rand([5, 3, 3])
-        out = paddle.slice(x, [1, 2], starts, ends)
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out]
-        )[0]
-        self.assertEqual(res.shape, (5, 2, 2))
-
-    @test_with_pir_api
-    def test_strided_slice(self):
-        starts = [paddle.full([], 0, 'int32'), paddle.full([], 0, 'int32')]
-        ends = [paddle.full([], 4, 'int32'), paddle.full([], 4, 'int32')]
-        strides = [paddle.full([], 2, 'int32'), paddle.full([], 2, 'int32')]
-        x = paddle.rand([5, 5, 5])
-        out = paddle.strided_slice(x, [1, 2], starts, ends, strides)
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out]
-        )[0]
-        self.assertEqual(res.shape, (5, 2, 2))
-
-    def test_linspace(self):
-        start = paddle.full([], 1.0)
-        stop = paddle.full([], 5.0)
-        num = paddle.full([], 5, 'int32')
-        out = paddle.linspace(start, stop, num)
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out]
-        )[0]
-        np.testing.assert_array_equal(res, [1.0, 2.0, 3.0, 4.0, 5.0])
-
-    @test_with_pir_api
-    def test_arange(self):
-        start = paddle.full([], 1.0)
-        stop = paddle.full([], 6.0)
-        step = paddle.full([], 1.0)
-        out = paddle.arange(start, stop, step)
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out]
-        )[0]
-        np.testing.assert_array_equal(res, [1.0, 2.0, 3.0, 4.0, 5.0])
-
-    def test_normal(self):
-        mean = paddle.full([], 0.0)
-        std = paddle.full([], 0.0)
-        out1 = paddle.normal(mean, std)
-        out2 = paddle.normal(0.0, 1.0, [])
-        out3 = paddle.normal(0.0, 1.0, self.shape)
-
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out1, out2, out3]
-        )
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, ())
-        self.assertEqual(res[2].shape, (2, 3, 4))
-
-    def test_rand(self):
-        out1 = paddle.rand([])
-        out2 = paddle.rand(self.shape)
-
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out1, out2]
-        )
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (2, 3, 4))
-
-    def test_randn(self):
-        out1 = paddle.randn([])
-        out2 = paddle.randn(self.shape)
-
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out1, out2]
-        )
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (2, 3, 4))
-
-    @test_with_pir_api
-    def test_randint(self):
-        with paddle.static.program_guard(
-            paddle.static.Program(), paddle.static.Program()
-        ):
-            out1 = paddle.randint(-10, 10, [])
-
-            shape = [
-                paddle.full([], 2, 'int32'),
-                paddle.full([], 3, 'int32'),
-                paddle.full([], 4, 'int32'),
-            ]
-            out2 = paddle.randint(-10, 10, shape)
-
-            res = self.exe.run(
-                paddle.static.default_main_program(), fetch_list=[out1, out2]
-            )
-
-            self.assertEqual(res[0].shape, ())
-            self.assertEqual(res[1].shape, (2, 3, 4))
-
-    @test_with_pir_api
-    def test_randint_like(self):
-        with paddle.static.program_guard(
-            paddle.static.Program(), paddle.static.Program()
-        ):
-            out1 = paddle.rand([])
-            out2 = paddle.randint_like(out1, -10, 10)
-
-            res = self.exe.run(
-                paddle.static.default_main_program(), fetch_list=[out1, out2]
-            )
-
-            self.assertEqual(res[0].shape, ())
-            self.assertEqual(res[1].shape, ())
-
-    def test_standard_normal(self):
-        out1 = paddle.standard_normal([])
-        out2 = paddle.standard_normal(self.shape)
-
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out1, out2]
-        )
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (2, 3, 4))
-
-    def test_uniform(self):
-        out1 = paddle.uniform([])
-        out2 = paddle.uniform(self.shape)
-
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out1, out2]
-        )
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (2, 3, 4))
-
-    def test_empty_and_empty_like(self):
-        out1 = paddle.empty([])
-        out2 = paddle.empty_like(out1)
-        out3 = paddle.empty(self.shape)
-
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out1, out2, out3]
-        )
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, ())
-        self.assertEqual(res[2].shape, (2, 3, 4))
-
-    def test_full_and_full_like(self):
-        out1 = paddle.full([], 0.5)
-        out2 = paddle.full_like(out1, 0.5)
-        out3 = paddle.full(self.shape, 0.5)
-        out4 = paddle.full(self.shape, paddle.full([], 0.5))
-
-        res = self.exe.run(
-            paddle.static.default_main_program(),
-            fetch_list=[out1, out2, out3, out4],
-        )
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, ())
-        self.assertEqual(res[2].shape, (2, 3, 4))
-        self.assertEqual(res[3].shape, (2, 3, 4))
-
-    def test_ones_and_ones_like(self):
-        out1 = paddle.ones([])
-        out2 = paddle.ones_like(out1)
-        out3 = paddle.ones(self.shape)
-
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out1, out2, out3]
-        )
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, ())
-        self.assertEqual(res[2].shape, (2, 3, 4))
-
-    def test_zeros_and_zeros_like(self):
-        out1 = paddle.zeros([])
-        out2 = paddle.zeros_like(out1)
-        out3 = paddle.zeros(self.shape)
-
-        res = self.exe.run(
-            paddle.static.default_main_program(), fetch_list=[out1, out2, out3]
-        )
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, ())
-        self.assertEqual(res[2].shape, (2, 3, 4))
-
-    def test_embedding(self):
-        ids = paddle.full(shape=[], fill_value=1, dtype='int64')
-        w0 = paddle.arange(3, 9).reshape((3, 2)).astype(paddle.float32)
-        w = paddle.to_tensor(w0, stop_gradient=False)
-        emb = paddle.nn.functional.embedding(
-            x=ids, weight=w, sparse=True, name="embedding"
-        )
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[emb])
-        self.assertEqual(res[0].shape, (2,))
-        result = [5.0, 6.0]
-        for i in range(len(res)):
-            self.assertEqual(res[0][i], result[i])
-
-    def test_static_embedding(self):
-        ids = paddle.full(shape=[], fill_value=1, dtype='int64')
-        emb = paddle.static.nn.embedding(ids, (20, 3))
-        prog = paddle.static.default_main_program()
-        self.exe.run(paddle.static.default_startup_program())
-        res = self.exe.run(prog, fetch_list=[emb])
-        self.assertEqual(res[0].shape, (3,))
-
-    @test_with_pir_api
-    def test_one_hot_label(self):
-        label = paddle.full(shape=[], fill_value=2, dtype='int64')
-        one_hot_label = paddle.nn.functional.one_hot(label, num_classes=4)
-        prog = paddle.static.default_main_program()
-        self.exe.run(paddle.static.default_startup_program())
-        res = self.exe.run(prog, fetch_list=[one_hot_label])
-
-        self.assertEqual(res[0].shape, (4,))
-        self.assertEqual(res[0][2], 1)
-
-    def test_unique_consecutive(self):
-        x = paddle.rand([])
-        y, inverse, counts = paddle.unique_consecutive(
-            x, return_inverse=True, return_counts=True
-        )
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[y, inverse, counts])
-        self.assertEqual(y, x)
-        self.assertEqual(inverse, 0)
-        self.assertEqual(counts, 1)
-        self.assertEqual(res[0].shape, (1,))
-        self.assertEqual(res[1].shape, (1,))
-        self.assertEqual(res[2].shape, (1,))
-
-    def test_unique(self):
-        x = paddle.rand([])
-        y, index, inverse, counts = paddle.unique(
-            x, return_index=True, return_inverse=True, return_counts=True
-        )
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[y, index, inverse, counts])
-        self.assertEqual(y, x)
-        self.assertEqual(index, 0)
-        self.assertEqual(inverse, 0)
-        self.assertEqual(counts, 1)
-        self.assertEqual(res[0].shape, (1,))
-        self.assertEqual(res[1].shape, (1,))
-        self.assertEqual(res[2].shape, (1,))
-        self.assertEqual(res[3].shape, (1,))
-
-    @test_with_pir_api
-    def test_static_matrix_rank(self):
-        # 2D : OUTPUT 0D
-        with paddle.static.program_guard(
-            paddle.static.Program(), paddle.static.Program()
-        ):
-            x = paddle.eye(10)
-            x.stop_gradient = False
-            out = paddle.linalg.matrix_rank(x)
-            exe = paddle.static.Executor()
-            res = exe.run(fetch_list=[out])
-            self.assertEqual(res[0].shape, ())
-
-        # 3D : OUTPUT 1D
-        with paddle.static.program_guard(
-            paddle.static.Program(), paddle.static.Program()
-        ):
-            c = paddle.ones(shape=[3, 4, 5])
-            c.stop_gradient = False
-            out_c = paddle.linalg.matrix_rank(c)
-            exe = paddle.static.Executor()
-            res = exe.run(fetch_list=[out_c])
-            self.assertEqual(res[0].shape, (3,))
-
-        # 2D, tol->float : OUTPUT 0D
-        with paddle.static.program_guard(
-            paddle.static.Program(), paddle.static.Program()
-        ):
-            x_tol = paddle.eye(10)
-            x_tol.stop_gradient = False
-            out_tol = paddle.linalg.matrix_rank(x_tol, tol=0.1)
-            exe = paddle.static.Executor()
-            res = exe.run(fetch_list=[out_tol])
-            self.assertEqual(res[0].shape, ())
-
-        # 3D, tol->float : OUTPUT 1D
-        with paddle.static.program_guard(
-            paddle.static.Program(), paddle.static.Program()
-        ):
-            c_tol = paddle.ones(shape=[3, 4, 5])
-            c_tol.stop_gradient = False
-            out_c_tol = paddle.linalg.matrix_rank(c_tol, tol=0.1)
-            exe = paddle.static.Executor()
-            res = exe.run(fetch_list=[out_c_tol])
-            self.assertEqual(res[0].shape, (3,))
-
-        # 2D, tol->Tensor[1,2] : OUTPUT 1D
-        with paddle.static.program_guard(
-            paddle.static.Program(), paddle.static.Program()
-        ):
-            tol_2 = paddle.randn([2])
-            d = paddle.eye(10)
-            out_d = paddle.linalg.matrix_rank(d, tol=tol_2)
-            exe = paddle.static.Executor()
-            res = exe.run(fetch_list=[out_d])
-            self.assertEqual(res[0].shape, (2,))
-
-
-unary_apis_with_complex_input = [
-    paddle.real,
-    paddle.imag,
-    paddle.angle,
-    paddle.conj,
-]
-
-
-class TestUnaryElementwiseAPIWithComplexInput(unittest.TestCase):
-    def test_dygraph_unary(self):
-        paddle.disable_static()
-        for api in unary_apis_with_complex_input:
-            x = paddle.rand([]) + 1j * paddle.rand([])
-            x.stop_gradient = False
-            x.retain_grads()
-            out = api(x)
-            out.retain_grads()
-            out.backward()
-
-            self.assertEqual(x.shape, [])
-            self.assertEqual(out.shape, [])
-            if x.grad is not None:
-                self.assertEqual(x.grad.shape, [])
-                self.assertEqual(out.grad.shape, [])
-
-        paddle.enable_static()
-
-    def test_static_unary(self):
-        paddle.enable_static()
-        for api in unary_apis_with_complex_input:
-            main_prog = paddle.static.Program()
-            block = main_prog.global_block()
-            exe = paddle.static.Executor()
-            with paddle.static.program_guard(
-                main_prog, paddle.static.Program()
-            ):
-                x = paddle.complex(paddle.rand([]), paddle.rand([]))
-                x.stop_gradient = False
-                out = api(x)
-                paddle.static.append_backward(out)
-
-                fetch_list = [x, out]
-                if block.has_var(x.grad_name):
-                    fetch_list.extend([x.grad_name, out.grad_name])
-
-                # 1) Test Program
-                res = exe.run(main_prog, fetch_list=fetch_list)
-                for item in res:
-                    self.assertEqual(item.shape, ())
-
-                # 2) Test CompiledProgram Program
-                compile_prog = paddle.static.CompiledProgram(main_prog)
-                res = exe.run(compile_prog, fetch_list=fetch_list)
-                for item in res:
-                    self.assertEqual(item.shape, ())
-
-        paddle.disable_static()
-
-
-class TestAsReal(unittest.TestCase):
-    def test_dygraph(self):
-        paddle.disable_static()
-        x = paddle.rand([]) + 1j * paddle.rand([])
-        x.stop_gradient = False
-        x.retain_grads()
-        out = paddle.as_real(x)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(x.shape, [])
-        self.assertEqual(out.shape, [2])
-        if x.grad is not None:
-            self.assertEqual(x.grad.shape, [])
-            self.assertEqual(out.grad.shape, [2])
-
-        paddle.enable_static()
-
-    def test_static(self):
-        paddle.enable_static()
-
-        main_prog = paddle.static.Program()
-        block = main_prog.global_block()
-        exe = paddle.static.Executor()
-        with paddle.static.program_guard(main_prog, paddle.static.Program()):
-            x = paddle.complex(paddle.rand([]), paddle.rand([]))
-            x.stop_gradient = False
-            out = paddle.as_real(x)
-            self.assertEqual(x.shape, ())
-            self.assertEqual(out.shape, (2,))
-            paddle.static.append_backward(out.sum())
-
-            fetch_list = [x, out]
-            if block.has_var(x.grad_name):
-                fetch_list.extend([x.grad_name, out.grad_name])
-
-            res = exe.run(main_prog, fetch_list=fetch_list)
-            self.assertEqual(res[0].shape, ())
-            self.assertEqual(res[1].shape, (2,))
-            self.assertEqual(res[2].shape, ())
-            self.assertEqual(res[3].shape, (2,))
-
-        paddle.disable_static()
-
-
-class TestAsComplex(unittest.TestCase):
-    def test_dygraph(self):
-        paddle.disable_static()
-        x = paddle.rand([2])
-        x.stop_gradient = False
-        x.retain_grads()
-        out = paddle.as_complex(x)
-        out.retain_grads()
-        out.backward()
-
-        self.assertEqual(x.shape, [2])
-        self.assertEqual(out.shape, [])
-        if x.grad is not None:
-            self.assertEqual(x.grad.shape, [2])
-            self.assertEqual(out.grad.shape, [])
-
-        paddle.enable_static()
-
-    def test_static(self):
-        paddle.enable_static()
-        main_prog = paddle.static.Program()
-        block = main_prog.global_block()
-        exe = paddle.static.Executor()
-        with paddle.static.program_guard(main_prog, paddle.static.Program()):
-            x = paddle.rand([2])
-            x.stop_gradient = False
-            out = paddle.as_complex(x)
-            self.assertEqual(x.shape, (2,))
-            self.assertEqual(out.shape, ())
-            paddle.static.append_backward(out.sum())
-
-            fetch_list = [x, out]
-            if block.has_var(x.grad_name):
-                fetch_list.extend([x.grad_name, out.grad_name])
-
-            res = exe.run(main_prog, fetch_list=fetch_list)
-            self.assertEqual(res[0].shape, (2,))
-            self.assertEqual(res[1].shape, ())
-            self.assertEqual(res[2].shape, (2,))
-            self.assertEqual(res[3].shape, ())
-
-        paddle.disable_static()
-
-
-class TestDistribution(unittest.TestCase):
-    def setUp(self):
-        self.x = paddle.full([], 2.0)
-
-    def test_Bernoulli(self):
-        d = paddle.distribution.Bernoulli(probs=0.3)
-        self.assertEqual(d.mean.shape, [])
-        self.assertEqual(d.variance.shape, [])
-        self.assertEqual(d.entropy().shape, [])
-        self.assertEqual(d.sample([]).shape, [])
-        self.assertEqual(d.rsample([]).shape, [])
-        self.assertEqual(d.cdf(self.x).shape, [])
-        self.assertEqual(d.prob(self.x).shape, [])
-        self.assertEqual(d.log_prob(self.x).shape, [])
-
-        d_other = paddle.distribution.Bernoulli(probs=0.7)
-        self.assertEqual(d.kl_divergence(d_other).shape, [])
-
-    def test_Geometric(self):
-        d = paddle.distribution.Geometric(0.5)
-        self.assertEqual(d.mean.shape, [])
-        self.assertEqual(d.variance.shape, [])
-        self.assertEqual(d.entropy().shape, [])
-        self.assertEqual(d.stddev.shape, [])
-        self.assertEqual(d.pmf(self.x).shape, [])
-        self.assertEqual(d.log_pmf(self.x).shape, [])
-        self.assertEqual(d.sample([]).shape, [])
-        self.assertEqual(d.rsample([]).shape, [])
-        self.assertEqual(d.cdf(self.x).shape, [])
-
-        d_other = paddle.distribution.Geometric(probs=0.7)
-        self.assertEqual(d.kl_divergence(d_other).shape, [])
-
-    def test_Cauchy(self):
-        d = paddle.distribution.Cauchy(loc=0.1, scale=1.2)
-        self.assertEqual(d.sample([]).shape, [])
-        self.assertEqual(d.rsample([]).shape, [])
-        self.assertEqual(d.prob(self.x).shape, [])
-        self.assertEqual(d.log_prob(self.x).shape, [])
-        self.assertEqual(d.cdf(self.x).shape, [])
-        self.assertEqual(d.entropy().shape, [])
-
-        d_other = paddle.distribution.Cauchy(
-            loc=paddle.to_tensor(1.2), scale=paddle.to_tensor(2.3)
-        )
-        self.assertEqual(d.kl_divergence(d_other).shape, [])
-
-    def test_Categorical(self):
-        logits = paddle.rand([6])
-        d = paddle.distribution.Categorical(logits)
-        self.assertEqual(d.sample([]).shape, [])
-        self.assertEqual(d.probs(paddle.full([], 2, dtype='int64')).shape, [])
-        self.assertEqual(
-            d.log_prob(paddle.full([], 2, dtype='int64')).shape, []
-        )
-        self.assertEqual(d.entropy().shape, [])
-
-    def test_Normal(self):
-        normal = paddle.distribution.Normal(0.0, 3.0)
-        self.assertEqual(normal.sample([]).shape, [])
-        self.assertEqual(normal.rsample([]).shape, [])
-        self.assertEqual(normal.mean.shape, [])
-        self.assertEqual(normal.variance.shape, [])
-        self.assertEqual(normal.probs(self.x).shape, [])
-        self.assertEqual(normal.log_prob(self.x).shape, [])
-        self.assertEqual(normal.entropy().shape, [])
-
-        normal = paddle.distribution.Normal(
-            paddle.full([], 0.0), paddle.full([], 3.0)
-        )
-        self.assertEqual(normal.sample([]).shape, [])
-        self.assertEqual(normal.rsample([]).shape, [])
-        self.assertEqual(normal.mean.shape, [])
-        self.assertEqual(normal.variance.shape, [])
-        self.assertEqual(normal.probs(self.x).shape, [])
-        self.assertEqual(normal.log_prob(self.x).shape, [])
-        self.assertEqual(normal.entropy().shape, [])
-
-    def test_Uniform(self):
-        uniform = paddle.distribution.Uniform(0.0, 1.0)
-        self.assertEqual(uniform.sample([]).shape, [])
-        self.assertEqual(uniform.probs(self.x).shape, [])
-        self.assertEqual(uniform.log_prob(self.x).shape, [])
-        self.assertEqual(uniform.entropy().shape, [])
-
-        uniform = paddle.distribution.Uniform(
-            paddle.full([], 0.0), paddle.full([], 1.0)
-        )
-        self.assertEqual(uniform.sample([]).shape, [])
-        self.assertEqual(uniform.probs(self.x).shape, [])
-        self.assertEqual(uniform.log_prob(self.x).shape, [])
-        self.assertEqual(uniform.entropy().shape, [])
-
-    def test_Beta(self):
-        beta = paddle.distribution.Beta(alpha=0.5, beta=0.5)
-        self.assertEqual(beta.sample([]).shape, [])
-        self.assertEqual(beta.mean.shape, [])
-        self.assertEqual(beta.variance.shape, [])
-        self.assertEqual(beta.prob(self.x).shape, [])
-        self.assertEqual(beta.log_prob(self.x).shape, [])
-        self.assertEqual(beta.entropy().shape, [])
-
-    def test_kl_divergence(self):
-        p = paddle.distribution.Beta(alpha=0.5, beta=0.5)
-        q = paddle.distribution.Beta(alpha=0.2, beta=1.0)
-        kl = paddle.distribution.kl_divergence(p, q)
-        self.assertEqual(kl.shape, [])
-
-    def test_TransformedDistribution(self):
-        d = paddle.distribution.TransformedDistribution(
-            paddle.distribution.Normal(0.0, 1.0),
-            [
-                paddle.distribution.AffineTransform(
-                    paddle.full([], 1.0), paddle.full([], 2.0)
-                )
-            ],
-        )
-        self.assertEqual(d.sample([]).shape, [])
-        self.assertEqual(d.rsample([]).shape, [])
-        self.assertEqual(d.prob(self.x).shape, [])
-        self.assertEqual(d.log_prob(self.x).shape, [])
-
-    def test_Laplace(self):
-        d = paddle.distribution.Laplace(0.0, 1.0)
-        self.assertEqual(d.sample([]).shape, [])
-        self.assertEqual(d.rsample([]).shape, [])
-        self.assertEqual(d.mean.shape, [])
-        self.assertEqual(d.stddev.shape, [])
-        self.assertEqual(d.variance.shape, [])
-        self.assertEqual(d.prob(self.x).shape, [])
-        self.assertEqual(d.log_prob(self.x).shape, [])
-        self.assertEqual(d.cdf(self.x).shape, [])
-        self.assertEqual(d.icdf(self.x).shape, [])
-        self.assertEqual(d.entropy().shape, [])
-
-    def test_LogNormal(self):
-        d = paddle.distribution.LogNormal(0.0, 1.0)
-        self.assertEqual(d.sample([]).shape, [])
-        self.assertEqual(d.mean.shape, [])
-        self.assertEqual(d.variance.shape, [])
-        self.assertEqual(d.entropy().shape, [])
-        self.assertEqual(d.probs(self.x).shape, [])
-
-    def test_Gumbel(self):
-        d = paddle.distribution.Gumbel(0.0, 1.0)
-        self.assertEqual(d.sample([]).shape, [])
-        self.assertEqual(d.rsample([]).shape, [])
-        self.assertEqual(d.mean.shape, [])
-        self.assertEqual(d.variance.shape, [])
-        self.assertEqual(d.stddev.shape, [])
-        self.assertEqual(d.prob(self.x).shape, [])
-        self.assertEqual(d.log_prob(self.x).shape, [])
-        self.assertEqual(d.cdf(self.x).shape, [])
-        self.assertEqual(d.entropy().shape, [])
-
-    def test_Multinomial(self):
-        d = paddle.distribution.Multinomial(
-            10, paddle.to_tensor([0.2, 0.3, 0.5])
-        )
-        self.assertEqual(d.prob(self.x).shape, [])
-        self.assertEqual(d.log_prob(self.x).shape, [])
-        self.assertEqual(d.entropy().shape, [])
-
-
-class TestLossAPI(unittest.TestCase):
-    def test_sigmoid_focal_loss(self):
-        logit = paddle.to_tensor(
-            [[0.97, 0.91, 0.03], [0.55, 0.43, 0.71]],
-            dtype='float32',
-            stop_gradient=False,
-        )
-        logit.retain_grads()
-        label = paddle.to_tensor(
-            [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], dtype='float32'
-        )
-        fg_num_0 = paddle.full([], 2.0)
-        fg_num_1 = paddle.full([1], 2.0)
-
-        out0 = F.sigmoid_focal_loss(
-            logit, label, normalizer=fg_num_0, reduction='sum'
-        )
-        out1 = F.sigmoid_focal_loss(
-            logit, label, normalizer=fg_num_1, reduction='sum'
-        )
-        out0.retain_grads()
-
-        np.testing.assert_array_equal(
-            out0.numpy(),
-            out1.numpy(),
-        )
-
-        out0.backward()
-        self.assertEqual(out0.shape, [])
-        self.assertEqual(out1.shape, [])
-        self.assertEqual(out0.grad.shape, [])
-        self.assertEqual(logit.grad.shape, [2, 3])
-
-    def test_cross_entropy(self):
-        input = paddle.rand([3, 5])
-        input.stop_gradient = False
-        label = paddle.randint(0, 5, shape=[3])
-
-        loss = paddle.nn.functional.cross_entropy(input, label, reduction='sum')
-        loss.backward()
-
-        self.assertEqual(loss.shape, [])
-        self.assertEqual(input.grad.shape, [3, 5])
-
-    def test_l1_loss(self):
-        input = paddle.rand([3, 5])
-        input.stop_gradient = False
-        label = paddle.rand([3, 5])
-
-        loss = paddle.nn.functional.l1_loss(input, label, reduction='mean')
-        loss.backward()
-
-        self.assertEqual(loss.shape, [])
-        self.assertEqual(input.grad.shape, [3, 5])
-
-    def test_nll_loss(self):
-        input = paddle.rand([5, 3])
-        input.stop_gradient = False
-        log_softmax = paddle.nn.LogSoftmax(axis=1)
-        log_out = log_softmax(input)
-        label = paddle.randint(0, 3, [5], "int64")
-
-        loss = paddle.nn.functional.nll_loss(log_out, label)
-        loss.backward()
-
-        self.assertEqual(loss.shape, [])
-        self.assertEqual(input.grad.shape, [5, 3])
-
-        input = paddle.rand([5, 3, 2, 4])
-        input.stop_gradient = False
-        log_softmax = paddle.nn.LogSoftmax(axis=1)
-        log_out = log_softmax(input)
-        label = paddle.randint(0, 3, [5, 2, 4], "int64")
-
-        loss = paddle.nn.functional.nll_loss(log_out, label)
-        loss.backward()
-
-        self.assertEqual(loss.shape, [])
-        self.assertEqual(input.grad.shape, [5, 3, 2, 4])
-
-
-class TestLossAPIStatic(unittest.TestCase):
-    def setUp(self):
-        paddle.enable_static()
-        self.exe = paddle.static.Executor()
-
-    @prog_scope()
-    def test_sigmoid_focal_loss(self):
-        logit = paddle.rand([2, 3])
-        logit.stop_gradient = False
-
-        label = paddle.randint(0, 1, [2, 3]).astype('float32')
-        label.stop_gradient = False
-
-        fg_num_0 = paddle.full([], 2.0)
-        fg_num_1 = paddle.full([1], 2.0)
-
-        out0 = F.sigmoid_focal_loss(
-            logit, label, normalizer=fg_num_0, reduction='mean'
-        )
-        out1 = F.sigmoid_focal_loss(
-            logit, label, normalizer=fg_num_1, reduction='mean'
-        )
-        paddle.static.append_backward(out0.sum())
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(
-            prog, fetch_list=[out0, out1, out0.grad_name, logit.grad_name]
-        )
-        np.testing.assert_allclose(res[0], res[1])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, ())
-        self.assertEqual(res[2].shape, ())
-        self.assertEqual(res[3].shape, (2, 3))
-
-    @prog_scope()
-    def test_cross_entropy(self):
-        input = paddle.rand([3, 5])
-        input.stop_gradient = False
-        label = paddle.randint(0, 5, shape=[3])
-        label.stop_gradient = False
-
-        loss = paddle.nn.functional.cross_entropy(
-            input, label, reduction='mean'
-        )
-        paddle.static.append_backward(loss)
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[loss, input.grad_name])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (3, 5))
-
-    @prog_scope()
-    def test_l1_loss(self):
-        input = paddle.rand([3, 5])
-        input.stop_gradient = False
-        label = paddle.rand([3, 5])
-
-        loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
-        paddle.static.append_backward(loss)
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[loss, input.grad_name])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (3, 5))
-
-    @prog_scope()
-    def test_nll_loss(self):
-        input = paddle.rand([5, 3])
-        input.stop_gradient = False
-        log_softmax = paddle.nn.LogSoftmax(axis=1)
-        log_out = log_softmax(input)
-
-        label = paddle.randint(0, 3, shape=[5])
-        label.stop_gradient = False
-
-        loss = paddle.nn.functional.nll_loss(log_out, label)
-        paddle.static.append_backward(loss)
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[loss, input.grad_name])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (5, 3))
-
-        input = paddle.rand([5, 3, 2, 4])
-        input.stop_gradient = False
-        log_softmax = paddle.nn.LogSoftmax(axis=1)
-        log_out = log_softmax(input)
-
-        label = paddle.randint(0, 3, shape=[5, 2, 4])
-        label.stop_gradient = False
-
-        loss = paddle.nn.functional.nll_loss(log_out, label)
-        paddle.static.append_backward(loss)
-
-        prog = paddle.static.default_main_program()
-        res = self.exe.run(prog, fetch_list=[loss, input.grad_name])
-        self.assertEqual(res[0].shape, ())
-        self.assertEqual(res[1].shape, (5, 3, 2, 4))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/legacy_test/test_zero_dim_unary_api.py b/test/legacy_test/test_zero_dim_unary_api.py
new file mode 100644
index 0000000000000..39c2bbca41068
--- /dev/null
+++ b/test/legacy_test/test_zero_dim_unary_api.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Note:
+# 0D Tensor indicates that the tensor's dimension is 0
+# 0D Tensor's shape is always [], numel is 1
+# which can be created by paddle.rand([])
+
+import unittest
+
+import paddle
+from paddle.pir_utils import test_with_pir_api
+
+unary_api_list = [
+    paddle.nn.functional.elu,
+    paddle.nn.functional.rrelu,
+    paddle.frac,
+    paddle.sgn,
+    paddle.nan_to_num,
+    paddle.i0,
+    paddle.i0e,
+    paddle.i1,
+    paddle.i1e,
+    paddle.nn.functional.gelu,
+    paddle.nn.functional.hardsigmoid,
+    paddle.nn.functional.hardswish,
+    paddle.nn.functional.hardshrink,
+    paddle.nn.functional.hardtanh,
+    paddle.nn.functional.leaky_relu,
+    paddle.nn.functional.log_sigmoid,
+    paddle.nn.functional.relu,
+    paddle.nn.functional.relu6,
+    paddle.nn.functional.sigmoid,
+    paddle.nn.functional.softplus,
+    paddle.nn.functional.softshrink,
+    paddle.nn.functional.softsign,
+    paddle.nn.functional.swish,
+    paddle.nn.functional.tanhshrink,
+    paddle.nn.functional.thresholded_relu,
+    paddle.stanh,
+    paddle.nn.functional.celu,
+    paddle.nn.functional.selu,
+    paddle.nn.functional.mish,
+    paddle.nn.functional.silu,
+    paddle.nn.functional.tanh,
+    paddle.nn.functional.dropout,
+    paddle.cosh,
+    paddle.sinh,
+    paddle.abs,
+    paddle.acos,
+    paddle.asin,
+    paddle.atan,
+    paddle.ceil,
+    paddle.cos,
+    paddle.exp,
+    paddle.floor,
+    paddle.log,
+    paddle.log1p,
+    paddle.reciprocal,
+    paddle.round,
+    paddle.sin,
+    paddle.sqrt,
+    paddle.square,
+    paddle.tanh,
+    paddle.acosh,
+    paddle.asinh,
+    paddle.atanh,
+    paddle.expm1,
+    paddle.log10,
+    paddle.log2,
+    paddle.tan,
+    paddle.erf,
+    paddle.erfinv,
+    paddle.rsqrt,
+    paddle.sign,
+    paddle.deg2rad,
+    paddle.rad2deg,
+    paddle.neg,
+    paddle.logit,
+    paddle.trunc,
+    paddle.digamma,
+    paddle.lgamma,
+    paddle.poisson,
+    paddle.bernoulli,
+    paddle.nn.functional.softmax,
+    paddle.nn.functional.log_softmax,
+    paddle.nn.functional.gumbel_softmax,
+    paddle.nn.functional.alpha_dropout,
+]
+
+inplace_unary_api_list = [
+    paddle.nn.functional.relu_,
+    paddle.nn.functional.tanh_,
+    paddle.tensor.sigmoid_,
+    paddle.tensor.ceil_,
+    paddle.tensor.floor_,
+    paddle.tensor.reciprocal_,
+    paddle.tensor.exp_,
+    paddle.tensor.sqrt_,
+]
+
+
+# Use to test zero-dim in unary API.
+class TestUnaryAPI(unittest.TestCase):
+    def test_dygraph_unary(self):
+        paddle.disable_static()
+        for api in unary_api_list:
+            x = paddle.rand([])
+            x.stop_gradient = False
+            out = api(x)
+
+            out.retain_grads()
+            out.backward()
+
+            self.assertEqual(x.shape, [])
+            self.assertEqual(out.shape, [])
+            if x.grad is not None:
+                self.assertEqual(x.grad.shape, [])
+                self.assertEqual(out.grad.shape, [])
+
+        for api in inplace_unary_api_list:
+            x = paddle.rand([])
+            out = api(x)
+            self.assertEqual(x.shape, [])
+            self.assertEqual(out.shape, [])
+
+        paddle.enable_static()
+
+    @test_with_pir_api
+    def test_static_unary(self):
+        paddle.enable_static()
+
+        for api in unary_api_list:
+            main_prog = paddle.static.Program()
+            block = main_prog.global_block()
+            exe = paddle.static.Executor()
+            with paddle.static.program_guard(
+                main_prog, paddle.static.Program()
+            ):
+                x = paddle.rand([])
+                x.stop_gradient = False
+                out = api(x)
+                fetch_list = [x, out]
+                grad_list = paddle.static.append_backward(
+                    out, parameter_list=fetch_list
+                )
+                fetch_list.extend(
+                    [
+                        _grad
+                        for _param, _grad in grad_list
+                        if isinstance(
+                            _grad,
+                            (paddle.pir.Value, paddle.base.framework.Variable),
+                        )
+                    ]
+                )
+
+                # 1) Test Program
+                res = exe.run(main_prog, fetch_list=fetch_list)
+                for item in res:
+                    self.assertEqual(item.shape, ())
+
+                # 2) Test CompiledProgram Program
+                if not paddle.framework.in_pir_mode():
+                    compile_prog = paddle.static.CompiledProgram(main_prog)
+                    res = exe.run(compile_prog, fetch_list=fetch_list)
+                    for item in res:
+                        self.assertEqual(item.shape, ())
+
+        paddle.disable_static()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh
index 3ab9fb83adfdc..f99f7c8cc58e7 100644
--- a/tools/windows/run_unittests.sh
+++ b/tools/windows/run_unittests.sh
@@ -143,7 +143,11 @@ disable_wingpu_cuda12_test="^test_cholesky_op$|\
 ^test_cuda_graph_static_mode$|\
 ^test_matrix_rank_op$|\
 ^test_sparse_pca_lowrank$|\
-^test_zero_dim_tensor$|\
+^test_zero_dim_no_backward_api$|\
+^test_zero_dim_sundry_dygraph_api$|\
+^test_zero_dim_sundry_static_api_part1$|\
+^test_zero_dim_sundry_static_api_part2$|\
+^test_zero_dim_sundry_static_api_part3$|\
 ^paddle_infer_api_copy_tensor_tester$|\
 ^cudnn_helper_test$|\
 ^test_analyzer_small_dam$|\

From 5875b9ea0d11a76a9fa4560243e91beae159f632 Mon Sep 17 00:00:00 2001
From: feifei-111 <2364819892@qq.com>
Date: Mon, 11 Mar 2024 06:52:31 +0000
Subject: [PATCH 114/114] update

---
 paddle/cinn/hlir/framework/pir/trivial_op.cc | 66 ++++++++++++--------
 1 file changed, 39 insertions(+), 27 deletions(-)

diff --git a/paddle/cinn/hlir/framework/pir/trivial_op.cc b/paddle/cinn/hlir/framework/pir/trivial_op.cc
index 14e1ce86bd3c8..974bb9510dc13 100644
--- a/paddle/cinn/hlir/framework/pir/trivial_op.cc
+++ b/paddle/cinn/hlir/framework/pir/trivial_op.cc
@@ -339,6 +339,8 @@ ir::Expr TTFusion(ir::Expr upper, ir::Expr down) {
   const auto& replaced_tensor = upstream.GetOutputTensor();
   VLOG(4) << "connected tensor is:" << replaced_tensor;
   VLOG(4) << "store value is :" << downstream.GetStoreValue();
+  VLOG(4) << "upper :\n" << upper;
+  VLOG(4) << "down :\n" << down;
 
   TrivialOp fused(ir::ir_utils::IRCopy(downstream.GetFuncBody()));
   SequenceMutator(
@@ -350,7 +352,7 @@ ir::Expr TTFusion(ir::Expr upper, ir::Expr down) {
       });
 
   VLOG(4) << "After mutate, store_value is: " << fused.GetFuncBody();
-  VLOG(4) << "TTFusion end:" << fused.GetFuncBody();
+  VLOG(4) << "TTFusion end:\n" << fused.GetFuncBody();
   return fused.GetFuncBody();
 }
 
@@ -362,6 +364,9 @@ ir::Expr TRFusion(ir::Expr upper, ir::Expr down) {
   VLOG(4) << "connected tensor is:" << replaced_tensor;
   VLOG(4) << "store value is :" << downstream.GetStoreValue();
+  VLOG(4) << "upper :\n" << upper;
+  VLOG(4) << "down :\n" << down;
+
 
   ReduceOp fused(ir::ir_utils::IRCopy(downstream.GetFuncBody()));
   SequenceMutator(
       fused.GetEachTensorLoadExpr(replaced_tensor),
@@ -371,7 +376,7 @@ ir::Expr TRFusion(ir::Expr upper, ir::Expr down) {
             upstream, downstream_load_expr, downstream_body);
       });
 
-  VLOG(4) << "TRFusion end:" << fused.GetFuncBody();
+  VLOG(4) << "TRFusion end:\n" << fused.GetFuncBody();
   return fused.GetFuncBody();
 }
 
@@ -483,6 +488,8 @@ struct FusionGraph {
         exit_nodes_.emplace(cur_node);
       }
     }
+
+    VLOG(4) << "FusionGraph Created, fusion node size: " << all_fusion_nodes_.size();
   }
 
   ~FusionGraph(){
@@ -510,11 +517,10 @@ struct FusionGraph {
   void fuse_trivial_node(){
     FusionNode* upstream;
     while((upstream = find_trivial_node()) != nullptr){
-      while(!upstream->downstream.empty()){
-        const auto& pair_data = *(upstream->downstream.begin());
+      std::unordered_map fusion_candidate = upstream->downstream;
+      upstream->downstream.clear();
+      for (const auto& pair_data : fusion_candidate) {
         FusionNode* downstream = pair_data.first;
-        upstream->downstream.erase(downstream);
-
         CHECK(downstream->op_compute_body.size() == 1);
 
         FusionNode* new_node;
@@ -666,29 +672,35 @@ std::vector TrivialOpFusion(
     const std::vector<::pir::Operation*>& ops,
     const std::vector& op_compute_bodies) {
   trivial_fusion_detail::FusionGraph graph =
      trivial_fusion_detail::FusionGraph(ops, op_compute_bodies);
-  return graph.DoFusion();
+  auto output = graph.DoFusion();
+  VLOG(4) << "Fusion Result: output size is " << output.size();
+  for (const auto& expr : output){
+    VLOG(4) << expr;
+  }
+  return output;
 }
 
-std::vector TrivialOpFusion_(
-    const std::vector<::pir::Operation*>& ops,
-    const std::vector& op_compute_bodies) {
-  const auto& op_patterns = trivial_fusion_detail::GetOpPatternKindVector(ops);
-  trivial_fusion_detail::CheckFusionInputValid(op_compute_bodies, op_patterns);
-  const auto& before_fused_nodes =
-      trivial_fusion_detail::ConstructFusionNodeElementwisely(op_compute_bodies,
-                                                              op_patterns);
-
-  auto fused_nodes_each_step = before_fused_nodes;
-  while (const auto& fusable_upstream =
-             trivial_fusion_detail::FindUpstreamNodeUsedByOthers(
-                 fused_nodes_each_step)) {
-    fused_nodes_each_step = trivial_fusion_detail::FuseSingleUpstreamNode(
-        fusable_upstream.value(), fused_nodes_each_step);
-  }
-
-  return trivial_fusion_detail::ExtractBodiesFromFusionNodes(
-      fused_nodes_each_step);
-}
+// std::vector TrivialOpFusion_(
+//     const std::vector<::pir::Operation*>& ops,
+//     const std::vector& op_compute_bodies) {
+//   const auto& op_patterns = trivial_fusion_detail::GetOpPatternKindVector(ops);
+//   trivial_fusion_detail::CheckFusionInputValid(op_compute_bodies, op_patterns);
+//   const auto& before_fused_nodes =
+//       trivial_fusion_detail::ConstructFusionNodeElementwisely(op_compute_bodies,
+//                                                               op_patterns);
+
+//   auto fused_nodes_each_step = before_fused_nodes;
+//   while (const auto& fusable_upstream =
+//              trivial_fusion_detail::FindUpstreamNodeUsedByOthers(
+//                  fused_nodes_each_step)) {
+//     fused_nodes_each_step = trivial_fusion_detail::FuseSingleUpstreamNode(
+//         fusable_upstream.value(), fused_nodes_each_step);
+//   }
+
+//   return trivial_fusion_detail::ExtractBodiesFromFusionNodes(
+//       fused_nodes_each_step);
+// }
+
 } // namespace pir
 } // namespace framework
 } // namespace hlir
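
The `fuse_trivial_node` hunk above replaces in-place erasure during iteration (the old `while(!upstream->downstream.empty())` loop) with a snapshot of the downstream map that is cleared before fusing. Below is a minimal standalone C++ sketch of that snapshot-then-iterate pattern; the `Node` struct, value type, and function names are illustrative stand-ins, not actual CINN/Paddle APIs:

// Sketch only, assuming a simplified node type: iterate over a copy so the
// loop body may freely insert or erase edges on the live map. Erasing from
// the map while iterating it directly would invalidate the iterator.
#include <unordered_map>
#include <vector>

struct Node {
  std::unordered_map<Node*, int> downstream;  // edge -> illustrative tag
};

std::vector<Node*> CollectFusionCandidates(Node* upstream) {
  std::vector<Node*> fused;
  // Snapshot the edges, then clear the live map up front; the fusion step
  // below can now safely rewire upstream->downstream without touching the
  // container being iterated.
  auto fusion_candidate = upstream->downstream;
  upstream->downstream.clear();
  for (const auto& pair_data : fusion_candidate) {
    fused.push_back(pair_data.first);  // stand-in for the real fusion step
  }
  return fused;
}

The copy costs one pass over the edge list, but it makes the mutation in the loop body (creating fused nodes and re-linking edges, in the patch) safe regardless of what the body does to the original map.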