From 113cd81e09cad42680ce910d3131767e48b6f83a Mon Sep 17 00:00:00 2001
From: xiongkun <xiongkun03@baidu.com>
Date: Wed, 20 Sep 2023 19:00:29 +0800
Subject: [PATCH 001/115] [phi] support pir run in dy2static AST mode. (#57357)

* [NewIR] Support Ir run program node (#56791)

* support build model in python

* fix ci bugs

* fix ci bugs

* fix compile bugs

* fix ci bugs

* add infermeta for data

* fix ci bugs

* fix ci bugs

* fix ci bugs

* fix bugs when run ir program mutiple times

* perfect code

* frontend demo debugging

* support program split and go into run program node.

* simple run the dy2static test in newir_api mode.

* remove frame.proto changes

* merge

* fix ir-run-program-node

* fix some code

* fix output error

* fix some errors

* fix

* fix

* fix

* fix conflict

* fix files

* fix some errors

* merge and solve conflict

---------

Co-authored-by: YuanRisheng <yuanrisheng@baidu.com>

* new pr

* fix

* fix

* fix segment error

* fix

* add dependences

* fix

* fix link error.

* fix some cmake problem

* fix

* fix

* fix dependecy

* fix

* fix

* fix circle dependence

* fix

* fix

* fix rocm

* fix

* add python library

* fix cmake

* merge

* fix

* fix

* fix conflict

---------

Co-authored-by: YuanRisheng <yuanrisheng@baidu.com>
---
 .../eager/to_static/run_program_op_func.h     |  105 ++
 .../eager/to_static/run_program_op_node.h     |  600 ++++++++-
 paddle/fluid/framework/CMakeLists.txt         |   15 +-
 paddle/fluid/framework/ir/CMakeLists.txt      |    2 +-
 paddle/fluid/framework/ir/generate_pass.cc    |    8 +
 paddle/fluid/framework/op_desc.cc             |    9 +-
 paddle/fluid/framework/type_defs.h            |    6 +-
 paddle/fluid/prim/utils/static/CMakeLists.txt |    2 +-
 paddle/fluid/pybind/CMakeLists.txt            |    6 +-
 .../pybind/eager_legacy_custom_python_api.h   |   49 +-
 paddle/fluid/pybind/ir.cc                     |  335 +++++
 paddle/fluid/pybind/op_function_common.cc     |  123 ++
 paddle/fluid/pybind/op_function_common.h      |    7 +
 paddle/pir/core/op_result.h                   |    2 +-
 python/paddle/base/framework.py               |    4 +
 python/paddle/framework/__init__.py           |    2 +-
 python/paddle/jit/dy2static/function_spec.py  |   30 +
 .../jit/dy2static/newir_partial_program.py    | 1137 +++++++++++++++++
 .../jit/dy2static/program_translator.py       |  143 ++-
 python/paddle/tensor/math.py                  |    2 +
 test/ir/new_ir/CMakeLists.txt                 |    5 +-
 test/ir/new_ir/test_new_ir_to_static.py       |  100 ++
 22 files changed, 2667 insertions(+), 25 deletions(-)
 create mode 100644 python/paddle/jit/dy2static/newir_partial_program.py
 create mode 100644 test/ir/new_ir/test_new_ir_to_static.py

diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h
index 289073095cc4f..f0ca7c1518b24 100644
--- a/paddle/fluid/eager/to_static/run_program_op_func.h
+++ b/paddle/fluid/eager/to_static/run_program_op_func.h
@@ -118,6 +118,7 @@ inline void run_program_ad_func(
     std::vector<paddle::Tensor*>& dout,                  // NOLINT
     const paddle::framework::AttributeMap& attrs) {
   // Prepare Autograd Meta
+  VLOG(2) << "start run run_program ad function.";
   auto deref_out = details::DereferenceTensors(out);
   std::vector<egr::AutogradMeta*> p_autograd_x =
       egr::EagerUtils::nullable_autograd_meta(x);
@@ -197,3 +198,107 @@ inline void run_program_ad_func(
     egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node);
   }
 }
+
+inline void newir_run_program_ad_func(
+    const std::vector<paddle::Tensor>& x,
+    const std::vector<paddle::Tensor>& params,
+    std::vector<paddle::Tensor*>& out,                   // NOLINT
+    std::vector<paddle::framework::Scope*>& step_scope,  // NOLINT
+    std::vector<paddle::Tensor*>& dout,                  // NOLINT
+    const paddle::framework::AttributeMap& attrs) {
+  // Prepare Autograd Meta
+  VLOG(2) << "start run newir run_program ad function.";
+  auto deref_out = details::DereferenceTensors(out);
+  std::vector<egr::AutogradMeta*> p_autograd_x =
+      egr::EagerUtils::nullable_autograd_meta(x);
+  std::vector<egr::AutogradMeta*> p_autograd_params =
+      egr::EagerUtils::nullable_autograd_meta(params);
+  std::vector<egr::AutogradMeta*> p_autograd_outs =
+      egr::EagerUtils::nullable_autograd_meta(deref_out);
+
+  bool trace_backward = egr::Controller::Instance().HasGrad();
+  bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(
+      trace_backward, &p_autograd_x, &p_autograd_params);
+
+  // Create Middle Output for GradNode.
+  auto middle_size =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fm")).size();
+  auto output_size =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fo")).size();
+  auto middles = std::vector<paddle::Tensor*>();
+  std::shared_ptr<NewIRGradNodeRunProgram> grad_node;
+  VLOG(2) << "start run run_program with require_any_grad = "
+          << require_any_grad;
+
+  if (require_any_grad) {
+    // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad])
+    grad_node = std::make_shared<NewIRGradNodeRunProgram>(1, 2);
+    grad_node->GetMiddle().resize(middle_size);
+    grad_node->GetOutputs().resize(output_size);
+    for (size_t i = 0; i < middle_size; ++i) {
+      grad_node->GetMiddle()[i] =
+          paddle::Tensor(std::make_shared<phi::DenseTensor>());
+      middles.push_back(&grad_node->GetMiddle()[i]);
+    }
+    for (size_t i = 0; i < output_size; ++i) {
+      grad_node->GetOutputs()[i] = *out[i];
+    }
+  }
+
+  // Call forward function
+  // if require_any_grad is False, don't save any middle vars.
+  NewIRRunProgramAPI(
+      x, params, out, middles, step_scope, dout, require_any_grad, attrs);
+  if (require_any_grad) {
+    // auto x_names =
+    // PADDLE_GET_CONST(std::vector<std::string>, attrs.at("x_names"));
+
+    egr::EagerUtils::PassStopGradient(false, &p_autograd_outs);
+
+    // Set Attributes
+    grad_node->SetAttrMap(attrs);
+
+    // auto* forward_global_block = PADDLE_GET_CONST(
+    // paddle::framework::BlockDesc*, attrs.at("forward_global_block"));
+    // auto* backward_global_block = PADDLE_GET_CONST(
+    // paddle::framework::BlockDesc*, attrs.at("backward_global_block"));
+    // Clear unused x vars
+    // auto filter_x =
+    // filter_unused_input_var_in_backward(x, x_names, backward_global_block);
+    // Set TensorWrappers
+    grad_node->SetFwdX(x);
+    // Clear unused out vars
+    // clear_unused_out_var_in_backward(out, backward_global_block,
+    // step_scope[0]);
+
+    grad_node->SetFwdParams(params);
+    grad_node->SetStepScope(step_scope);  // just for set useable.
+
+    // Set Grad out rank as same as fwd input and set stop gradient to bwd
+    // NOTE(@xiongkun): Not every tensor in x(list of tensor) is required
+    // gradient. for example: x[1] is not used for output, the x[1] is ignored.
+
+    // TODO(@xiongkun): rewrite by new ir representation.
+    std::vector<const paddle::Tensor*> x_require_grad;
+    for (size_t i = 0; i < x.size(); ++i) {
+      x_require_grad.push_back(&x[i]);
+    }
+
+    grad_node->SetGradOutMeta(x_require_grad, /*slot id*/ 0);
+    grad_node->SetGradOutMeta(params, /*slot id*/ 1);
+
+    // VLOG(2) << "clear_no_grad_edges.";
+    // clear_no_grad_edges_with_partial_block(params,
+    // forward_global_block,
+    // backward_global_block,
+    // grad_node.get(),
+    // [>slot id<] 1);
+
+    grad_node->SetGradInMeta(deref_out, 0);
+
+    egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0);
+
+    // Set History for output set current Grad Node for
+    egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node);
+  }
+}
diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h
index 2a4a24cea1272..ebab84ccd1521 100644
--- a/paddle/fluid/eager/to_static/run_program_op_node.h
+++ b/paddle/fluid/eager/to_static/run_program_op_node.h
@@ -25,6 +25,9 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/phi/api/lib/data_transform.h"
+#include "paddle/pir/core/attribute.h"
+#include "paddle/pir/core/block.h"
+#include "paddle/pir/core/builtin_attribute.h"
 #include "paddle/pir/core/program.h"
 #include "paddle/pir/core/value.h"
 
@@ -195,6 +198,33 @@ static void ShareTensorsIntoScopeWithName(
   }
 }
 
+static auto GetNameFromValue(const ::pir::Block *block,
+                             const std::vector<::pir::Value> &values) {
+  // we use name here, later value is used directly.
+  std::unordered_map<::pir::Value, std::string> value2name;
+  for (auto *op : *block) {
+    std::string name;
+    if (op->name() == "pd_op.data") {
+      name =
+          op->attributes().at("name").dyn_cast<pir::StrAttribute>().AsString();
+      value2name[op->results()[0].Value::impl()] = name;
+    } else if (op->name() == "builtin.set_parameter") {
+      name = op->attributes()
+                 .at("parameter_name")
+                 .dyn_cast<pir::StrAttribute>()
+                 .AsString();
+      value2name[op->operand(0).source()] = name;
+    }
+  }
+  std::vector<std::string> names;
+  std::transform(
+      values.begin(),
+      values.end(),
+      std::back_inserter(names),
+      [&value2name](const ::pir::Value &v) { return value2name[v]; });
+  return names;
+}
+
 static void ShareTensorsFromScope(
     const std::vector<Tensor *> &tensors,
     const paddle::framework::BlockDesc &global_block,
@@ -236,6 +266,52 @@ static void ShareTensorsFromScope(
   }
 }
 
+static void ShareTensorsIntoScopeByValue(
+    const ::pir::Block *block,
+    const std::vector<Tensor> &tensors,
+    const std::vector<::pir::Value> &values,
+    paddle::framework::Scope *scope) {
+  auto names = GetNameFromValue(block, values);
+  ShareTensorsIntoScopeWithName(tensors, names, scope);
+}
+
+static void ShareTensorsFromScopeByValue(
+    const ::pir::Block *block,
+    const std::vector<Tensor *> &tensors,
+    const std::vector<::pir::Value> &values,
+    paddle::framework::Scope *scope) {
+  auto names = GetNameFromValue(block, values);
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    auto &name = names[i];
+    auto &value = values[i];
+    if (value.impl() == nullptr) {
+      // skip stop_gradient.
+      continue;
+    }
+    auto *var = scope->FindVar(name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        paddle::platform::errors::NotFound("The output tensor %s is not in "
+                                           "RunProgram(Grad)Op'"
+                                           "s internal scope.",
+                                           name));
+    CheckOutputVarStatus(*var, *tensors[i]);
+    // share tensor
+    if (var->IsType<phi::DenseTensor>()) {
+      auto &src_tensor = var->Get<phi::DenseTensor>();
+      auto *dst_tensor = const_cast<phi::DenseTensor *>(
+          dynamic_cast<const phi::DenseTensor *>(tensors[i]->impl().get()));
+      VLOG(2) << "share " << name << " from scope";
+      *dst_tensor = src_tensor;
+    } else if (var->IsType<phi::SelectedRows>()) {
+      auto &src_tensor = var->Get<phi::SelectedRows>();
+      auto *dst_tensor = const_cast<phi::SelectedRows *>(
+          dynamic_cast<const phi::SelectedRows *>(tensors[i]->impl().get()));
+      *dst_tensor = src_tensor;
+    }
+  }
+}
+
 static void ShareTensorsFromScopeWithPartialBlock(
     const std::vector<Tensor *> &tensors,
     const paddle::framework::BlockDesc &forward_global_block,
@@ -329,8 +405,194 @@ static void GcScope(paddle::framework::Scope *scope) {
   delete garbages;  // free mem
 }
 
+template <class T>
+void print_collection(const T &t) {
+  VLOG(5) << "Print collection start :";
+  for (auto s : t) {
+    VLOG(5) << s;
+  }
+  VLOG(5) << "Print collection end.";
+}
+
 }  // namespace details
 
+inline void NewIRRunProgramAPI(
+    const std::vector<paddle::Tensor> &x,
+    const std::vector<paddle::Tensor> &params,
+    std::vector<paddle::Tensor *> &out,                   // NOLINT
+    std::vector<paddle::Tensor *> &middles,               // NOLINT
+    std::vector<paddle::framework::Scope *> &step_scope,  // NOLINT
+    std::vector<paddle::Tensor *> &dout,                  // NOLINT
+    bool require_any_grad,
+    const paddle::framework::AttributeMap &attrs) {
+  VLOG(2) << "RunProgramOpKernel Compute";
+  // In the original run_program OP, the default value of the is_test
+  // attribute is false, we should check if there is is_test parameter
+  // in attrs
+  auto is_test = false;
+  if (attrs.count("is_test")) {
+    is_test = PADDLE_GET_CONST(bool, attrs.at("is_test"));
+  }
+  int64_t program_id = PADDLE_GET_CONST(int64_t, attrs.at("program_id"));
+  auto place = egr::Controller::Instance().GetExpectedPlace();
+
+  // NOTE(chenweihang): In order not to add new variable type, use vector
+  // here. Originally, here can use scope directly.
+  auto *out_scope_vec = &step_scope;
+  PADDLE_ENFORCE_EQ(
+      out_scope_vec->size(),
+      1,
+      paddle::platform::errors::InvalidArgument(
+          "The OutScope of RunProgramGradOp should only hold one scope."));
+
+  VLOG(2) << "RunProgramOp use interpretercore to execute program.";
+
+  paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+
+  VLOG(4) << "global_inner_scope:" << global_inner_scope;
+
+  auto input_values =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fx"));
+  auto output_values =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fo"));
+  auto middle_values =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fm"));
+  auto param_values =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fp"));
+  // auto dout_names =
+  // PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fp"));
+
+  auto *forward_global_block =
+      PADDLE_GET_CONST(::pir::Block *, attrs.at("forward_global_block"));
+  auto *backward_global_block =
+      PADDLE_GET_CONST(::pir::Block *, attrs.at("backward_global_block"));
+
+  auto *forward_program =
+      forward_global_block->GetParentOp()->GetParentProgram();
+  auto *backward_program =
+      backward_global_block->GetParentOp()->GetParentProgram();
+
+  if (VLOG_IS_ON(4)) {
+    std::ostringstream print_stream;
+    forward_program->Print(print_stream);
+    print_stream << "\n";
+    backward_program->Print(print_stream);
+    VLOG(4) << print_stream.str();
+  }
+
+  VLOG(10) << is_test << program_id;
+
+  auto &interpretercore_info_cache =
+      paddle::framework::InterpreterCoreInfoCache::Instance();
+  std::shared_ptr<paddle::framework::InterpreterCore> interpreter_core =
+      nullptr;
+  if (!interpretercore_info_cache.Has(
+          program_id, global_inner_scope, /*is_grad=*/false)) {
+    paddle::platform::RecordEvent record_event(
+        "create_new_interpretercore",
+        paddle::platform::TracerEventType::UserDefined,
+        1);
+    VLOG(2) << "No interpretercore cache, so create a new interpretercore "
+               "for program: "
+            << program_id;
+    // Step 1. share input_vars & parameters into scope
+    details::ShareTensorsIntoScopeByValue(
+        forward_global_block, x, input_values, global_inner_scope);
+    details::ShareTensorsIntoScopeByValue(
+        forward_global_block, params, param_values, global_inner_scope);
+    // Step 2. create new interpretercore
+    auto kernel_forward_program =
+        paddle::dialect::PdOpLowerToKernelPass(forward_program, place);
+    interpreter_core = paddle::framework::CreateNewIRInterpreterCoreInfoToCache(
+        std::move(kernel_forward_program),
+        place,
+        /*is_grad=*/false,
+        program_id,
+        global_inner_scope);
+    // Step 3. get all eager gc vars
+    // std::set<std::string> skip_eager_delete_vars =
+    // paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
+    // *backward_program);
+
+    // update interpretercore skip_gc_var
+    auto skip_names =
+        details::GetNameFromValue(forward_global_block, middle_values);
+    auto skip_names_set =
+        std::set<std::string>(skip_names.begin(), skip_names.end());
+    skip_names = details::GetNameFromValue(forward_global_block, output_values);
+    skip_names_set.insert(skip_names.begin(), skip_names.end());
+    details::print_collection(skip_names_set);
+    interpreter_core->SetSkipGcVars(skip_names_set);
+
+    // std::set<std::string> input_vars;
+    // input_vars.insert(input_names.begin(), input_names.end());
+    // interpreter_core->SetJitInputVars(input_vars);
+
+    // interpretercore_info_cache.UpdateSkipEagerDeleteVars(
+    // program_id, global_inner_scope, false, skip_eager_delete_vars);
+  } else {
+    paddle::platform::RecordEvent record_event(
+        "get_interpretercore_cahce",
+        paddle::platform::TracerEventType::UserDefined,
+        1);
+    VLOG(2) << "Get interpretercore cache by program:" << program_id;
+    // Step 1. get cache interpretercore
+    auto &cached_value = interpretercore_info_cache.GetMutable(
+        program_id, global_inner_scope, /*is_grad=*/false);
+    interpreter_core = cached_value.core_;
+    // Step 2. update scope for cache interpretercore
+    details::ShareTensorsIntoScopeByValue(
+        forward_global_block, x, input_values, global_inner_scope);
+    details::ShareTensorsIntoScopeByValue(
+        forward_global_block, params, param_values, global_inner_scope);
+    // TODO(xiongkun): new ir how to build scope.
+    // if (interpreter_core->GetVariableScope()->GetMutableScope() !=
+    // global_inner_scope) {
+    // details::BuildScopeByBlock(
+    // *interpreter_core.get(), *forward_global_block, global_inner_scope);
+    // interpreter_core->reset_scope(global_inner_scope);
+    //}
+  }
+
+  // interpretercore run
+  if (!forward_global_block->empty()) {
+    paddle::platform::RecordEvent record_event(
+        "interpreter_core_run",
+        paddle::platform::TracerEventType::UserDefined,
+        1);
+    interpreter_core->Run({});
+  }
+
+  {
+    paddle::platform::RecordEvent record_event(
+        "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1);
+    // Get Output, and Middle Outputs
+    details::ShareTensorsFromScopeByValue(
+        forward_global_block, out, output_values, global_inner_scope);
+    details::ShareTensorsFromScopeByValue(
+        forward_global_block, middles, middle_values, global_inner_scope);
+
+    VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
+
+    if (is_test || !require_any_grad) {
+      VLOG(4) << "don't require any grad, set this scope can reused";
+      VLOG(4) << "is_test: " << is_test
+              << ", require_any_grad: " << require_any_grad;
+      global_inner_scope->SetCanReused(true);
+      details::GcScope(global_inner_scope);
+    } else {
+      VLOG(4) << "not test, set this scope can not reused";
+      global_inner_scope->SetCanReused(false);
+      details::GcScope(global_inner_scope);  // we can gc all the time, because
+                                             // we save the middles.
+    }
+  }
+
+#ifdef PADDLE_WITH_DNNL
+  if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place);
+#endif
+}
+
 inline void RunProgramAPI(
     const std::vector<paddle::Tensor> &x,
     const std::vector<paddle::Tensor> &params,
@@ -689,12 +951,164 @@ inline void RunProgramGradAPI(
   }
 }
 
+inline void NewIRRunProgramGradAPI(
+    const std::vector<paddle::Tensor> &x,
+    const std::vector<paddle::Tensor> &params,
+    const std::vector<paddle::Tensor> &out_grad,
+    const std::vector<paddle::Tensor> &middles,
+    const std::vector<paddle::Tensor> &out,
+    const std::vector<paddle::framework::Scope *> &step_scope,  // NOLINT
+    const paddle::framework::AttributeMap &attrs,
+    std::vector<paddle::Tensor *> &x_grad,      // NOLINT
+    std::vector<paddle::Tensor *> &params_grad  // NOLINT
+) {
+  // if all output vars are set to stop_gradient, grad op no need to executed
+  if (x_grad.empty() && params_grad.empty()) return;
+  auto *out_scope_vec = &step_scope;
+  PADDLE_ENFORCE_EQ(
+      out_scope_vec->size(),
+      1,
+      paddle::platform::errors::InvalidArgument(
+          "The OutScope of RunProgramGradOp should only hold one scope."));
+  paddle::framework::Scope *global_inner_scope = out_scope_vec->front();
+
+  int64_t program_id = PADDLE_GET_CONST(int64_t, attrs.at("program_id"));
+
+  auto place = egr::Controller::Instance().GetExpectedPlace();
+  VLOG(2) << "RunProgramGradOp use interpretercore to execute program.";
+
+  VLOG(4) << "global_inner_scope:" << global_inner_scope;
+
+  auto *backward_global_block =
+      PADDLE_GET_CONST(::pir::Block *, attrs.at("backward_global_block"));
+  auto *backward_program =
+      backward_global_block->GetParentOp()->GetParentProgram();
+
+  auto output_grad_values =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bo_g"));
+  auto forward_input_values =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bx"));
+  auto forward_middle_values =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bm"));
+  auto forward_output_values =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bo"));
+  auto x_grad_values =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bx_g"));
+  auto p_grad_values =
+      PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bp_g"));
+
+  auto &interpretercore_info_cache =
+      paddle::framework::InterpreterCoreInfoCache::Instance();
+  std::shared_ptr<paddle::framework::InterpreterCore> interpreter_core =
+      nullptr;
+  if (!interpretercore_info_cache.Has(
+          program_id, global_inner_scope, /*is_grad=*/true)) {
+    paddle::platform::RecordEvent record_event(
+        "create_new_interpretercore",
+        paddle::platform::TracerEventType::UserDefined,
+        1);
+    VLOG(2) << "No interpretercore cahce, so create a new interpretercore";
+    // Step 1. share input_vars & parameters into scope
+    // x, param, middles, output_grads
+    details::ShareTensorsIntoScopeByValue(backward_global_block,
+                                          out_grad,
+                                          output_grad_values,
+                                          global_inner_scope);
+    details::ShareTensorsIntoScopeByValue(
+        backward_global_block, x, forward_input_values, global_inner_scope);
+    details::ShareTensorsIntoScopeByValue(backward_global_block,
+                                          middles,
+                                          forward_middle_values,
+                                          global_inner_scope);
+    details::ShareTensorsIntoScopeByValue(
+        backward_global_block, out, forward_output_values, global_inner_scope);
+    auto kernel_backward_program =
+        paddle::dialect::PdOpLowerToKernelPass(backward_program, place);
+    interpreter_core = paddle::framework::CreateNewIRInterpreterCoreInfoToCache(
+        std::move(kernel_backward_program),
+        place,
+        /*is_grad=*/true,
+        program_id,
+        global_inner_scope);
+    // share threadpool
+    // NOTE(zhiqiu): this only works interpreter_core is executed strictly
+    // after the related fwd_interpreter_core.
+    if (interpretercore_info_cache.Has(program_id, global_inner_scope, false)) {
+      auto fwd_interpreter_core =
+          interpretercore_info_cache
+              .GetMutable(program_id, global_inner_scope, /*is_grad=*/false)
+              .core_;
+      interpreter_core->ShareWorkQueueFrom(fwd_interpreter_core);
+      VLOG(4) << "Share workqueue from " << fwd_interpreter_core.get() << " to "
+              << interpreter_core.get();
+    }
+
+    // get all eager gc vars
+    std::set<std::string> skip_eager_delete_vars;
+    auto skip_names =
+        details::GetNameFromValue(backward_global_block, x_grad_values);
+    skip_eager_delete_vars.insert(skip_names.begin(), skip_names.end());
+    skip_names =
+        details::GetNameFromValue(backward_global_block, p_grad_values);
+    skip_eager_delete_vars.insert(skip_names.begin(), skip_names.end());
+    interpreter_core->SetSkipGcVars(skip_eager_delete_vars);
+    interpretercore_info_cache.UpdateSkipEagerDeleteVars(
+        program_id,
+        global_inner_scope,
+        /*is_grad=*/true,
+        skip_eager_delete_vars);
+    VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size();
+    details::print_collection(skip_eager_delete_vars);
+  } else {
+    paddle::platform::RecordEvent record_event(
+        "get_interpretercore_cahce",
+        paddle::platform::TracerEventType::UserDefined,
+        1);
+    VLOG(2) << "Get interpretercore cahce by program:" << program_id;
+    auto &cached_value = interpretercore_info_cache.GetMutable(
+        program_id, global_inner_scope, /*is_grad=*/true);
+    interpreter_core = cached_value.core_;
+
+    // update scope (TODO: why share again)
+    // details::ShareTensorsIntoScope(out_grad, global_inner_scope);
+    // if (interpreter_core->GetVariableScope()->GetMutableScope() !=
+    // global_inner_scope) {
+    // details::BuildScopeByBlock(
+    // *interpreter_core.get(), *backward_global_block, global_inner_scope);
+    // interpreter_core->reset_scope(global_inner_scope);
+    //}
+  }
+
+  if (!backward_global_block->empty()) {
+    paddle::platform::RecordEvent record_event(
+        "interpreter_core_run",
+        paddle::platform::TracerEventType::UserDefined,
+        1);
+    // Debug info: scope info when run end
+    VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
+    interpreter_core->Run({});
+  }
+
+  {
+    paddle::platform::RecordEvent record_event(
+        "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1);
+    // Step 4. get outputs
+    details::ShareTensorsFromScopeByValue(
+        backward_global_block, x_grad, x_grad_values, global_inner_scope);
+    details::ShareTensorsFromScopeByValue(
+        backward_global_block, params_grad, p_grad_values, global_inner_scope);
+    VLOG(4) << "after backward gc all vars";
+    global_inner_scope->SetCanReused(true);
+    details::GcScope(global_inner_scope);
+  }
+}
+
 class GradNodeRunProgram : public egr::GradNodeBase {
  public:
   GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
       : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}
 
-  ~GradNodeRunProgram() {
+  ~GradNodeRunProgram() override {
     if (!executed_) {
       auto *out_scope_vec = &step_scope_;
       VLOG(4) << "~GradNodeRunProgram";
@@ -857,3 +1271,187 @@ class GradNodeRunProgram : public egr::GradNodeBase {
 
   bool executed_{false};
 };
+
+class NewIRGradNodeRunProgram : public egr::GradNodeBase {
+ public:
+  NewIRGradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
+      : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}
+
+  ~NewIRGradNodeRunProgram() override {
+    if (!executed_) {
+      auto *out_scope_vec = &step_scope_;
+      VLOG(4) << "~GradNodeRunProgram";
+      // Normally out_scope_vec.size() == 1. for safty, we add for-loop here.
+      for (size_t i = 0; i < out_scope_vec->size(); ++i) {
+        paddle::framework::Scope *global_inner_scope = out_scope_vec->at(i);
+        global_inner_scope->SetCanReused(true);
+        details::GcScope(global_inner_scope);
+        VLOG(4) << "global_inner_scope SetCanReused";
+      }
+      middles_.clear();
+      outputs_.clear();
+    }
+  }
+  // Functor: perform backward computations
+  virtual paddle::small_vector<std::vector<paddle::Tensor>,
+                               egr::kSlotSmallVectorSize>
+  operator()(paddle::small_vector<std::vector<paddle::Tensor>,
+                                  egr::kSlotSmallVectorSize> &grads,  // NOLINT
+             bool create_graph UNUSED,
+             bool is_new_grad UNUSED) override {
+    VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
+    paddle::small_vector<std::vector<paddle::Tensor>, egr::kSlotSmallVectorSize>
+        hooked_grads = NewIRGradNodeRunProgram::ApplyGradientHooks(grads);
+    PADDLE_ENFORCE_EQ(hooked_grads.size(),
+                      1,
+                      paddle::platform::errors::InvalidArgument(
+                          "The hooked_grads.size() of RunProgramGradOp should "
+                          "be equal to 1."));
+
+    std::vector<paddle::Tensor> x_grad;
+    std::vector<paddle::Tensor> params_grad;
+    std::vector<paddle::Tensor *> x_grad_ptr;
+    std::vector<paddle::Tensor *> params_grad_ptr;
+    {
+      paddle::platform::RecordEvent record_event(
+          "construct_grad_tensor",
+          paddle::platform::TracerEventType::UserDefined,
+          1);
+
+      egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&hooked_grads[0],
+                                                         this->InputMeta()[0]);
+      VLOG(3) << "hooked_grads[0].size() : " << hooked_grads[0].size();
+      ConstructXGradTensors(x_, &x_grad);
+      ConstructParamGradTensors(params_, &params_grad);
+      for (auto &i : x_grad) {
+        x_grad_ptr.emplace_back(&i);
+      }
+      for (auto &i : params_grad) {
+        if (i.defined()) {
+          params_grad_ptr.emplace_back(&i);
+        }
+      }
+    }
+
+    auto out_grad_values =
+        PADDLE_GET_CONST(std::vector<::pir::Value>, attrs_.at("bo_g"));
+    PADDLE_ENFORCE_EQ(hooked_grads[0].size(),
+                      out_grad_values.size(),
+                      paddle::platform::errors::InvalidArgument(
+                          "The hooked_grads[0].size() and "
+                          "out_grad_values.size() should be equal."));
+
+    VLOG(1) << "Run Program Grad API start.";
+    NewIRRunProgramGradAPI(x_,
+                           params_,
+                           hooked_grads[0],
+                           middles_,
+                           outputs_,
+                           step_scope_,
+                           attrs_,
+                           x_grad_ptr,
+                           params_grad_ptr);
+    VLOG(1) << "Run Program Grad API end.";
+    VLOG(3) << "End Eager Backward Node: GradNodeRunProgram";
+
+    executed_ = true;
+    return {x_grad, params_grad};
+  }
+
+  void ClearTensorWrappers() override {
+    x_.clear();
+    params_.clear();
+    middles_.clear();
+    outputs_.clear();
+    SetIsTensorWrappersCleared(true);
+  }
+
+  // SetAttrMap
+  void SetAttrMap(const paddle::framework::AttributeMap &attrs) {
+    attrs_ = attrs;
+  }
+
+  void SetFwdX(const std::vector<paddle::Tensor> &tensors) { x_ = tensors; }
+
+  std::vector<paddle::Tensor> &GetMiddle() { return middles_; }
+
+  std::vector<paddle::Tensor> &GetOutputs() { return outputs_; }
+
+  void SetFwdParams(const std::vector<paddle::Tensor> &tensors) {
+    params_ = tensors;
+  }
+
+  void SetStepScope(const std::vector<paddle::framework::Scope *> &scopes) {
+    step_scope_ = scopes;
+  }
+
+ protected:
+  void ConstructXGradTensors(const std::vector<paddle::Tensor> &x,
+                             std::vector<paddle::Tensor> *x_grad) {
+    auto x_grad_values =
+        PADDLE_GET_CONST(std::vector<::pir::Value>, attrs_.at("bx_g"));
+    PADDLE_ENFORCE_EQ(
+        x.size(),
+        x_grad_values.size(),
+        paddle::platform::errors::InvalidArgument(
+            "The x.size() and x_grad_names.size() should be equal. "
+            "But received x.size() = %d, x_grad_names.size() = %d",
+            x.size(),
+            x_grad_values.size()));
+
+    // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor,
+    // such as: name, tensor type(DenseTensor or SelectedRows).
+    for (size_t i = 0; i < x.size(); i++) {
+      if (x[i].is_dense_tensor()) {
+        x_grad->emplace_back(std::make_shared<phi::DenseTensor>());
+      } else if (x[i].is_selected_rows()) {
+        x_grad->emplace_back(std::make_shared<phi::SelectedRows>());
+      }
+    }
+  }
+
+  void ConstructParamGradTensors(const std::vector<paddle::Tensor> &params,
+                                 std::vector<paddle::Tensor> *param_grads) {
+    auto p_grad_values =
+        PADDLE_GET_CONST(std::vector<::pir::Value>, attrs_.at("bp_g"));
+    PADDLE_ENFORCE_EQ(params.size(),
+                      p_grad_values.size(),
+                      paddle::platform::errors::InvalidArgument(
+                          "The param.size() and "
+                          "param_grad_names.size() should be equal."));
+
+    for (size_t i = 0; i < params.size(); ++i) {
+      auto &p = params[i];
+      auto &p_grad = egr::EagerUtils::unsafe_autograd_meta(p)->Grad();
+      // In eager mode, the number of param_grad should be the same as
+      // param, so here an empty Tensor is added for the param with
+      // stop_gradient=True
+      if (!p_grad.defined()) {
+        param_grads->emplace_back();
+      } else if (p_grad.is_dense_tensor()) {
+        param_grads->emplace_back(std::make_shared<phi::DenseTensor>());
+      } else if (p_grad.is_selected_rows()) {
+        param_grads->emplace_back(std::make_shared<phi::SelectedRows>());
+      }
+    }
+  }
+
+  std::shared_ptr<GradNodeBase> Copy() const override {
+    auto copied_node = std::shared_ptr<NewIRGradNodeRunProgram>(
+        new NewIRGradNodeRunProgram(*this));
+    return copied_node;
+  }
+
+ private:
+  // TensorWrappers
+  std::vector<paddle::Tensor> x_;
+  std::vector<paddle::Tensor> params_;
+  std::vector<paddle::Tensor> middles_;
+  std::vector<paddle::Tensor> outputs_;
+  std::vector<paddle::framework::Scope *> step_scope_;
+
+  // Attribute Map
+  paddle::framework::AttributeMap attrs_;
+
+  bool executed_{false};
+};
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index f72d4ad182ddd..6d518b348d7d9 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -518,9 +518,22 @@ cc_test(
   SRCS version_test.cc
   DEPS version)
 
+add_library(proto_desc_base OBJECT var_desc.cc op_desc.cc block_desc.cc
+                                   program_desc.cc)
+add_dependencies(
+  proto_desc_base
+  attribute
+  ops_extra_info
+  shape_inference
+  op_info
+  glog
+  version
+  xxhash
+  phi)
+
 cc_library(
   proto_desc
-  SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc program_converter.cc
+  SRCS $<TARGET_OBJECTS:proto_desc_base> program_converter.cc
   DEPS attribute
        ops_extra_info
        shape_inference
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index e67dfa5adf910..92d316fdea0a3 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -46,7 +46,7 @@ cc_library(
 cc_library(
   op_compat_sensible_pass
   SRCS op_compat_sensible_pass.cc
-  DEPS graph_pattern_detector op_def_api pass)
+  DEPS graph_pattern_detector op_def_api pass pir_core)
 cc_library(
   subgraph_detector
   SRCS subgraph_detector.cc
diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc
index 17d2bdda56cb9..e0ab584ee3225 100644
--- a/paddle/fluid/framework/ir/generate_pass.cc
+++ b/paddle/fluid/framework/ir/generate_pass.cc
@@ -15,6 +15,8 @@
 #include "paddle/fluid/framework/ir/generate_pass.h"
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/pir/core/block.h"
+#include "paddle/pir/core/value.h"
 #include "paddle/utils/blank.h"
 
 namespace paddle {
@@ -47,6 +49,12 @@ class element_visitor {
   int index_;
 };
 
+template <>
+Attribute element_visitor::operator()(
+    const std::vector<::pir::Value>& attr UNUSED) const {
+  PADDLE_THROW(platform::errors::Unimplemented("Unimplemented operand."));
+}
+
 class operation_visitor {
  public:
   explicit operation_visitor(const proto::PassDesc::OperationType& type)
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index ab74b2691b062..a2eef6417870a 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -25,6 +25,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/ops_extra_info.h"
 #include "paddle/phi/common/complex.h"
+#include "paddle/pir/core/block.h"
+#include "paddle/pir/core/value.h"
 #include "paddle/utils/blank.h"
 
 namespace paddle {
@@ -964,7 +966,12 @@ struct SetAttrDescVisitor {
   void operator()(const std::vector<bool> &v) const {
     VectorToRepeated(v, attr_->mutable_bools());
   }
-
+  void operator()(const std::vector<pir::Value> &v) const {
+    // just do nothing.
+  }
+  void operator()(const std::vector<pir::Block *> &v) const {
+    // just do nothing.
+  }
   void operator()(const std::vector<VarDesc *> &v) const {
     std::vector<std::string> var_names;
     for (auto var : v) {
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 961b7c1e663c0..4ad1bcb80c4bc 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -25,6 +25,8 @@ limitations under the License. */
 #include "paddle/fluid/imperative/type_defs.h"
 
 #include "paddle/phi/common/scalar.h"
+#include "paddle/pir/core/block.h"
+#include "paddle/pir/core/value.h"
 #include "paddle/utils/blank.h"
 #include "paddle/utils/small_vector.h"
 #include "paddle/utils/variant.h"
@@ -62,7 +64,9 @@ using Attribute = paddle::variant<paddle::blank,
                                   std::vector<VarDesc*>,
                                   double,
                                   paddle::experimental::Scalar,
-                                  std::vector<paddle::experimental::Scalar>>;
+                                  std::vector<paddle::experimental::Scalar>,
+                                  ::pir::Block*,
+                                  std::vector<::pir::Value>>;
 using AttributeMap = std::unordered_map<std::string, Attribute>;
 
 using OpCreator =
diff --git a/paddle/fluid/prim/utils/static/CMakeLists.txt b/paddle/fluid/prim/utils/static/CMakeLists.txt
index 483c3eabc05d1..87b508dc699c0 100644
--- a/paddle/fluid/prim/utils/static/CMakeLists.txt
+++ b/paddle/fluid/prim/utils/static/CMakeLists.txt
@@ -1,7 +1,7 @@
 cc_library(
   static_global_utils
   SRCS static_global_utils.cc
-  DEPS proto_desc)
+  DEPS proto_desc_base)
 
 cc_library(
   static_tensor_operants
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 6c0c0fb4f81f2..34ec9ca0523ee 100755
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -274,8 +274,10 @@ if(WITH_PYTHON)
 
   add_executable(eager_legacy_op_function_generator
                  eager_legacy_op_function_generator.cc)
-  target_link_libraries(eager_legacy_op_function_generator
-                        ${OP_FUNCTION_GENERETOR_DEPS})
+  set(GENERATOR_DEPS ${PYBIND_DEPS})
+  list(REMOVE_DUPLICATES GENERATOR_DEPS)
+  list(REMOVE_ITEM GENERATOR_DEPS python)
+  target_link_libraries(eager_legacy_op_function_generator ${GENERATOR_DEPS})
   if(NOT WIN32)
     add_executable(kernel_signature_generator kernel_signature_generator.cc)
     target_link_libraries(kernel_signature_generator
diff --git a/paddle/fluid/pybind/eager_legacy_custom_python_api.h b/paddle/fluid/pybind/eager_legacy_custom_python_api.h
index 1deb20fbf9b88..1c40ce4275c42 100644
--- a/paddle/fluid/pybind/eager_legacy_custom_python_api.h
+++ b/paddle/fluid/pybind/eager_legacy_custom_python_api.h
@@ -21,7 +21,7 @@
 namespace paddle {
 namespace pybind {
 
-static PyObject *eager_api_run_program(PyObject *self,
+static PyObject *eager_api_run_program(PyObject *self,  // TOREMOVE
                                        PyObject *args,
                                        PyObject *kwargs) {
   PyThreadState *tstate = nullptr;
@@ -61,11 +61,58 @@ static PyObject *eager_api_run_program(PyObject *self,
   }
 }
 
+static PyObject *newir_eager_api_run_program(PyObject *self,
+                                             PyObject *args,
+                                             PyObject *kwargs) {
+  PyThreadState *tstate = nullptr;
+  try {
+    auto X = GetTensorListFromArgs("run_program", "X", args, 0, true);
+    auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true);
+    auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, true);
+    auto OutScope =
+        GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false);
+    auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true);
+    framework::AttributeMap attrs;
+    // TODO(zengjinle): support CUDA Graph on eager mode
+    VLOG(1) << "Start NewIR ConstructAttrMapFromPyArgs";
+
+    ConstructAttrMapForRunProgram(
+        "run_program", args, 6, PyTuple_GET_SIZE(args), attrs);
+
+    VLOG(1) << "Finish NewIR ConstructAttrMapFromPyArgs";
+    tstate = PyEval_SaveThread();
+    newir_run_program_ad_func(X, Params, Out, OutScope, DOut, attrs);
+    PyEval_RestoreThread(tstate);
+    tstate = nullptr;
+    Py_RETURN_NONE;
+  } catch (paddle::platform::EnforceNotMet &exception) {
+    if (tstate) {
+      PyEval_RestoreThread(tstate);
+    }
+    std::ostringstream sout;
+    sout << exception.what();
+    sout << "  [operator < run_program > error]";
+    exception.set_error_str(sout.str());
+    ThrowExceptionToPython(std::current_exception());
+    return nullptr;
+  } catch (...) {
+    if (tstate) {
+      PyEval_RestoreThread(tstate);
+    }
+    ThrowExceptionToPython(std::current_exception());
+    return nullptr;
+  }
+}
+
 static PyMethodDef CustomEagerMethods[] = {
     {"run_program",
      (PyCFunction)(void (*)(void))eager_api_run_program,
      METH_VARARGS | METH_KEYWORDS,
      "C++ interface function for run_program in dygraph."},
+    {"newir_run_program",
+     (PyCFunction)(void (*)(void))newir_eager_api_run_program,
+     METH_VARARGS | METH_KEYWORDS,
+     "C++ interface function for run_program in dygraph."},
     {nullptr, nullptr, 0, nullptr}};
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 359e1c9a8e59e..db3faebb1985b 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -32,6 +32,7 @@
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_api.h"
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/fluid/pir/dialect/operator/utils/utils.h"
 #include "paddle/fluid/pir/transforms/inplace_pass.h"
 #include "paddle/phi/core/enforce.h"
@@ -142,6 +143,12 @@ void BindProgram(py::module *m) {
              self->Print(print_stream);
              return print_stream.str();
            })
+      .def("__repr__",
+           [](const std::shared_ptr<Program> &self) {
+             std::ostringstream print_stream;
+             self->Print(print_stream);
+             return print_stream.str();
+           })
       .def("parameters_num",
            [](const std::shared_ptr<Program> &self) {
              return self->parameters_num();
@@ -479,6 +486,14 @@ void BindOpResult(py::module *m) {
       .def("has_one_use", &Value::HasOneUse)
       .def("use_empty", &OpResult::use_empty)
       .def("type", &OpResult::type)
+      .def("is_dense_tensor_type",
+           [](OpResult &self) {
+             if (self.type().isa<DenseTensorType>()) {
+               return true;
+             } else {
+               return false;
+             }
+           })
       .def_property(
           "stop_gradient",
           [](OpResult &self) {
@@ -547,7 +562,327 @@ void BindType(py::module *m) {
       });
 }
 
+Operation *BuildOpFrom(
+    Operation *to_copy_op,
+    std::unordered_map<pir::Value, pir::Value> &value_map) {  // NOLINT
+  pir::OperationArgument to_create_argument(to_copy_op->info());
+  to_create_argument.attributes = to_copy_op->attributes();
+
+  auto origin_results = to_copy_op->results();
+  std::transform(origin_results.begin(),
+                 origin_results.end(),
+                 std::back_inserter(to_create_argument.output_types),
+                 [](const pir::OpResult &r) {
+                   // OpResult -> OpType
+                   return r.type();
+                 });
+
+  // transform by value_map dict.
+  auto origin_operands = to_copy_op->operands();
+  std::transform(origin_operands.begin(),
+                 origin_operands.end(),
+                 std::back_inserter(to_create_argument.inputs),
+                 [&value_map](const pir::OpOperand &operand) {
+                   // Operand -> OpResult
+                   return OpResult::dyn_cast_from(value_map[operand.source()]);
+                 });
+  auto *cloned_op = Operation::Create(std::move(to_create_argument));
+
+  // update the mapping of value_map. std::transform is a map(func, zip()).
+  std::vector<int> tmp;
+  std::transform(origin_results.begin(),
+                 origin_results.end(),
+                 cloned_op->results().begin(),
+                 std::back_inserter(tmp),  // NOLINT, just a placeholder.
+                 [&value_map](const OpResult &a, const OpResult &b) {  // NOLINT
+                   value_map[a.Value::impl()] = b.Value::impl();
+                   return 1;
+                 });
+  return cloned_op;
+}
+
+std::shared_ptr<Program> ProgramClone(const Program &program) {
+  // Limitation of this function:
+  // 1. don't support Parameters.
+  // 2. don't support Regions in operator.
+  pir::IrContext *ctx = pir::IrContext::Instance();
+  auto cloned_program = std::make_shared<Program>(ctx);
+  std::unordered_map<pir::Value, pir::Value> value_map;
+  for (auto &op : *program.block()) {
+    auto *cloned_op = BuildOpFrom(op, value_map);
+    cloned_program->block()->push_back(cloned_op);
+  }
+  return cloned_program;
+}
+
+std::list<Operation *>::const_iterator list_offset(const Block *block,
+                                                   int start_idx) {
+  auto it = block->begin();
+  while (start_idx--) ++it;
+  return it;
+}
+
+template <class F>
+void range_block_do(const Block *block, std::vector<int> range, F fn) {
+  for (auto it = list_offset(block, range[0]);
+       it != list_offset(block, range[1]);
+       ++it) {
+    fn(*it);
+  }
+}
+
+std::vector<pir::Value> AnalysisMiddleVariable(
+    const Program &program,
+    const std::vector<pir::Value> &forward_inputs,
+    const std::vector<int> &forward_range,
+    const std::vector<int> &backward_range) {
+  std::vector<pir::Value> middle_values;
+
+  std::unordered_set<pir::Value> backward_inputs;
+  std::unordered_set<pir::Value> x_or_param(forward_inputs.begin(),
+                                            forward_inputs.end());
+  range_block_do(
+      program.block(), backward_range, [&backward_inputs](Operation *op) {
+        for (auto &t : op->operands()) {
+          backward_inputs.insert(t.source());
+        }
+      });
+
+  range_block_do(
+      program.block(),
+      forward_range,
+      [&middle_values, &backward_inputs, &x_or_param](Operation *op) {
+        for (auto &t : op->results()) {
+          auto v = Value(t.Value::impl());
+          if (backward_inputs.count(v) && !x_or_param.count(v))
+            middle_values.push_back(v);
+        }
+      });
+  return middle_values;
+}
+
+void mapping_value(const std::vector<pir::Value> &origin,
+                   const std::unordered_map<pir::Value, pir::Value> &value_map,
+                   std::vector<pir::Value> &out) {  // NOLINT
+  std::transform(origin.begin(),
+                 origin.end(),
+                 std::back_inserter(out),
+                 [&value_map](const pir::Value &v) {
+                   if (v.impl() == nullptr) return Value(nullptr);
+                   return value_map.at(v);
+                 });
+}
+
+using SplitedProgram = std::vector<std::shared_ptr<Program>>;
+using SplitedAttribute = std::map<std::string, std::vector<pir::Value>>;
+using SplitedResult = std::pair<SplitedProgram, SplitedAttribute>;
+
+pir::OpResult FakeOpResult() {
+  // create a fake opresults to simplify `ForwardBackwardSplit`.
+  return pir::OpResult(nullptr);
+}
+
+SplitedResult ForwardBackwardSplit(
+    const Program &program,
+    const std::vector<pir::OpResult> &op_result_forward_inputs,
+    const std::vector<pir::OpResult> &op_result_forward_outputs,
+    const std::vector<pir::OpResult> &op_result_forward_inputs_grads,
+    const std::vector<pir::OpResult> &op_result_forward_outputs_grads,
+    const std::vector<int> &forward_range,
+    const std::vector<int> &backward_range) {
+  // transform opresult -> value
+  VLOG(1) << "Start Prepare data structures.";
+  std::vector<pir::Value> forward_inputs, forward_outputs, forward_inputs_grads,
+      forward_outputs_grads;
+
+  auto op_result_to_value = [](const pir::OpResult &r) {
+    if (r.impl() == nullptr) return Value(nullptr);
+    return Value(r.Value::impl());
+  };
+
+  std::transform(op_result_forward_inputs.begin(),
+                 op_result_forward_inputs.end(),
+                 std::back_inserter(forward_inputs),
+                 op_result_to_value);
+  std::transform(op_result_forward_outputs.begin(),
+                 op_result_forward_outputs.end(),
+                 std::back_inserter(forward_outputs),
+                 op_result_to_value);
+  std::transform(op_result_forward_inputs_grads.begin(),
+                 op_result_forward_inputs_grads.end(),
+                 std::back_inserter(forward_inputs_grads),
+                 op_result_to_value);
+  std::transform(op_result_forward_outputs_grads.begin(),
+                 op_result_forward_outputs_grads.end(),
+                 std::back_inserter(forward_outputs_grads),
+                 op_result_to_value);
+
+  std::vector<pir::Value> forward_in_out_values;
+  for (auto &v : std::vector<std::vector<pir::Value> *>(
+           {&forward_inputs, &forward_outputs})) {
+    forward_in_out_values.insert(
+        forward_in_out_values.end(), v->begin(), v->end());
+  }
+
+  std::vector<pir::Value> fx, fp, fm, fo, bx, bp, bm, bo_g, bx_g, bp_g, bo;
+  pir::IrContext *ctx = pir::IrContext::Instance();
+  auto forward_program = std::make_shared<Program>(ctx);
+  auto backward_program = std::make_shared<Program>(ctx);
+  auto middle_values = AnalysisMiddleVariable(
+      program, forward_in_out_values, forward_range, backward_range);
+  std::unordered_map<pir::Value, pir::Value> forward_value_map;
+  std::unordered_map<pir::Value, pir::Value> backward_value_map;
+  pir::Builder backward_builder = pir::Builder(ctx, backward_program->block());
+
+  // forward program construct.
+  VLOG(1) << "Before Forward Construct.";
+  range_block_do(program.block(),
+                 forward_range,
+                 [&forward_value_map, &forward_program](Operation *op) {
+                   auto *cloned_op = BuildOpFrom(op, forward_value_map);
+                   forward_program->block()->push_back(cloned_op);
+                 });
+  VLOG(1) << "After Forward Construct.";
+
+  // backward program construc.
+  // Step1. insert data op for inputs_values and middle_values
+  int counter = 0;
+  auto create_data_fn =
+      [&backward_builder, &backward_value_map, &counter](const pir::Value &v) {
+        if (v.impl() == nullptr) {
+          return;
+        }
+        auto value_type = v.type().dyn_cast<DenseTensorType>();
+        auto dtype = paddle::dialect::TransToPhiDataType(value_type.dtype());
+        auto shape = phi::vectorize(value_type.dims());
+        auto place = phi::Place();
+
+        paddle::dialect::DataOp op =
+            backward_builder.Build<paddle::dialect::DataOp>(
+                std::string("input_") + std::to_string(counter),
+                shape,
+                dtype,
+                place);
+        counter += 1;
+        backward_value_map[v] = op->results()[0].Value::impl();
+      };
+
+  auto create_output_fn_forward = [&ctx,
+                                   &forward_value_map,
+                                   &counter,
+                                   &forward_program](const pir::Value &v) {
+    if (v.impl() == nullptr) {
+      return;
+    }
+    auto op_info = ctx->GetRegisteredOpInfo(pir::SetParameterOp::name());
+    pir::AttributeMap attribute_map = {
+        {"parameter_name",
+         pir::StrAttribute::get(
+             ctx, std::string("output_") + std::to_string(counter))},
+    };
+    pir::Operation *operation =
+        pir::Operation::Create({OpResult::dyn_cast_from(forward_value_map[v])},
+                               attribute_map,
+                               {},
+                               op_info);
+    forward_program->block()->push_back(operation);
+    counter += 1;
+  };
+
+  auto create_output_fn_backward = [&ctx,
+                                    &backward_value_map,
+                                    &counter,
+                                    &backward_program](const pir::Value &v) {
+    if (v.impl() == nullptr) {
+      return;
+    }
+    auto op_info = ctx->GetRegisteredOpInfo(pir::SetParameterOp::name());
+    pir::AttributeMap attribute_map = {
+        {"parameter_name",
+         pir::StrAttribute::get(
+             ctx, std::string("output_") + std::to_string(counter))},
+    };
+    pir::Operation *operation = pir::Operation::Create(
+        {OpResult::dyn_cast_from(backward_value_map.at(v))},
+        attribute_map,
+        {},
+        op_info);
+    backward_program->block()->push_back(operation);
+    counter += 1;
+  };
+
+  counter = 0;
+  std::for_each(forward_outputs.begin(), forward_outputs.end(), create_data_fn);
+  std::for_each(forward_inputs.begin(), forward_inputs.end(), create_data_fn);
+  std::for_each(middle_values.begin(), middle_values.end(), create_data_fn);
+  std::for_each(forward_outputs_grads.begin(),
+                forward_outputs_grads.end(),
+                create_data_fn);
+  VLOG(1) << "After create pd.data for backward program.";
+
+  counter = 0;
+  std::for_each(
+      middle_values.begin(), middle_values.end(), create_output_fn_forward);
+  std::for_each(
+      forward_outputs.begin(), forward_outputs.end(), create_output_fn_forward);
+
+  VLOG(1) << "After call create_output_fn";
+  // Step2. copy backward ops .
+  range_block_do(program.block(),
+                 backward_range,
+                 [&backward_value_map, &backward_program](Operation *op) {
+                   auto *cloned_op = BuildOpFrom(op, backward_value_map);
+                   backward_program->block()->push_back(cloned_op);
+                 });
+  VLOG(1) << "After call backward copy";
+  counter = 0;
+  std::for_each(forward_inputs_grads.begin(),
+                forward_inputs_grads.end(),
+                create_output_fn_backward);
+  // TODO(xiongkun): add forward parameter grads.
+
+  VLOG(1) << "forward_value_map.size() is " << forward_value_map.size();
+  VLOG(1) << "backward_value_map.size() is " << backward_value_map.size();
+  std::ostringstream print_stream;
+  print_stream << "ForwardProgram is :\n";
+  forward_program->Print(print_stream);
+  print_stream << "BackwardProgram is:\n";
+  backward_program->Print(print_stream);
+  VLOG(1) << "Splited Program (fwd | bwd): \n" << print_stream.str();
+
+  // construct all attributes we needed.
+
+  mapping_value(middle_values, forward_value_map, fm);    // write 'fm'
+  mapping_value(middle_values, backward_value_map, bm);   // write 'bm'
+  mapping_value(forward_inputs, forward_value_map, fx);   // write 'fx'
+  mapping_value(forward_inputs, backward_value_map, bx);  // write 'bx'
+  mapping_value(forward_outputs, forward_value_map, fo);  // write 'fo'
+  mapping_value(
+      forward_inputs_grads, backward_value_map, bx_g);  // write 'fx_g'
+  mapping_value(
+      forward_outputs_grads, backward_value_map, bo_g);    // write 'bo_g'
+  mapping_value(forward_outputs, backward_value_map, bo);  // write 'bo'
+
+  std::map<std::string, std::vector<pir::Value>> attr = {{"fx", fx},
+                                                         {"fp", fp},
+                                                         {"fm", fm},
+                                                         {"fo", fo},
+                                                         {"bx", bx},
+                                                         {"bp", bp},
+                                                         {"bm", bm},
+                                                         {"bo_g", bo_g},
+                                                         {"bx_g", bx_g},
+                                                         {"bp_g", bp_g},
+                                                         {"bo", bo}};
+  std::vector<std::shared_ptr<Program>> programs = {forward_program,
+                                                    backward_program};
+  return std::make_pair(programs, attr);
+}
+
 void BindUtils(pybind11::module *m) {
+  m->def("program_clone", ProgramClone);
+  m->def("program_split", ForwardBackwardSplit);
+  m->def("fake_op_result", FakeOpResult);
   m->def("set_global_program",
          [](Program *program) { APIBuilder::Instance().SetProgram(program); });
   m->def("set_insertion_point",
diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc
index 29c4c2fd0a7c5..366465e6b2984 100644
--- a/paddle/fluid/pybind/op_function_common.cc
+++ b/paddle/fluid/pybind/op_function_common.cc
@@ -34,6 +34,8 @@
 #include "paddle/fluid/pybind/eager_utils.h"
 #include "paddle/fluid/pybind/imperative.h"
 #include "paddle/phi/common/complex.h"
+#include "paddle/pir/core/block.h"
+#include "paddle/pir/core/value.h"
 
 namespace paddle {
 namespace pybind {
@@ -829,6 +831,54 @@ void CastPyArg2AttrBlock(PyObject* obj,
   attrs[key] = reinterpret_cast<paddle::framework::BlockDesc*&>(vh[0]);
 }
 
+void CastPyArg2AttrIRBlock(PyObject* obj,
+                           paddle::framework::AttributeMap& attrs,  // NOLINT
+                           const std::string& key,
+                           const std::string& op_type,
+                           ssize_t arg_pos) {
+  VLOG(1) << "After Process pir::Block*";
+  ::pybind11::detail::instance* inst =
+      (::pybind11::detail::instance*)obj;  // NOLINT
+  void** vh = inst->simple_layout ? inst->simple_value_holder
+                                  : &inst->nonsimple.values_and_holders[0];
+  attrs[key] = reinterpret_cast<::pir::Block*&>(vh[0]);
+}
+
+void CastPyArg2AttrValues(PyObject* obj,
+                          paddle::framework::AttributeMap& attrs,  // NOLINT
+                          const std::string& key,
+                          const std::string& op_type,
+                          ssize_t arg_pos) {
+  std::vector<::pir::Value> results;
+  if (PyList_Check(obj)) {
+    Py_ssize_t len = PyList_Size(obj);
+    PyObject* item = nullptr;
+    for (Py_ssize_t i = 0; i < len; i++) {
+      // TODO(xiongkun): judge OpResult or Value;
+      item = PyList_GetItem(obj, i);
+      ::pybind11::detail::instance* inst =
+          (::pybind11::detail::instance*)item;  // NOLINT
+      void** vh = inst->simple_layout ? inst->simple_value_holder
+                                      : &inst->nonsimple.values_and_holders[0];
+      ::pir::OpResult* opresult = reinterpret_cast<::pir::OpResult*>(vh[0]);
+      if (opresult->impl() == nullptr) {
+        results.emplace_back(pir::Value(nullptr));
+      } else {
+        results.emplace_back(pir::Value(opresult->Value::impl()));
+      }
+    }
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "%s(): argument (position %d) must be "
+        "a list of int, float, complex, or bool, but got %s",
+        op_type,
+        arg_pos + 1,
+        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
+  }
+  attrs[key] = results;
+  VLOG(1) << "Pybind: Cast " << results.size() << " Value Finished.";
+}
+
 void ConstructAttrMapFromPyArgs(
     const std::string& op_type,
     PyObject* args,
@@ -847,6 +897,7 @@ void ConstructAttrMapFromPyArgs(
 
   PyObject* obj = nullptr;
   for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) {
+    VLOG(1) << "Start Process " << arg_pos;
     Py_ssize_t key_len;
     const char* key_ptr;
     obj = PyTuple_GET_ITEM(args, arg_pos);
@@ -862,6 +913,7 @@ void ConstructAttrMapFromPyArgs(
     }
 
     std::string key(key_ptr, (size_t)key_len);  // NOLINT
+    VLOG(1) << "Start Process " << key;
     auto iter = attr_type_map->find(key);
     if (iter == attr_type_map->end()) {
       continue;
@@ -921,6 +973,77 @@ void ConstructAttrMapFromPyArgs(
   }
 }
 
+void ConstructAttrMapForRunProgram(
+    const std::string& op_type,
+    PyObject* args,
+    ssize_t attr_start,
+    ssize_t attr_end,
+    paddle::framework::AttributeMap& attrs) {  // NOLINT
+  PADDLE_ENFORCE_EQ((attr_end - attr_start) % 2,
+                    0,
+                    platform::errors::InvalidArgument(
+                        "The number of arguments for attributes should be even "
+                        "but attr_start = %d, attr_end = %d.",
+                        attr_start,
+                        attr_end));
+
+  PyObject* obj = nullptr;
+  for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) {
+    VLOG(1) << "Start Process " << arg_pos;
+    Py_ssize_t key_len;
+    const char* key_ptr;
+    obj = PyTuple_GET_ITEM(args, arg_pos);
+    if (PyObject_CheckString(obj)) {
+      key_ptr = PyUnicode_AsUTF8AndSize(obj, &key_len);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "%s(): argument (position %d) must be str, but got "
+          "%s",
+          op_type,
+          arg_pos,
+          ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
+    }
+
+    std::string key(key_ptr, (size_t)key_len);  // NOLINT
+    VLOG(1) << "Start Process " << key;
+    obj = PyTuple_GET_ITEM(args, arg_pos + 1);
+
+    if (std::set<std::string>({"cuda_graph_capture_mode"}).count(key)) {
+      CastPyArg2AttrString(obj, attrs, key, op_type, arg_pos);
+    } else if (std::set<std::string>({"global_block",
+                                      "forward_global_block",
+                                      "backward_global_block"})
+                   .count(key)) {
+      CastPyArg2AttrIRBlock(obj, attrs, key, op_type, arg_pos);
+    } else if (std::set<std::string>({"is_test", "use_interpretorcore"})
+                   .count(key)) {
+      CastPyArg2AttrBoolean(obj, attrs, key, op_type, arg_pos);
+    } else if (std::set<std::string>({"start_op_index",
+                                      "end_op_index",
+                                      "program_id",
+                                      "cuda_graph_pool_id"})
+                   .count(key)) {
+      CastPyArg2AttrLong(obj, attrs, key, op_type, arg_pos);
+    } else if (std::set<std::string>({"fx",
+                                      "fp",
+                                      "fm",
+                                      "fo",
+                                      "bx",
+                                      "bp",
+                                      "bm",
+                                      "bo_g",
+                                      "bx_g",
+                                      "bp_g",
+                                      "bo"})
+                   .count(key)) {
+      CastPyArg2AttrValues(obj, attrs, key, op_type, arg_pos);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "%s is not defined in this function.", key));  // NOLINT
+    }
+  }
+}
+
 unsigned long GetUnsignedLongFromArgs(  // NOLINT
     const std::string& op_type,
     const std::string& arg_name,
diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h
index a3f4960bbd58b..2d02dd6fb784d 100644
--- a/paddle/fluid/pybind/op_function_common.h
+++ b/paddle/fluid/pybind/op_function_common.h
@@ -194,6 +194,13 @@ void ConstructAttrMapFromPyArgs(
     ssize_t attr_end,
     paddle::framework::AttributeMap& attrs);  // NOLINT
 
+void ConstructAttrMapForRunProgram(
+    const std::string& op_type,
+    PyObject* args,
+    ssize_t attr_start,
+    ssize_t attr_end,
+    paddle::framework::AttributeMap& attrs);  // NOLINT
+
 unsigned long GetUnsignedLongFromArgs(  // NOLINT
     const std::string& op_type,
     const std::string& arg_name,
diff --git a/paddle/pir/core/op_result.h b/paddle/pir/core/op_result.h
index 8860473fe3339..781ed93148103 100644
--- a/paddle/pir/core/op_result.h
+++ b/paddle/pir/core/op_result.h
@@ -32,6 +32,7 @@ class IR_API OpResult : public Value {
   Operation *owner() const;
   uint32_t index() const;
   bool operator==(const OpResult &other) const;
+  static OpResult dyn_cast_from(Value value);
 
  private:
   friend Operation;
@@ -39,7 +40,6 @@ class IR_API OpResult : public Value {
   // Access classof annd dyn_cast_from.
   friend Value;
   static bool classof(Value value);
-  static OpResult dyn_cast_from(Value value);
 };
 
 }  // namespace pir
diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py
index 7b873cfb0761f..0440af415a7d0 100644
--- a/python/paddle/base/framework.py
+++ b/python/paddle/base/framework.py
@@ -316,6 +316,10 @@ def in_pir_mode():
     return global_var._use_pir_api_ and not in_dygraph_mode()
 
 
+def use_pir_api():
+    return global_var._use_pir_api_
+
+
 def in_dynamic_or_pir_mode():
     """
 
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index f584008871cdb..3708048e56d4a 100755
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -56,7 +56,7 @@
 from ..base.dygraph.base import enable_dygraph as disable_static  # noqa: F401
 from ..base.dygraph.base import disable_dygraph as enable_static  # noqa: F401
 from ..base.framework import in_dygraph_mode as in_dynamic_mode  # noqa: F401
-from ..base.framework import in_pir_mode  # noqa: F401
+from ..base.framework import in_pir_mode, use_pir_api  # noqa: F401
 from ..base.framework import in_dynamic_or_pir_mode  # noqa: F401
 from ..base.framework import (
     _current_expected_place,
diff --git a/python/paddle/jit/dy2static/function_spec.py b/python/paddle/jit/dy2static/function_spec.py
index d34a5dc6288f6..e2966e4097d86 100644
--- a/python/paddle/jit/dy2static/function_spec.py
+++ b/python/paddle/jit/dy2static/function_spec.py
@@ -18,7 +18,9 @@
 import numpy as np
 
 import paddle
+import paddle.ir.core as ir_static
 from paddle.base import core
+from paddle.base.data_feeder import convert_dtype
 from paddle.base.dygraph.base import switch_to_static_graph
 from paddle.jit.translated_layer import TranslatedLayer
 from paddle.nn.layer import layers
@@ -170,6 +172,34 @@ def args_to_input_spec(self, args, kwargs):
 
         return args_with_spec, kwargs_with_spec
 
+    @switch_to_static_graph
+    def newir_to_static_inputs_with_spec(self, input_with_spec, main_program):
+        """
+        Constructs feed layer by inputs with InputSpec information for main program.
+
+        Args:
+            input_with_spec(tuple): input arguments by replacing argument with InputSpec.
+            main_program(Program): main program for inserting feed layer.
+        """
+        flat_input_spec = paddle.utils.flatten(input_with_spec)
+
+        inputs = []
+        with ir_static.program_guard(main_program):
+            for i, var_spec in enumerate(flat_input_spec):
+                if isinstance(var_spec, paddle.static.InputSpec):
+                    stop_gradient = getattr(var_spec, 'stop_gradient', False)
+                    feed_value = paddle.static.input.data(
+                        name=var_spec.name or "feed_%s" % i,
+                        shape=var_spec.shape,
+                        dtype=convert_dtype(var_spec.dtype),
+                    )
+                    feed_value.stop_gradient = stop_gradient
+                else:
+                    feed_value = var_spec
+                inputs.append(feed_value)
+
+        return paddle.utils.pack_sequence_as(input_with_spec, inputs)
+
     @switch_to_static_graph
     def to_static_inputs_with_spec(self, input_with_spec, main_program):
         """
diff --git a/python/paddle/jit/dy2static/newir_partial_program.py b/python/paddle/jit/dy2static/newir_partial_program.py
new file mode 100644
index 0000000000000..83cb5eed92534
--- /dev/null
+++ b/python/paddle/jit/dy2static/newir_partial_program.py
@@ -0,0 +1,1137 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from copy import deepcopy
+
+import numpy as np
+
+import paddle
+import paddle.ir.core as ir_static
+from paddle import _legacy_C_ops
+from paddle.amp.auto_cast import _in_amp_guard, _in_pure_fp16_guard
+from paddle.autograd.ir_backward import grad
+from paddle.base import core, framework, program_guard
+from paddle.base.compiler import BuildStrategy
+from paddle.base.data_feeder import check_type, convert_dtype
+from paddle.base.dygraph.base import switch_to_static_graph
+from paddle.base.framework import _apply_pass
+from paddle.base.libpaddle.ir import OpResult, fake_op_result
+from paddle.framework import use_pir_api
+from paddle.optimizer.lr import LRScheduler
+
+from . import logging_utils
+from .utils import RETURN_NO_VALUE_MAGIC_NUM, backend_guard
+
+__all__ = []
+
+
+class NestSequence:
+    """
+    A wrapper class that easily to flatten and restore the nest structure of
+    given sequence.
+    """
+
+    def __init__(self, raw_input, need_check=False):
+        self.__raw_input = raw_input
+        self.__input_list = self.tolist()
+        self.__var_ids = self._get_var_ids()
+        self._check_non_variable(need_check)
+
+    def tolist(self):
+        """
+        Flattens the nested sequences into single list.
+        """
+        return paddle.utils.flatten(self.__raw_input)
+
+    def restore(self, value_list):
+        """
+        Restores the nested sequence from value list.
+        """
+        assert len(self.__input_list) == len(value_list)
+        return paddle.utils.pack_sequence_as(self.__raw_input, value_list)
+
+    def _get_var_ids(self):
+        var_ids = []
+        for idx, var in enumerate(self.__input_list):
+            if isinstance(var, (OpResult, core.eager.Tensor)):
+                var_ids.append(idx)
+
+        return var_ids
+
+    def _check_non_variable(self, need_check):
+        """
+        Raises warning if output of traced function contains non-tensor type values.
+        """
+        if need_check:
+            warning_types = set()
+            for var in self.__input_list:
+                if not isinstance(var, (framework.Variable, core.eager.Tensor)):
+                    warning_types.add(type(var))
+            if warning_types:
+                logging_utils.warn(
+                    "Output of traced function contains non-tensor type values: {}. "
+                    "Currently, We don't support to update them while training and will return "
+                    "what we first saw. Please try to return them as tensor.".format(
+                        list(warning_types)
+                    )
+                )
+
+    @property
+    def var_ids(self):
+        return self.__var_ids
+
+    def __getitem__(self, item):
+        return self.__input_list[item]
+
+
+class LazyInitialized:
+    """
+    Descriptor to implement lazy initialization of property.
+    """
+
+    def __init__(self, function):
+        self.function = function
+
+    def __get__(self, instance, cls):
+        val = self.function(instance)
+        setattr(instance, self.function.__name__, val)
+        return val
+
+
+class ProgramInfo:
+    """
+    A helper class to recoder Program information
+    """
+
+    def __init__(self):
+        self.op_size = {
+            'fp32': -1,
+            'amp': -1,
+            'fp16': -1,
+        }
+        self.programs = {}
+        self.mode = "infer"
+
+    def __call__(self, key, prog_creator):
+        """
+        Recoder infer program and op size.
+        """
+        assert key in ['fp32', 'amp', 'fp16']
+        if key not in self.programs:
+            infer_prog = prog_creator(is_infer_mode=True)
+            self.programs[key] = infer_prog
+            self.op_size[key] = infer_prog.desc.global_block().op_size()
+
+        return self.programs[key], self.op_size[key]
+
+
+class PartialProgramLayerHook:
+    def before_append_backward(self, forward_program):
+        ...
+
+    def after_append_backward(self, whole_program, backward_start_idx):
+        ...
+
+    def after_infer(self, infer_program):
+        ...
+
+
+class PartialProgramLayer:
+    """
+    PartialProgramLayer wraps all the ops from layers decorated by `@to_static`
+    and execute them as a static subgraph.
+
+    .. note::
+        **1. This is a very low level API. Users should not use this API
+             directly. Please use `partial_program_from(concrete_program)`
+             to create it.
+        **2. LoDTensorArray is not currently supported in the output.
+
+    Args:
+        main_program(Program): The main program that contains ops need to be executed.
+        inputs(list[Variable]): The input list of the decorated function by `@to_static`.
+        outputs(list[Variable]): The output list of the decorated function by `@to_static`.
+        parameters(list[Tensor]|None): All trainable parameters included in the program. Default None.
+
+    Returns:
+        Layer: A Layer object that run all ops internally in static graph mode.
+    """
+
+    def __init__(
+        self, main_program, inputs, outputs, parameters=None, **kwargs
+    ):
+        super().__init__()
+        self._inputs = NestSequence(inputs)
+        self._outputs = NestSequence(outputs, need_check=True)
+        self._params = parameters if parameters is not None else []
+
+        self._build_strategy = kwargs.get('build_strategy', BuildStrategy())
+        assert isinstance(self._build_strategy, BuildStrategy)
+
+        self._origin_main_program = self._verify_program(main_program)
+        self._cuda_graph_vec = self._create_cuda_graph_vec()
+        self._cuda_graph_capture_mode = ""
+        self._cuda_graph_pool_id = 0
+        # Set default mode to train
+        self.training = True
+        self._infer_info = ProgramInfo()
+        self._program_extra_info = {}
+
+        amp_dtype, custom_white_list, custom_black_list = None, None, None
+        tracer = framework._dygraph_tracer()
+        if tracer:
+            custom_white_list, custom_black_list = tracer._get_amp_op_list()
+            amp_dtype = tracer._amp_dtype
+        if amp_dtype is not None and amp_dtype in ['float16', 'bfloat16']:
+            # For AMP training
+            self._amp_list = (
+                paddle.static.amp.fp16_lists.AutoMixedPrecisionLists(
+                    custom_white_list=custom_white_list,
+                    custom_black_list=custom_black_list,
+                    dtype=amp_dtype,
+                )
+            )
+
+        # program_id -> list(scope)
+        self._scope_cache = {}
+        self._hooker = None
+        self._backend = kwargs.get('backend', None)
+        self._grad_var_names = {}
+
+    def __call__(self, inputs):
+        """
+        Execute static graph by Interpreter and Return dynamic Tensors.
+        """
+        in_vars, out_vars = self._prepare(inputs)
+        self._cast_fp16_if_pure_fp16(in_vars)
+        attrs = self._prepare_attributes()
+
+        # self._sync_lr_value_with_scheduler()
+
+        c_run_program_fn = None
+        if use_pir_api():
+            c_run_program_fn = _legacy_C_ops.newir_run_program
+        else:
+            c_run_program_fn = _legacy_C_ops.run_program
+        c_run_program_fn(
+            self._valid_vars(in_vars),
+            self._valid_vars(self._params),
+            self._valid_vars(out_vars),
+            self._create_scope_vec(
+                program_id=self.program_id, use_scope_cache=True
+            ),
+            self._double_grads,
+            self._cuda_graph_vec,
+            *attrs,
+        )
+        self._update_stop_gradient(out_vars)
+        restored_nest_out = self._restore_out(out_vars)
+        return self._remove_no_value(restored_nest_out)
+
+    def _sync_lr_value_with_scheduler(self):
+        """Update lr_var value with calculated by lr_scheduler."""
+        main_program = self._origin_main_program
+        if hasattr(main_program, 'lr_scheduler') and hasattr(
+            main_program, 'lr_var'
+        ):
+            lr_scheduler = main_program.lr_scheduler
+            lr_var = main_program.lr_var
+
+            assert isinstance(lr_scheduler, LRScheduler), "must be LRScheduler"
+            lr_scheduler = self._origin_main_program.lr_scheduler
+            lr_value = lr_scheduler()
+            data = np.array(lr_value).astype(convert_dtype(lr_var.dtype))
+            lr_var.set_value(data)
+
+    def set_hooker(self, hooker):
+        self._hooker = hooker
+
+    def _get_scope(self, program_id=None, use_scope_cache=False):
+        if use_scope_cache:
+            if program_id not in self._scope_cache:
+                scope = core.Scope()
+                self._scope_cache[program_id] = [scope]
+                return scope
+            else:
+                for scope in self._scope_cache[program_id]:
+                    if scope._can_reused:
+                        return scope
+                scope = core.Scope()
+                self._scope_cache[program_id].append(scope)
+                return scope
+        else:
+            return core.Scope()
+
+    @LazyInitialized
+    def _double_grads(self):
+        # TODO: check the affects.
+        return None
+
+    # whole
+    @switch_to_static_graph
+    def _create_program(self, is_infer_mode=False):
+        if is_infer_mode:
+            infer_program = self._origin_main_program.clone(
+                for_test=is_infer_mode
+            )
+            if self._hooker:
+                infer_program = self._hooker.after_infer(infer_program)
+            return infer_program
+        else:
+            train_program = self._append_backward_desc(
+                self._origin_main_program
+            )
+            # Note: Only set grad type once after initializing train program. So we put it here.
+            self._set_grad_type(self._params, train_program)
+            return train_program
+
+    @switch_to_static_graph
+    def _create_amp_program(self, is_infer_mode=False):
+        amp_program = self._origin_main_program.clone(for_test=is_infer_mode)
+        with program_guard(amp_program):
+            paddle.static.amp.fp16_utils.cast_model_to_fp16(
+                amp_program, self._amp_list, use_fp16_guard=False, level='O1'
+            )
+        if is_infer_mode:
+            if self._hooker:
+                amp_program = self._hooker.after_infer(amp_program)
+            return amp_program
+        else:
+            train_amp_program = self._append_backward_desc(amp_program)
+            self._set_grad_type(self._params, train_amp_program)
+            return train_amp_program
+
+    @switch_to_static_graph
+    def _create_pure_fp16_program(self, is_infer_mode=False):
+        pure_fp16_program = self._origin_main_program.clone(
+            for_test=is_infer_mode
+        )
+        with program_guard(pure_fp16_program):
+            paddle.static.amp.fp16_utils.cast_model_to_fp16(
+                pure_fp16_program, self._amp_list, use_fp16_guard=False
+            )
+
+        if is_infer_mode:
+            if self._hooker:
+                pure_fp16_program = self._hooker.after_infer(pure_fp16_program)
+            return pure_fp16_program
+        else:
+            train_pure_fp16_program = self._append_backward_desc(
+                pure_fp16_program
+            )
+            self._set_grad_type(self._params, train_pure_fp16_program)
+            return train_pure_fp16_program
+
+    @switch_to_static_graph
+    def _create_forward_backward_train_program(self):
+        whole_program = self._train_program
+        forward_end_op_index = self.get_forward_end_op_idx(whole_program)
+        assert forward_end_op_index >= 0
+        return self._get_forward_backward_program_form(
+            whole_program, forward_end_op_index
+        )
+
+    @switch_to_static_graph
+    def _create_forward_backward_train_amp_program(self):
+        whole_program = self._train_amp_program
+        forward_end_op_index = self.get_forward_end_op_idx(whole_program)
+        assert forward_end_op_index >= 0
+
+        return self._get_forward_backward_program_form(
+            whole_program, forward_end_op_index
+        )
+
+    @switch_to_static_graph
+    def _create_forward_backward_train_pure_fp16_program(self):
+        whole_program = self._train_pure_fp16_program
+        forward_end_op_index = self.get_forward_end_op_idx(whole_program)
+        assert forward_end_op_index >= 0
+
+        return self._get_forward_backward_program_form(
+            whole_program, forward_end_op_index
+        )
+
+    @LazyInitialized
+    def _train_program(self):
+        return self._create_program()
+
+    @LazyInitialized
+    def _infer_program(self):
+        program, op_size = self._infer_info('fp32', self._create_program)
+        return self._build_infer_program(program, op_size)
+
+    @LazyInitialized
+    def _train_amp_program(self):
+        return self._create_amp_program()
+
+    @LazyInitialized
+    def _infer_amp_program(self):
+        program, op_size = self._infer_info('amp', self._create_amp_program)
+        return self._build_infer_program(program, op_size)
+
+    @LazyInitialized
+    def _train_pure_fp16_program(self):
+        return self._create_pure_fp16_program()
+
+    @LazyInitialized
+    def _infer_pure_fp16_program(self):
+        program, op_size = self._infer_info(
+            'fp16', self._create_pure_fp16_program
+        )
+        return self._build_infer_program(program, op_size)
+
+    @LazyInitialized
+    def _train_forward_backward_program(self):
+        program = self._create_forward_backward_train_program()
+        return program
+
+    @LazyInitialized
+    def _train_amp_forward_backward_program(self):
+        program = self._create_forward_backward_train_amp_program()
+        return program
+
+    @LazyInitialized
+    def _empty_backward_program_for_eval(self):
+        return paddle.static.Program()
+
+    @LazyInitialized
+    def _train_pure_fp16_forward_backward_program(self):
+        program = self._create_forward_backward_train_pure_fp16_program()
+        return program
+
+    @LazyInitialized
+    def _train_program_id(self):
+        program_id = paddle.utils._hash_with_id(self._train_program, self)
+        core._set_cached_executor_build_strategy(
+            program_id, self._build_strategy
+        )
+        return program_id
+
+    @LazyInitialized
+    def _infer_program_id(self):
+        return paddle.utils._hash_with_id(self._infer_program, self)
+
+    @LazyInitialized
+    def _train_amp_program_id(self):
+        program_id = paddle.utils._hash_with_id(self._train_amp_program, self)
+        core._set_cached_executor_build_strategy(
+            program_id, self._build_strategy
+        )
+        return program_id
+
+    @LazyInitialized
+    def _infer_amp_program_id(self):
+        return paddle.utils._hash_with_id(self._infer_amp_program, self)
+
+    @LazyInitialized
+    def _train_pure_fp16_program_id(self):
+        program_id = paddle.utils._hash_with_id(
+            self._train_pure_fp16_program, self
+        )
+        core._set_cached_executor_build_strategy(
+            program_id, self._build_strategy
+        )
+        return program_id
+
+    @LazyInitialized
+    def _infer_pure_fp16_program_id(self):
+        return paddle.utils._hash_with_id(self._infer_pure_fp16_program, self)
+
+    def get_forward_end_op_idx(self, program):
+        return self._program_extra_info[
+            paddle.utils._hash_with_id(program, self)
+        ]['forward_end_op_idx']
+
+    def get_program_extra(self, program):
+        if (
+            paddle.utils._hash_with_id(program, self)
+            not in self._program_extra_info
+        ):
+            self._program_extra_info[
+                paddle.utils._hash_with_id(program, self)
+            ] = {}
+        return self._program_extra_info[
+            paddle.utils._hash_with_id(program, self)
+        ]
+
+    @property
+    def program(self):
+        """
+        Return current train or eval program.
+        """
+        if self.training:
+            return self.train_program
+        else:
+            return self.infer_program
+
+    @property
+    def program_id(self):
+        """
+        Return current train or eval program hash id.
+        """
+        if self.training:
+            if _in_amp_guard():
+                return self._train_amp_program_id
+            elif _in_pure_fp16_guard():
+                return self._train_pure_fp16_program_id
+            else:
+                return self._train_program_id
+        else:
+            if _in_amp_guard():
+                return self._infer_amp_program_id
+            elif _in_pure_fp16_guard():
+                return self._infer_pure_fp16_program_id
+            else:
+                return self._infer_program_id
+
+    @property
+    def train_program(self):
+        if _in_amp_guard():
+            return self._train_amp_program
+        elif _in_pure_fp16_guard():
+            return self._train_pure_fp16_program
+        else:
+            return self._train_program
+
+    @property
+    def infer_program(self):
+        if _in_amp_guard():
+            return self._infer_amp_program
+        elif _in_pure_fp16_guard():
+            return self._infer_pure_fp16_program
+        else:
+            return self._infer_program
+
+    @property
+    def forward_program(self):
+        if self.training:
+            if _in_amp_guard():
+                progs = self._train_amp_forward_backward_program
+            elif _in_pure_fp16_guard():
+                progs = self._train_pure_fp16_forward_backward_program
+            else:
+                progs = self._train_forward_backward_program
+            return progs[0]
+        else:
+            return self.infer_program
+
+    @property
+    def backward_program(self):
+        if self.training:
+            if _in_amp_guard():
+                progs = self._train_amp_forward_backward_program
+            elif _in_pure_fp16_guard():
+                progs = self._train_pure_fp16_forward_backward_program
+            else:
+                progs = self._train_forward_backward_program
+            return progs[1]
+        else:
+            """
+            Can't just return paddle.static.Program(), because self.backward_program is a property,
+            whenever we call this method, a tmp Program() object is created and is gc immediatly
+            after executed the following line in PartialProgramLayer.__call__.
+
+            >>> self.backward_program.desc.global_block(),
+
+            When we access RunProgramAPI, it's possible to get an invalid backward_program address.
+            """
+            return self._empty_backward_program_for_eval
+
+    def _verify_program(self, main_program):
+        """
+        Verify that the program parameter is initialized, prune some unused params,
+        and remove redundant op callstack.
+        """
+        # 1. Check all params from main program can be found in self._params
+        self._check_params_all_inited(main_program)
+        # 2. Prune the parameters not used anywhere in the program.
+        self._prune_unused_params(main_program)
+
+        return main_program
+
+    def prepare_gradient_aggregation(
+        self, start_idx, main_program, target_program
+    ):
+        """
+        Why we need add gradient aggregation operation ?
+        In some cases, if non leaf nodes are used as output, gradient overwriting will occur, such as
+        def forward(self, in):
+            x = 2 * in  # <---- x is a non-leaf node in program.
+            y = x + 3
+            return x, y
+
+        loss = forward(in)[0].sum()
+        loss.backward()  # <----- x@grad will be overwrited by elementwise_add_grad Op
+        """
+
+        def _need_aggregation(var):
+            """
+            if exist a op whose inputs is var, then return True
+            """
+            if not isinstance(var, framework.Variable) or var.type not in [
+                core.VarDesc.VarType.LOD_TENSOR,
+                core.VarDesc.VarType.SELECTED_ROWS,
+            ]:
+                return False
+            if var.dtype not in [paddle.float32, paddle.float64]:
+                return False
+            for op in main_program.global_block().ops:
+                for in_arg in op.input_arg_names:
+                    if in_arg == var.name:
+                        return True
+            return False
+
+        def _insert_aggregation_ops_for_var(target_program, var):
+            suffix = "@dy2static"
+            var_grad_name = var.grad_name
+            new_grad_name = var.name + suffix + "@GRAD"
+            finded_ops = list(
+                filter(
+                    lambda x: x[0] >= start_idx
+                    and any(
+                        out_arg == var_grad_name
+                        for out_arg in x[1].output_arg_names
+                    ),
+                    enumerate(target_program.global_block().ops),
+                )
+            )
+
+            # len(finded_ops) may equals zero when stop_gradient works.
+            # len(finded_ops) may > 1, because we may have fill_constant op.
+            if len(finded_ops) == 0:
+                return None
+            # step1: create a new var named var.name@GRAD
+            target_program.global_block().create_var(
+                name=new_grad_name,
+                type=var.type,
+                dtype=var.dtype,
+                shape=var.shape,
+            )
+            # step2: rename the var.name@GRAD to var.name@GRAD@dy2static
+            for idx, op in finded_ops:
+                op._rename_input(var_grad_name, new_grad_name)
+                op._rename_output(var_grad_name, new_grad_name)
+            # step3: insert sum op to aggregate the gradient.
+            #        var.name@GRAD = sum(var.name@dy2static@GRAD, var.name@GRAD)
+            target_program.global_block()._insert_op(
+                finded_ops[-1][0] + 1,
+                type='sum',
+                inputs={'X': [var_grad_name, new_grad_name]},
+                outputs={"Out": var_grad_name},
+            )
+            return None
+
+        to_processed_vars = list(
+            filter(_need_aggregation, self._outputs.tolist())
+        )
+        for _var in to_processed_vars:
+            _insert_aggregation_ops_for_var(target_program, _var)
+
+    @switch_to_static_graph
+    def _append_backward_desc(self, main_program):
+        program = main_program
+        # if self._hooker:
+        # program = self._hooker.before_append_backward(program)
+        targets = list(
+            filter(lambda x: isinstance(x, OpResult), self._outputs.tolist())
+        )
+        inputs = list(
+            filter(lambda x: isinstance(x, OpResult), self._inputs.tolist())
+        )
+        forward_end_idx = len(program.global_block().ops)
+        if targets:
+            with backend_guard(self._backend):
+                check_type(
+                    targets,
+                    'targets',
+                    (OpResult, list, tuple),
+                    'paddle.static.gradients',
+                )
+                with ir_static.program_guard(program, None):
+                    grad_info_map = grad(inputs=inputs, outputs=targets)
+
+                forward_outputs_grads = []
+                not_stop_gradient_num = 0
+                for out_op_result in self._outputs.tolist():
+                    if out_op_result.stop_gradient is True:
+                        forward_outputs_grads.append(None)
+                        continue
+                    opres = (
+                        program.global_block()
+                        .ops[forward_end_idx + 2 * not_stop_gradient_num + 1]
+                        .results()[0]
+                    )
+                    forward_outputs_grads.append(opres)
+                    not_stop_gradient_num += 1
+
+            # TODO: add later.
+            # if self._hooker:
+            # program, start_idx = self._hooker.after_append_backward(
+            # program, start_idx
+            # )
+
+            # TODO: add later
+            # self.prepare_gradient_aggregation(
+            # start_idx + 1, main_program, program
+            # )
+
+        mapping_op_result = (
+            lambda x: x if isinstance(x, OpResult) else fake_op_result()
+        )
+        hash_id = paddle.utils._hash_with_id(program, self)
+        extra_info = self._program_extra_info.get(hash_id, {})
+        extra_info['forward_end_op_idx'] = forward_end_idx
+        extra_info['forward_inputs_grads'] = list(
+            map(mapping_op_result, grad_info_map)
+        )
+        extra_info['forward_outputs_grads'] = list(
+            map(mapping_op_result, forward_outputs_grads)
+        )
+        self._program_extra_info[hash_id] = extra_info
+
+        return program
+
+    def _prune_unused_params(self, program):
+        """
+        Prune the parameters not used anywhere in the program.
+        The `@to_static` may only decorated a sub function which
+        contains some unused parameters created in `__init__`.
+        So prune these parameters to avoid unnecessary operations in
+        `run_program_op`.
+        """
+        required_params = []
+        for param in self._params:
+            found_param = False
+            for block in program.blocks:
+                for op in block.ops:
+                    if (
+                        param.name in op.input_arg_names
+                        or param.name in op.output_arg_names
+                    ):
+                        required_params.append(param)
+                        found_param = True
+                        break
+                if found_param:
+                    break
+
+        self._params = required_params
+
+    def _cast_fp16_if_pure_fp16(self, in_vars):
+        if _in_pure_fp16_guard():
+            for i, var in enumerate(in_vars):
+                name = var.name
+                if (
+                    self.program.global_block().has_var(name)
+                    and self.program.global_block().var(name).dtype
+                    == paddle.float16
+                ):
+                    in_vars[i] = var.astype('float16')
+                    in_vars[i].name = name
+
+    def _prepare_attributes(self):
+        attrs = [
+            'forward_global_block',
+            self.forward_program.global_block(),
+            'backward_global_block',
+            self.backward_program.global_block(),
+            'is_test',
+            not self.training,
+            'program_id',
+            self.program_id,
+        ]
+
+        for key, val in self.get_program_extra(self.forward_program)[
+            'program_attr'
+        ].items():
+            attrs.append(key)
+            attrs.append(val)
+
+        if self._cuda_graph_capture_mode:
+            attrs.extend(
+                (
+                    'cuda_graph_capture_mode',
+                    self._cuda_graph_capture_mode,
+                    'cuda_graph_pool_id',
+                    self._cuda_graph_pool_id,
+                )
+            )
+        return attrs
+
+    @switch_to_static_graph
+    def _build_infer_program(self, infer_program, forward_end_op_index):
+        forward_skip_vars = self._parse_skip_gc_vars(infer_program)
+        builded_infer_program = add_build_strategy_for(
+            infer_program,
+            0,
+            forward_end_op_index,
+            self._build_strategy,
+            forward_skip_vars,
+        )
+        self._apply_inplace_pass(builded_infer_program, None)
+        return builded_infer_program
+
+    @switch_to_static_graph
+    def _get_forward_backward_program_form(
+        self, whole_program, forward_end_op_index
+    ):
+        # NOTE(dev): We apply build_strategy for backward firstly to
+        # avoid skipping more gc variables.
+        forward_inputs_grads = self.get_program_extra(whole_program)[
+            'forward_inputs_grads'
+        ]
+        forward_inputs = self._inputs.tolist()
+        forward_outputs = self._outputs.tolist()
+        forward_outputs_grads = self.get_program_extra(whole_program)[
+            'forward_outputs_grads'
+        ]
+        backward_start_op_index = forward_end_op_index + 2 * len(
+            list(filter(lambda r: r.stop_gradient is False, self._outputs))
+        )
+        backward_end_op_index = len(whole_program.global_block().ops)
+        # For Backward process in CINN, all param@GRAD shoule be skipped for GC, because
+        # they will be shared in scope and used by optimizer.
+
+        # TODO(xiongkun): consider cinn later.
+        # backward_skip_vars = self._parse_skip_gc_vars(
+        # whole_program
+        # ) + self._grad_var_names.get('param', [])
+
+        (
+            forward_program,
+            backward_program,
+        ), program_attr = paddle.base.libpaddle.ir.program_split(
+            whole_program,
+            forward_inputs,
+            forward_outputs,
+            forward_inputs_grads,
+            forward_outputs_grads,
+            [0, forward_end_op_index],
+            [backward_start_op_index, backward_end_op_index],
+        )
+        self.get_program_extra(forward_program)["program_attr"] = program_attr
+        return [forward_program, backward_program]
+
+    def _apply_inplace_pass(self, forward_program, backward_program):
+        attr_types = {
+            "use_cuda": "bool",
+            "mem_opt_skip_vars": "list[str]",
+            "for_partial_block": "bool",
+        }
+        empty_startup_program = paddle.static.Program()
+        use_cuda = True if core.is_compiled_with_cuda() else False
+        # skip data var
+        forward_mem_opt_skip_vars = self._parse_skip_gc_vars(
+            forward_program, backward_program
+        )
+        backward_mem_opt_skip_vars = self._parse_skip_gc_vars(forward_program)
+        if forward_program:
+            attrs = {
+                "use_cuda": use_cuda,
+                "mem_opt_skip_vars": forward_mem_opt_skip_vars,
+                "for_partial_block": True,
+            }
+            if not os.getenv("FLAGS_enable_new_ir_in_executor"):
+                _apply_pass(
+                    forward_program,
+                    empty_startup_program,
+                    "buffer_shared_inplace_pass",
+                    attrs,
+                    attr_types,
+                )
+        if backward_program:
+            attrs = {
+                "use_cuda": use_cuda,
+                "mem_opt_skip_vars": backward_mem_opt_skip_vars,
+                "for_partial_block": True,
+            }
+            if not os.getenv("FLAGS_enable_new_ir_in_executor"):
+                _apply_pass(
+                    backward_program,
+                    empty_startup_program,
+                    "buffer_shared_inplace_pass",
+                    attrs,
+                    attr_types,
+                )
+
+    @LazyInitialized
+    def _inout_var_names(self):
+        """
+        Returns Variable Names from self._inputs and self.outputs
+        """
+        var_names = []
+        for var in self._inputs:
+            if isinstance(var, paddle.base.framework.Variable):
+                var_names.append(var.desc.name())
+        for var in self._outputs:
+            if isinstance(var, paddle.base.framework.Variable):
+                var_names.append(var.desc.name())
+        return var_names
+
+    def _parse_skip_gc_vars(self, program, backward_program=None):
+        """
+        Parse variables that need to skip GC after execute it.
+        If specify backward_program, it will keep the variables used in backward.
+        """
+        # skip data var, DO NOT ignore this deepcopy
+        skip_vars = deepcopy(self._inout_var_names)
+        for var_name, var in program.global_block().vars.items():
+            if var.is_data:
+                skip_vars.append(var_name)
+
+        if backward_program:
+            for var_name in core.parse_safe_eager_deletion_skip_vars(
+                backward_program.desc, True
+            ):
+                skip_vars.append(var_name)
+        return skip_vars
+
+    def _prepare(self, inputs):
+        """
+        Prepare inputs, outputs, attrs.
+        """
+        assert isinstance(inputs, (tuple, list))
+        # Flatten inputs with nested structure into single list.
+        flatten_inputs = paddle.utils.flatten(inputs)
+        # Convert variable into Tensor and feed in training data.
+        input_vars = []
+        expected_place = framework._current_expected_place()
+        for i, value in enumerate(flatten_inputs):
+            if isinstance(value, np.ndarray):
+                var = None
+                var = core.eager.Tensor(
+                    value=value,
+                    persistable=False,
+                    place=expected_place,
+                    zero_copy=True,
+                )
+            elif isinstance(value, core.eager.Tensor):
+                # NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times
+                # into CUDAPlace when it's as input of multi Ops. so we move it in advance
+                # to avoid this problem.
+                if value.stop_gradient and not value.place._equals(
+                    expected_place
+                ):
+                    var = value._copy_to(expected_place, False)
+                    var.stop_gradient = True
+                else:
+                    var = value
+            else:
+                continue
+            input_vars.append(var)
+
+        # mapping from name(string) -> Tensor
+        out_tensor_map = {}
+
+        def create_out(var_id):
+            var = self._outputs[var_id]
+            assert isinstance(var, OpResult)
+
+            if id(var) in out_tensor_map:
+                return out_tensor_map[id(var)]
+
+            if var.is_dense_tensor_type():
+                tensor_type = paddle.dtype(7)  # LOD TENSOR
+            else:
+                tensor_type = paddle.dtype(8)  # SELECT ROW TENSOR
+
+            # TODO(xiongkun): more elegent way to do it.
+            ir_dtype_2_tensor_dtype = {
+                10: paddle.dtype(5),
+            }
+            out = core.eager.Tensor(
+                ir_dtype_2_tensor_dtype[int(var.dtype)],
+                var.shape,
+                "",
+                tensor_type,
+                False,
+            )
+            out.stop_gradient = var.stop_gradient
+            out_tensor_map[id(var)] = out
+            return out
+
+        # Create Tensor to receive output data.
+        out_vars = list(map(create_out, self._outputs.var_ids))
+        return input_vars, out_vars
+
+    def _create_scope_vec(self, program_id=None, use_scope_cache=False):
+        # Hold forward variables
+        tmp_scope_vec = None
+        inner_scope = self._get_scope(
+            program_id=program_id, use_scope_cache=use_scope_cache
+        )
+        tmp_scope_vec = [inner_scope]
+        return tmp_scope_vec
+
+    def _create_cuda_graph_vec(self):
+        var = core.eager.Tensor(
+            core.VarDesc.VarType.FP32,
+            [],
+            "cuda_graph",
+            core.VarDesc.VarType.RAW,
+            True,
+        )
+        var.stop_gradient = True
+        return var
+
+    def _update_stop_gradient(self, out_vars):
+        # Update stop_gradient for all outputs
+        def set_stop_gradient(var_id, eager_tensor):
+            var = self._outputs[var_id]
+            assert isinstance(var, OpResult)
+            eager_tensor.stop_gradient = var.stop_gradient
+
+        for idx, var in zip(self._outputs.var_ids, out_vars):
+            set_stop_gradient(idx, var)
+
+    def _restore_out(self, out_vars):
+        """
+        Restores same nested outputs by only replacing the Variable with Tensor.
+        """
+
+        flatten_outputs = self._outputs.tolist()
+        for i, idx in enumerate(self._outputs.var_ids):
+            flatten_outputs[idx] = out_vars[i]
+        outs = self._outputs.restore(flatten_outputs)
+        if outs is not None and len(outs) == 1:
+            outs = outs[0]
+
+        return outs
+
+    @switch_to_static_graph
+    def _clone_for_test(self, main_program):
+        return main_program.clone(for_test=True)
+
+    def _is_no_value(self, var):
+        if isinstance(var, core.eager.Tensor) and var.shape == [1]:
+            # NOTE: .numpy() will insert MemcpySync operation, it hits performance.
+            if var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM:
+                return True
+        return False
+
+    def _remove_no_value(self, out_vars):
+        """
+        Removes invalid value for various-length return statement
+        """
+        if isinstance(out_vars, core.eager.Tensor):
+            if self._is_no_value(out_vars):
+                return None
+            return out_vars
+        elif isinstance(out_vars, (tuple, list)):
+            if isinstance(out_vars, tuple):
+                res = tuple(
+                    var for var in out_vars if not self._is_no_value(var)
+                )
+            else:
+                # isinstance(out_vars, list)
+                res = [var for var in out_vars if not self._is_no_value(var)]
+
+            has_removed = len(out_vars) > len(res)
+            # len(out_vars) > len(res) means we have removed var. This is
+            # preventing out_vars is empty or just one element at the beginning
+            if len(res) == 0 and has_removed:
+                return None
+            elif len(res) == 1 and has_removed:
+                return res[0]
+            return res
+
+        return out_vars
+
+    def _set_grad_type(self, params, train_program):
+        # NOTE: if user set sparse gradient mode, the param's gradient
+        # will be SelectedRows, not LoDTensor. But tracer will just
+        # set param grad Tensor by forward Tensor(LoDTensor)
+        # If we don't change grad_var type here, RunProgramOp need
+        # transform SelectedRows to LoDTensor forcibly, it may not
+        # be user wanted result.
+        for param in params:
+            grad_name = param.name + core.grad_var_suffix()
+            grad_var = train_program.desc.global_block().find_var(
+                grad_name.encode()
+            )
+            # NOTE: cannot find var desc maybe no problem, such as in batch_norm
+            if grad_var is None:
+                continue
+            param._set_grad_type(grad_var.type())
+
+    def _remove_op_call_stack(self, main_program):
+        """
+        Remove op's python call stack with redundant low-level error messages related to
+        transforamtions to avoid confusing users.
+        """
+        assert isinstance(main_program, framework.Program)
+        for block in main_program.blocks:
+            for op in block.ops:
+                if op.has_attr("op_callstack"):
+                    op._remove_attr("op_callstack")
+
+        return main_program
+
+    def _check_params_all_inited(self, main_program):
+        """
+        Check all params from main program are already initialized, see details as follows:
+            1. all parameters in self._params should be type `framework.EagerParamBase` which are created in dygraph.
+            2. all parameters from transformed program can be found in self._params.
+               Because they share same data with EagerParamBase of original dygraph.
+        """
+        if not isinstance(self._params, (list, tuple)):
+            raise TypeError(
+                "Type of self._params in PartialProgramLayer should be list or tuple, but received %s."
+                % type(self._params)
+            )
+
+        param_and_buffer_names_set = set()
+        for i, var in enumerate(self._params):
+            # self._params constains parameters and buffers with persistable=True.
+            if not isinstance(var, core.eager.Tensor):
+                raise TypeError(
+                    'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'.format(
+                        i, type(var)
+                    )
+                )
+            param_and_buffer_names_set.add(var.name)
+
+    def _valid_vars(self, vars):
+        return vars if vars else None
+
+
+def partial_program_from(concrete_program, from_method=False):
+    inputs = concrete_program.inputs
+
+    # NOTE(SigureMo): Remove the first arg `self` from method args.
+    if inputs and from_method:
+        inputs = inputs[1:]
+
+    return PartialProgramLayer(
+        concrete_program.main_program,
+        inputs,
+        concrete_program.outputs,
+        concrete_program.parameters,
+        **concrete_program.kwargs,
+    )
+
+
+@switch_to_static_graph
+def add_build_strategy_for(
+    program, start_op_index, end_op_index, build_strategy=None, skip_vars=None
+):
+    paddle.base.libpaddle.ir.program_split(
+        program,
+    )
+    if start_op_index < end_op_index:
+        pass
+    else:
+        # can't just create a new program, we need copy the vardesc.
+        builded_program = ir_static.Program()
+    return builded_program
diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py
index 55fc6d55d0e28..592665596cfef 100644
--- a/python/paddle/jit/dy2static/program_translator.py
+++ b/python/paddle/jit/dy2static/program_translator.py
@@ -20,6 +20,7 @@
 import warnings
 import weakref
 
+import paddle.ir.core as ir_static
 from paddle.base import core, framework
 from paddle.base.data_feeder import check_type
 from paddle.base.dygraph.base import (
@@ -29,7 +30,7 @@
 )
 from paddle.base.unique_name import UniqueNameGenerator
 from paddle.base.unique_name import guard as UniqueNameGuard
-from paddle.framework import in_dynamic_mode
+from paddle.framework import in_dynamic_mode, use_pir_api
 from paddle.nn.layer import layers
 from paddle.utils import flatten, gast
 
@@ -46,7 +47,7 @@
     create_and_update_origin_info_map,
     update_op_callstack_with_origin_info,
 )
-from .partial_program import PartialProgramLayerHook, partial_program_from
+from .partial_program import PartialProgramLayerHook
 from .utils import (
     ALREADY_D2S,
     NO_SHAPE_VAR_TYPE,
@@ -1156,6 +1157,106 @@ def __init__(
         self.name_generator = name_generator
         self.kwargs = kwargs
 
+    @staticmethod
+    @switch_to_static_graph
+    def newir_from_func_spec(
+        func_spec, input_spec, input_kwargs_spec, class_instance, **kwargs
+    ):
+        """
+        Builds the main_program with specialized inputs and returns outputs
+        of program as fetch_list.
+
+        Args:
+            func_spec(FunctionSpec): A FunctionSpec instance for decorated function.
+            input_spec(list[InputSpec]):
+        """
+        # verify the instance is initialized in imperative mode.
+        _verify_init_in_dynamic_mode(class_instance)
+
+        # Transforms dygraph function into static function and caches it.
+        dygraph_function = func_spec.dygraph_function
+        static_func = convert_to_static(dygraph_function)
+        # apply pre\post hook for outermost layer
+        hook_helper = HookHelper(
+            dygraph_function, class_instance, kwargs.get("with_hook", False)
+        )
+
+        main_program, startup_program = ir_static.Program(), ir_static.Program()
+        # Note: The random seed should be synchronized into cached program
+        # if set in `fluid.dygraph_guard` because some ops rely on it, such as
+        # `fluid.layers.dropout`.
+
+        # TODO: new ir has no random seed.
+        #  {{{
+        # main_program.random_seed = static.default_main_program().random_seed
+        # startup_program.random_seed = (
+        # framework.default_startup_program().random_seed
+        # ) }}}
+        with ir_static.program_guard(main_program, startup_program):
+            with _to_static_mode_guard_(is_to_static=True):
+                # 1. Adds `paddle.static.data` layers for input if needed
+                static_inputs = func_spec.newir_to_static_inputs_with_spec(
+                    input_spec, main_program
+                )
+                _kwargs = func_spec.newir_to_static_inputs_with_spec(
+                    input_kwargs_spec, main_program
+                )
+                if class_instance:
+                    static_inputs = tuple(
+                        [class_instance] + list(static_inputs)
+                    )
+
+                # 2. Builds program only once and returns the output Variables.
+                with param_guard(
+                    get_parameters(class_instance, False)
+                ), param_guard(get_buffers(class_instance, False)):
+                    try:
+                        # only for jit.save, do nothing while train and eval process
+                        inputs = hook_helper.apply_pre_hooks(static_inputs)
+                        if _kwargs:
+                            outputs = static_func(*inputs, **_kwargs)
+                        else:
+                            outputs = static_func(*inputs)
+                        outputs = hook_helper.apply_post_hooks(inputs, outputs)
+                    except BaseException as e:
+                        # NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here.
+                        error.attach_error_data(e)
+                        error_data = getattr(e, error.ERROR_DATA, None)
+                        if error_data:
+                            error_data.raise_new_exception()
+                        raise
+
+                # 3. Gets all ParamBases and buffered VarBases in the function
+                all_parameters_and_buffers = (
+                    ProgramTranslator.get_instance()._params_recorder.pop(
+                        main_program
+                    )
+                )
+
+                if outputs is not None:
+                    need_wrap_into_list = (
+                        not isinstance(outputs, (tuple, list))
+                        or len(outputs) == 1
+                    )
+                    if need_wrap_into_list:
+                        outputs = [outputs]
+
+        # TODO(@xiongkun): support op call stack in new ir?
+        # main_program = update_op_callstack_with_origin_info(main_program)
+
+        new_name_generator = UniqueNameGenerator()
+        return ConcreteProgram(
+            inputs=static_inputs,
+            outputs=outputs,
+            parameters=all_parameters_and_buffers,
+            name_generator=new_name_generator,
+            function=dygraph_function,
+            main_program=main_program,
+            startup_program=startup_program,
+            **kwargs,
+        )
+
+    # TODO(@xiongkun): remove after new ir is switch
     @staticmethod
     @switch_to_static_graph
     def from_func_spec(
@@ -1393,13 +1494,22 @@ def _build_once(self, cache_key):
         # NOTE(xiongkun): Need a global FLAGS to enable/disable fallback
         enable_fallback = enable_prim
         try:
-            concrete_program = ConcreteProgram.from_func_spec(
-                func_spec=cache_key.function_spec,
-                input_spec=cache_key.input_args_with_spec,
-                input_kwargs_spec=cache_key.input_kwargs_with_spec,
-                class_instance=cache_key.class_instance,
-                **cache_key.kwargs,
-            )
+            if use_pir_api():
+                concrete_program = ConcreteProgram.newir_from_func_spec(
+                    func_spec=cache_key.function_spec,
+                    input_spec=cache_key.input_args_with_spec,
+                    input_kwargs_spec=cache_key.input_kwargs_with_spec,
+                    class_instance=cache_key.class_instance,
+                    **cache_key.kwargs,
+                )
+            else:
+                concrete_program = ConcreteProgram.from_func_spec(
+                    func_spec=cache_key.function_spec,
+                    input_spec=cache_key.input_args_with_spec,
+                    input_kwargs_spec=cache_key.input_kwargs_with_spec,
+                    class_instance=cache_key.class_instance,
+                    **cache_key.kwargs,
+                )
         except Exception as e:
             if enable_fallback:
                 warnings.warn(
@@ -1429,9 +1539,18 @@ def _build_once(self, cache_key):
                         )
                     )
 
-        partial_program = partial_program_from(
-            concrete_program, cache_key.class_instance is not None
-        )
+        if use_pir_api():
+            from .newir_partial_program import partial_program_from
+
+            partial_program = partial_program_from(
+                concrete_program, cache_key.class_instance is not None
+            )
+        else:  # TODO(new_ir): remove later.
+            from .partial_program import partial_program_from
+
+            partial_program = partial_program_from(
+                concrete_program, cache_key.class_instance is not None
+            )
         with backend_guard(backend):
             if core._is_fwd_prim_enabled():
                 partial_program.set_hooker(
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index f8513078eebfd..5cdd91b075426 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -690,6 +690,8 @@ def add(x, y, name=None):
     if in_dynamic_or_pir_mode():
         return _C_ops.add(x, y)
     else:
+        if in_pir_mode():
+            return paddle._ir_ops.add(x, y)
         return _elementwise_op(LayerHelper('elementwise_add', **locals()))
 
 
diff --git a/test/ir/new_ir/CMakeLists.txt b/test/ir/new_ir/CMakeLists.txt
index ca6d5367ffe88..e213eaba4c53c 100644
--- a/test/ir/new_ir/CMakeLists.txt
+++ b/test/ir/new_ir/CMakeLists.txt
@@ -4,8 +4,9 @@ file(
   "test_*.py")
 string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}")
 
-set(TEST_IR_SYSTEM_CASES test_build_model test_pd_inplace_pass
-                         test_symbol_overload test_stop_gradient)
+set(TEST_IR_SYSTEM_CASES
+    test_build_model test_pd_inplace_pass test_symbol_overload
+    test_new_ir_to_static test_stop_gradient)
 list(REMOVE_ITEM TEST_INTERP_CASES ${TEST_IR_SYSTEM_CASES})
 
 foreach(target ${TEST_INTERP_CASES})
diff --git a/test/ir/new_ir/test_new_ir_to_static.py b/test/ir/new_ir/test_new_ir_to_static.py
new file mode 100644
index 0000000000000..aadffa2cd0807
--- /dev/null
+++ b/test/ir/new_ir/test_new_ir_to_static.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+
+
+class TestDy2staticNewIR(unittest.TestCase):
+    def test_basic_network(self):
+        def func(x):
+            out = paddle.mean(x)
+            return out
+
+        static_func = paddle.jit.to_static(func)
+        x = paddle.randn((3, 3))
+        y = paddle.randn((3, 3))
+        x.stop_gradient = False
+        y.stop_gradient = False
+        ans = func(x)
+        out = static_func(x)
+
+        np.testing.assert_allclose(
+            out.numpy(), ans.numpy(), rtol=1e-05, atol=1e-8
+        )
+
+    def test_basic_network_backward(self):
+        def func(x):
+            out = paddle.mean(x)
+            return out
+
+        # ==== dygraph computation ====
+        static_func = paddle.jit.to_static(func)
+        x = paddle.randn((3, 3))
+        y = paddle.randn((3, 3))
+        x.stop_gradient = False
+        y.stop_gradient = False
+        loss = func(x) * 2
+        loss.backward()
+        x_grad_ans = x.grad.numpy()
+        x.clear_gradient()
+
+        # ==== to static compuatation ====
+        out = static_func(x)
+        out = out * 2
+        out.backward()
+        st_grad = x.grad
+
+        np.testing.assert_allclose(
+            x_grad_ans, st_grad.numpy(), rtol=1e-05, atol=1e-8
+        )
+
+
+class TestDy2staticNewIR3(unittest.TestCase):
+    def test_complex_layer(self):
+        def output_pure_func(x, y):
+            outx = paddle.mean(x)
+            outy = paddle.mean(y)
+            outy.stop_gradient = True
+            return paddle.add(outx, outy), outy
+
+        def run_function(to_static=True):
+            import paddle
+
+            # 设置随机种子
+            paddle.seed(2023)
+            # 生成随机数
+            x = paddle.randn((10, 10))
+            y = paddle.randn((10, 10))
+            x.stop_gradient = False
+            y.stop_gradient = True
+            func = output_pure_func
+            if to_static:
+                func = paddle.jit.to_static(func)
+            y, y_mean = func(x, y)
+            loss = y.mean()
+            loss.backward()
+            return (y, x.grad)
+
+        for dy, st in zip(run_function(False), run_function(True)):
+            np.testing.assert_allclose(
+                dy.numpy(), st.numpy(), rtol=1e-05, atol=1e-8
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 3e8da403570fffd71f6f91b2a714a44fc6797c59 Mon Sep 17 00:00:00 2001
From: WangZhen <23097963+0x45f@users.noreply.github.com>
Date: Wed, 20 Sep 2023 19:03:04 +0800
Subject: [PATCH 002/115] [PIR]Migrate dropout into pir (#57319)

---
 python/paddle/nn/functional/common.py | 19 ++++++++++++-------
 test/legacy_test/test_dropout_op.py   |  4 ++--
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 7c1c16d8df191..5ef8e40d921b6 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -15,10 +15,15 @@
 import numpy
 
 import paddle
-from paddle import _C_ops
+from paddle import _C_ops, ir
 from paddle.base.layer_helper import LayerHelper
 from paddle.common_ops_import import Variable, default_main_program
-from paddle.framework import core, in_dynamic_mode, in_pir_mode
+from paddle.framework import (
+    core,
+    in_dynamic_mode,
+    in_dynamic_or_pir_mode,
+    in_pir_mode,
+)
 from paddle.tensor.creation import full
 
 from ...base.data_feeder import (
@@ -1090,7 +1095,7 @@ def dropout(
             [[0., 0., 6.],
              [0., 0., 0.]])
     """
-    if not isinstance(p, (float, int, Variable)):
+    if not isinstance(p, (float, int, Variable, ir.OpResult)):
         raise TypeError("p argument should be a number or Variable")
 
     if isinstance(p, (int, float)):
@@ -1112,7 +1117,7 @@ def dropout(
             'downgrade_in_infer' if mode == 'downscale_in_infer' else mode
         )  # semantic transfer
 
-        if in_dynamic_mode():
+        if in_dynamic_or_pir_mode():
             if default_main_program().random_seed != 0:
                 seed = default_main_program().random_seed
 
@@ -1176,7 +1181,7 @@ def get_attrs(prog, dropout_prob, is_test, seed):
         dtype = x.dtype
         keep_prob = 1 - p
         if training:
-            if in_dynamic_mode() and p == 1.0:
+            if in_dynamic_or_pir_mode() and p == 1.0:
                 return paddle.scale(x, scale=0.0)
 
             scale_input = (
@@ -1187,7 +1192,7 @@ def get_attrs(prog, dropout_prob, is_test, seed):
 
             # get mask shape
             input_shape = x.shape
-            if not in_dynamic_mode():
+            if not in_dynamic_or_pir_mode():
                 input_shape_tensor = paddle.shape(x)
             drop_axes = [axis] if isinstance(axis, int) else list(axis)
             if min(drop_axes) < 0 or max(drop_axes) > len(input_shape) - 1:
@@ -1203,7 +1208,7 @@ def get_attrs(prog, dropout_prob, is_test, seed):
                     )
                 )
             mask_shape = [1] * len(input_shape)
-            if not in_dynamic_mode():
+            if not in_dynamic_or_pir_mode():
                 for i in drop_axes:
                     mask_shape[i] = input_shape_tensor[i]
             else:
diff --git a/test/legacy_test/test_dropout_op.py b/test/legacy_test/test_dropout_op.py
index ae707ee546e8c..b4eb567d40f9e 100644
--- a/test/legacy_test/test_dropout_op.py
+++ b/test/legacy_test/test_dropout_op.py
@@ -81,11 +81,11 @@ def setUp(self):
         self.enable_check_static_comp = False
 
     def test_check_output(self):
-        self.check_output(check_prim=True)
+        self.check_output(check_prim=True, check_new_ir=True)
 
     def test_check_grad_normal(self):
         # Now in dy2st mode x_grad = [], so set check_prim=False
-        self.check_grad(['X'], 'Out', check_prim=False)
+        self.check_grad(['X'], 'Out', check_prim=False, check_new_ir=True)
 
 
 class TestDropoutOp_ZeroDim(TestDropoutOp):

From 2a5aa2eea5aa307960db5336fe154bb95b4b0fb5 Mon Sep 17 00:00:00 2001
From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com>
Date: Wed, 20 Sep 2023 19:34:04 +0800
Subject: [PATCH 003/115] [Semi-Auto] Adapt reduction rule to phi (#57241)

* adapt reduction spmd rule to phi

* remove useless comments
---
 .../spmd_rules/reduction_spmd_rule.cc         | 191 ------------------
 .../spmd_rules/reduction_spmd_rule.h          |  46 -----
 .../auto_parallel/spmd_rules/rules.h          |  13 --
 paddle/phi/core/attribute.h                   |   5 +-
 .../auto_parallel/inferspmd_utils.cc          |  20 +-
 .../auto_parallel/inferspmd_utils.h           |  16 ++
 paddle/phi/infermeta/spmd_rules/reduction.cc  | 178 ++++++++++++++++
 paddle/phi/infermeta/spmd_rules/reduction.h   |  35 ++++
 paddle/phi/infermeta/spmd_rules/rules.h       |  49 +++++
 .../spmd_rules/test_reduction_rule.py         | 116 ++++++++---
 10 files changed, 389 insertions(+), 280 deletions(-)
 delete mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc
 delete mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h
 create mode 100644 paddle/phi/infermeta/spmd_rules/reduction.cc
 create mode 100644 paddle/phi/infermeta/spmd_rules/reduction.h

diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc
deleted file mode 100644
index 62940545e8845..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h"
-#include <algorithm>
-#include "paddle/phi/core/distributed/auto_parallel/utils.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-
-using phi::distributed::auto_parallel::str_join;
-
-std::string ReductionSPMDRule::GetOutputNotation(
-    int64_t input_ndim,
-    const std::string& input_axes,
-    const paddle::framework::AttributeMap& attrs) {
-  bool keep_dim = ExtractAttr<bool>("keep_dim", attrs);
-  std::vector<int64_t> reduce_dims =
-      ExtractAttr<std::vector<int64_t>>("axis", attrs);
-
-  // convert the negative dim value to normal dim value
-  for (auto& reduce_dim : reduce_dims) {
-    if (reduce_dim < 0) {
-      reduce_dim = input_ndim + reduce_dim;
-    }
-  }
-
-  std::string output_axes = "";
-  for (int64_t i = 0; i < input_ndim; i++) {
-    std::vector<int64_t>::iterator iter =
-        std::find(reduce_dims.begin(), reduce_dims.end(), i);
-    if (iter != reduce_dims.end()) {
-      // if i is reduce dim, the corresponding input axis
-      // will not be appended at the end of output_axes
-      if (keep_dim) {
-        output_axes.append(1, '1');
-      }
-    } else {
-      // otherwise, the corresponding input axis
-      // will be appended at the end of output_axes
-      output_axes.append(1, input_axes[i]);
-    }
-  }
-
-  return output_axes;
-}
-
-std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-ReductionSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
-                                const paddle::framework::AttributeMap& attrs) {
-  // step0: Verify Input Args Based on Elementwise Logic
-  int64_t ninputs = input_specs.size();
-  PADDLE_ENFORCE_EQ(
-      ninputs,
-      1,
-      phi::errors::InvalidArgument("The size of InputSpec in reduction must "
-                                   "be equal to 1, but got [%d].",
-                                   ninputs));
-  VerifySpecs(input_specs, "reduction");
-
-  // step1: Build Einsum Notation
-  // get einsum notation for input
-  std::string alphabet = "abcdefghijklmnopqrstuvwxyz";
-  int64_t ndim = input_specs[0].shape().size();
-  std::vector<std::string> input_axes_vec;
-  std::string input_axes = alphabet.substr(0, ndim);
-  input_axes_vec.emplace_back(input_axes);
-
-  // get einsum notation for output
-  std::string output_axes = GetOutputNotation(ndim, alphabet, attrs);
-
-  // step2: Sharding Propogation
-  // step2.1: merge input shardings
-  std::vector<std::pair<std::string, std::vector<int64_t>>> axes_sharding_info;
-  axes_sharding_info = GetAxesDimsMappingPair(input_axes_vec, input_specs);
-  std::unordered_map<std::string, int64_t> axis_to_dim_map =
-      ShardingMergeForTensors(axes_sharding_info);
-
-  // step2.2: infer output dimsmapping from merged input dimsmapping
-  std::vector<int64_t> output_dims_mapping =
-      GetDimsMappingForAxes(output_axes, axis_to_dim_map);
-
-  // initialize output dist_attr's process_mesh, batch_dim and dynamic dims with
-  // input dist_attr.
-  TensorDistAttr output_dist_attr =
-      CopyTensorDistAttrForOutput(input_specs[0].dist_attr());
-  output_dist_attr.set_dims_mapping(output_dims_mapping);
-
-  // step3: handle partial
-  // Step3.1 Output Partial
-  std::vector<int64_t> partial_on_dims =
-      ResoluteOutputPartialDimension(axis_to_dim_map, output_axes);
-  output_dist_attr.set_partial_status(
-      partial_on_dims /*, handle reduce_type in future  */);
-
-  std::vector<TensorDistAttr> output_dist_attrs;
-  output_dist_attrs.emplace_back(output_dist_attr);
-
-  // Step3.2  handle input tensor partial (TODO)
-  // If the op is a linear op, i.e. `linearity` is true, it supports
-  // the input to be partial. Otherwise, the input cannot be partial
-  // on reduced axes, we should reshard the input when the reduced
-  // axes are parital.
-  VLOG(4) << "ReductionSPMDRule InferForward: ";
-  for (int64_t i = 0; i < ninputs; i++) {
-    VLOG(4) << "Input" << std::to_string(i) << " shape: ["
-            << str_join(input_specs[i].shape()) << "] "
-            << "src_dims_mapping: [" << str_join(input_specs[i].dims_mapping())
-            << "] "
-            << "dst_dims_mapping: [" << str_join(input_specs[i].dims_mapping())
-            << "]";
-  }
-  VLOG(4) << "Output dims_mapping: [" + str_join(output_dims_mapping) + "] "
-          << "partial_on_dims: [" + str_join(partial_on_dims) + "]\n\n";
-
-  return {{input_specs[0].dist_attr()}, output_dist_attrs};
-}
-
-std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-ReductionSPMDRule::InferBackward(
-    const std::vector<DistTensorSpec>& input_specs,
-    const std::vector<DistTensorSpec>& output_specs,
-    const paddle::framework::AttributeMap& attrs) {
-  // step0: Verify Input Args Based on Elementwise Logic
-  int64_t ninputs = input_specs.size();
-  int64_t noutputs = output_specs.size();
-  PADDLE_ENFORCE_EQ(
-      ninputs,
-      1,
-      phi::errors::InvalidArgument("The size of InputSpec in reduction must "
-                                   "be equal to 1, but got [%d].",
-                                   ninputs));
-  PADDLE_ENFORCE_EQ(
-      noutputs,
-      1,
-      phi::errors::InvalidArgument("The size of OutputSpec in reduction must "
-                                   "be equal to 1, but got [%d].",
-                                   ninputs));
-  VerifySpecs(output_specs, "reduction_backward");
-
-  // step1: Build Einsum Notation
-  // get einsum notation for input
-  std::string alphabet = "abcdefghijklmnopqrstuvwxyz";
-  int64_t ndim = input_specs[0].shape().size();
-  std::string input_axes = alphabet.substr(0, ndim);
-
-  // get einsum notation for output
-  std::string output_axes = GetOutputNotation(ndim, alphabet, attrs);
-
-  // step2: Sharding Propogation
-  std::unordered_map<std::string, int64_t> axis_to_dim_map =
-      ShardingMergeForTensors({{output_axes, output_specs[0].dims_mapping()}});
-
-  // step2.2: infer input dims mapping from output dims mapping
-  std::vector<int64_t> input_dims_mapping =
-      GetDimsMappingForAxes(input_axes, axis_to_dim_map, true);
-
-  // initialize input dist_attr's process_mesh, batch_dim and dynamic dims with
-  // input dist_attr.
-  TensorDistAttr input_dist_attr(input_specs[0].dist_attr());
-  input_dist_attr.set_dims_mapping(input_dims_mapping);
-
-  // step3: handle partial (TODO)
-
-  VLOG(4) << "ReductionSPMDRule InferBackward: ";
-  VLOG(4) << "Output shape:[" << str_join(output_specs[0].shape())
-          << "] dims_mapping: [" << str_join(output_specs[0].dims_mapping())
-          << "]";
-  VLOG(4) << "Input0: "
-          << " shape: [" << str_join(input_specs[0].shape()) << "] "
-          << "dims_mapping: [" << str_join(input_dist_attr.dims_mapping())
-          << "]";
-
-  return {{input_dist_attr}, {output_specs[0].dist_attr()}};
-}
-
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h
deleted file mode 100644
index 36e412b704927..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iterator>
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-
-class ReductionSPMDRule : public SPMDRuleBase {
- public:
-  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-  InferForward(const std::vector<DistTensorSpec>& input_specs,
-               const paddle::framework::AttributeMap& attrs) override;
-
-  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-  InferBackward(const std::vector<DistTensorSpec>& input_specs,
-                const std::vector<DistTensorSpec>& output_specs,
-                const paddle::framework::AttributeMap& attrs) override;
-
- private:
-  std::string GetOutputNotation(int64_t input_ndim,
-                                const std::string& input_axes,
-                                const paddle::framework::AttributeMap& attrs);
-};
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
index c876fa59a7034..54ae4325b8a15 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
@@ -18,7 +18,6 @@
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h"
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h"
@@ -30,18 +29,6 @@ namespace paddle {
 namespace distributed {
 namespace auto_parallel {
 
-// reduction rules
-REGISTER_SPMD_RULE(all, ReductionSPMDRule);
-REGISTER_SPMD_RULE(amax, ReductionSPMDRule);
-REGISTER_SPMD_RULE(amin, ReductionSPMDRule);
-REGISTER_SPMD_RULE(any, ReductionSPMDRule);
-REGISTER_SPMD_RULE(frobenius_norm, ReductionSPMDRule);
-REGISTER_SPMD_RULE(max, ReductionSPMDRule);
-REGISTER_SPMD_RULE(mean, ReductionSPMDRule);
-REGISTER_SPMD_RULE(min, ReductionSPMDRule);
-REGISTER_SPMD_RULE(prod, ReductionSPMDRule);
-REGISTER_SPMD_RULE(sum, ReductionSPMDRule);
-
 // layer_norm rule
 REGISTER_SPMD_RULE(layer_norm, LayerNormSPMDRule);
 
diff --git a/paddle/phi/core/attribute.h b/paddle/phi/core/attribute.h
index 40c66a669c9e8..6f032f4a5bd99 100644
--- a/paddle/phi/core/attribute.h
+++ b/paddle/phi/core/attribute.h
@@ -30,14 +30,17 @@ namespace phi {
 class Place;
 
 // NOTE: Add needed type in the future
+// Move vector<int> before vector<bool>, because when
+// vector<bool> is before vector<int>, a python integer
+// list will be converted to vector<bool> in error.
 using Attribute = paddle::variant<bool,
                                   int,
                                   int64_t,
                                   float,
                                   double,
                                   std::string,
-                                  std::vector<bool>,
                                   std::vector<int>,
+                                  std::vector<bool>,
                                   std::vector<int64_t>,
                                   std::vector<float>,
                                   std::vector<double>,
diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
index a1895b6dfbd79..6e0c0f696fef4 100644
--- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
+++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
@@ -54,7 +54,7 @@ AttrType InferSpmdContext::AttrAt(size_t idx) const {
 }
 
 template <>
-bool InferSpmdContext::AttrAt<bool>(size_t idx) const {
+bool InferSpmdContext::AttrAt(size_t idx) const {
   try {
     auto attr = attrs_.at(idx);
     if (attr.type() == typeid(int)) {
@@ -70,6 +70,24 @@ bool InferSpmdContext::AttrAt<bool>(size_t idx) const {
   }
 }
 
+template <>
+std::vector<int> InferSpmdContext::AttrAt(size_t idx) const {
+  try {
+    auto attr = attrs_.at(idx);
+    if (attr.type() == typeid(std::vector<bool>)) {
+      std::vector<bool> val = PADDLE_GET_CONST(std::vector<bool>, attr);
+      return std::vector<int>(val.begin(), val.end());
+    } else {
+      return paddle::get<std::vector<int>>(attr);
+    }
+  } catch (paddle::bad_variant_access const& e) {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "Attribute cast error in InferSpmd Context, the input attr type is "
+        "`%s`, but the expected attribute type is `bool`.",
+        attrs_.at(idx).type().name()));
+  }
+}
+
 const Attribute& InferSpmdContext::AttrAt(size_t idx) const {
   return attrs_.at(idx);
 }
diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
index 3896bfcd6a2fe..23b147a4bb3d7 100644
--- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
@@ -138,8 +138,24 @@ struct InferSpmdFnImpl<Return (*)(Args...), infer_spmd_fn> {
     }                                                                     \
   }
 
+#define PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(attr_type) \
+  template <typename... Tail>                                                  \
+  struct InferSpmdFnCallHelper<const attr_type&, Tail...> {                    \
+    template <int in_idx, int attr_idx, typename... PreviousArgs>              \
+    static SpmdInfo Call(const InferSpmdContext& ctx,                          \
+                         PreviousArgs&... pargs) {                             \
+      attr_type arg = ctx.AttrAt<attr_type>(attr_idx);                         \
+      return InferSpmdFnCallHelper<Tail...>::template Call<in_idx,             \
+                                                           attr_idx + 1>(      \
+          ctx, pargs..., arg);                                                 \
+    }                                                                          \
+  }
+
   // TODO(chenweihang): support other attr type later as needed
   PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(bool);
+  PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(std::vector<int>);
+  PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(
+      std::vector<int64_t>);
 
   /* End case */
   template <typename T>
diff --git a/paddle/phi/infermeta/spmd_rules/reduction.cc b/paddle/phi/infermeta/spmd_rules/reduction.cc
new file mode 100644
index 0000000000000..24c90a1792341
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/reduction.cc
@@ -0,0 +1,178 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/infermeta/spmd_rules/reduction.h"
+
+#include "glog/logging.h"
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
+#include "paddle/phi/core/distributed/auto_parallel/utils.h"
+#include "paddle/phi/infermeta/spmd_rules/utils.h"
+
+namespace phi {
+namespace distributed {
+
+using phi::distributed::auto_parallel::str_join;
+
+////////////////// Utils Functions //////////////////
+std::string GetOutputNotation(int input_ndim,
+                              const std::string& input_axes,
+                              std::vector<int> reduce_dims,
+                              bool keep_dim) {
+  // convert the negative dim value to normal dim value
+  for (auto& reduce_dim : reduce_dims) {
+    if (reduce_dim < 0) {
+      reduce_dim = input_ndim + reduce_dim;
+    }
+  }
+
+  std::string output_axes = "";
+  for (int i = 0; i < input_ndim; i++) {
+    std::vector<int>::iterator iter =
+        std::find(reduce_dims.begin(), reduce_dims.end(), i);
+    if (iter != reduce_dims.end()) {
+      // if i is reduce dim, the corresponding input axis
+      // will not be appended at the end of output_axes
+      if (keep_dim) {
+        output_axes.append(1, '1');
+      }
+    } else {
+      // otherwise, the corresponding input axis
+      // will be appended at the end of output_axes
+      output_axes.append(1, input_axes[i]);
+    }
+  }
+
+  return output_axes;
+}
+
+SpmdInfo ReductionInferSpmd(const DistMetaTensor& x,
+                            const std::vector<int>& axis,
+                            bool keep_dim) {
+  // Step0: Verify input args based on reduction logic
+  auto x_shape = phi::vectorize(x.dims());
+  int x_ndim = x_shape.size();
+  auto x_dist_attr_src = x.dist_attr();
+  std::vector<int64_t> x_dims_mapping = x_dist_attr_src.dims_mapping();
+  PADDLE_ENFORCE_EQ(
+      x_ndim,
+      x_dims_mapping.size(),
+      phi::errors::InvalidArgument("The Tensor X's rank [%d] and X's "
+                                   "dims_mapping size [%d] are not matched.",
+                                   x_ndim,
+                                   x_dims_mapping.size()));
+
+  // Step1: Build Einsum Notation
+  // get einsum notation for input
+  std::string alphabet = "abcdefghijklmnopqrstuvwxyz";
+  std::string x_axes = alphabet.substr(0, x_ndim);
+
+  // get einsum notation for output
+  std::string out_axes = GetOutputNotation(x_ndim, alphabet, axis, keep_dim);
+
+  // Step2: Sharding Propogation
+  // Step2.1: Merge input shardings
+  std::pair<std::string, std::vector<int64_t>> x_sharding_info(x_axes,
+                                                               x_dims_mapping);
+  std::unordered_map<std::string, int64_t> axis_to_dim_map =
+      ShardingMergeForTensors({x_sharding_info});
+
+  // Step2.2: Infer output dimsmapping from merged input dimsmapping
+  std::vector<int64_t> out_dims_mapping =
+      GetDimsMappingForAxes(out_axes, axis_to_dim_map);
+
+  // initialize output dist_attr's process_mesh, batch_dim and dynamic dims with
+  // input dist_attr.
+  TensorDistAttr out_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src);
+  out_dist_attr.set_dims_mapping(out_dims_mapping);
+
+  // Step3: handle partial
+  // Step3.1 Output Partial
+  std::vector<int64_t> partial_on_dims =
+      ResoluteOutputPartialDimension(axis_to_dim_map, out_axes);
+  out_dist_attr.set_partial_status(
+      partial_on_dims /*, handle reduce_type in future  */);
+
+  // Step3.2  handle input tensor partial (TODO)
+  // If the op is a linear op, i.e. `linearity` is true, it supports
+  // the input to be partial. Otherwise, the input cannot be partial
+  // on reduced axes, we should reshard the input when the reduced
+  // axes are parital.
+  VLOG(4) << "ReductionInferSpmd:";
+  VLOG(4) << "axis: " << str_join(axis) << ", keep_dim: " << keep_dim;
+  VLOG(4) << "Einsum Notation: " << x_axes << " --> " << out_axes;
+  VLOG(4) << "Input0 shape: [" << str_join(x_shape) << "] "
+          << "dims_mapping: [" << str_join(x_dims_mapping) << "]";
+  VLOG(4) << "Output dims_mapping: [" + str_join(out_dims_mapping) + "] "
+          << "partial_on_dims: [" + str_join(partial_on_dims) + "]\n\n";
+
+  return {{x_dist_attr_src}, {out_dist_attr}};
+}
+
+SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x,
+                                   const DistMetaTensor& out,
+                                   const std::vector<int>& axis,
+                                   bool keep_dim) {
+  // Step0: Verify input args based on reduction logic
+  auto x_shape = phi::vectorize(x.dims());
+  auto out_shape = phi::vectorize(out.dims());
+  int x_ndim = x_shape.size();
+  int out_ndim = out_shape.size();
+  auto out_dist_attr_src = out.dist_attr();
+  std::vector<int64_t> out_dims_mapping = out_dist_attr_src.dims_mapping();
+  PADDLE_ENFORCE_EQ(
+      out_ndim,
+      out_dims_mapping.size(),
+      phi::errors::InvalidArgument("The Tensor Out's rank [%d] and Out's "
+                                   "dims_mapping size [%d] are not matched.",
+                                   out_ndim,
+                                   out_dims_mapping.size()));
+
+  // Step1: Build einsum notation
+  // get einsum notation for input
+  std::string alphabet = "abcdefghijklmnopqrstuvwxyz";
+  std::string x_axes = alphabet.substr(0, x_ndim);
+
+  // get einsum notation for output
+  std::string out_axes = GetOutputNotation(x_ndim, alphabet, axis, keep_dim);
+
+  // Step2: Sharding propogation
+  // Step2.1: Merge input shardings
+  std::unordered_map<std::string, int64_t> axis_to_dim_map =
+      ShardingMergeForTensors({{out_axes, out_dims_mapping}});
+
+  // Step2.2: Infer input dims mapping from output dims mapping
+  std::vector<int64_t> x_dims_mapping =
+      GetDimsMappingForAxes(x_axes, axis_to_dim_map, true);
+
+  // initialize input dist_attr's process_mesh, batch_dim and dynamic dims with
+  // input dist_attr.
+  TensorDistAttr x_dist_attr_dst(x.dist_attr());
+  x_dist_attr_dst.set_dims_mapping(x_dims_mapping);
+
+  // Step3: handle partial (TODO)
+
+  VLOG(4) << "ReductionInferSpmdReverse: ";
+  VLOG(4) << "Output shape:[" << str_join(out_shape) << "] dims_mapping: ["
+          << str_join(out_dims_mapping) << "]";
+  VLOG(4) << "Input0: "
+          << "shape: [" << str_join(x_shape) << "] "
+          << "dims_mapping: [" << str_join(x_dims_mapping) << "]\n\n";
+
+  return {{x_dist_attr_dst}, {out_dist_attr_src}};
+}
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/reduction.h b/paddle/phi/infermeta/spmd_rules/reduction.h
new file mode 100644
index 0000000000000..ed9341ddc6904
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/reduction.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+#include "paddle/phi/core/distributed/type_defs.h"
+
+namespace phi {
+namespace distributed {
+
+SpmdInfo ReductionInferSpmd(const DistMetaTensor& x,
+                            const std::vector<int>& axis,
+                            bool keep_dim);
+
+SpmdInfo ReductionInferSpmdReverse(const DistMetaTensor& x,
+                                   const DistMetaTensor& out,
+                                   const std::vector<int>& axis,
+                                   bool keep_dim);
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h
index 4406e17495d14..71a726e3d8edc 100644
--- a/paddle/phi/infermeta/spmd_rules/rules.h
+++ b/paddle/phi/infermeta/spmd_rules/rules.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/default_data_parallel.h"
 #include "paddle/phi/infermeta/spmd_rules/elementwise.h"
 #include "paddle/phi/infermeta/spmd_rules/matmul.h"
+#include "paddle/phi/infermeta/spmd_rules/reduction.h"
 #include "paddle/phi/infermeta/spmd_rules/replicated.h"
 
 /**
@@ -46,6 +47,16 @@ PD_REGISTER_SPMD_RULE(matmul,
                       PD_INFER_SPMD(phi::distributed::MatmulInferSpmd),
                       PD_INFER_SPMD(phi::distributed::MatmulInferSpmdReverse));
 
+PD_REGISTER_SPMD_RULE(
+    elementwise_unary,
+    PD_INFER_SPMD(phi::distributed::ElementwiseUnaryInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ElementwiseUnaryInferSpmdReverse));
+
+PD_REGISTER_SPMD_RULE(
+    elementwise_binary,
+    PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ElementwiseBinaryInferSpmdReverse));
+
 // default data parallel rule
 PD_REGISTER_SPMD_RULE(
     unsqueeze,
@@ -408,5 +419,43 @@ PD_REGISTER_SPMD_RULE(
 
 // TODO(pkuzyc): add multiary elementwise rule
 
+// reduction rule
+PD_REGISTER_SPMD_RULE(
+    all,
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));
+PD_REGISTER_SPMD_RULE(
+    amax,
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));
+PD_REGISTER_SPMD_RULE(
+    amin,
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));
+PD_REGISTER_SPMD_RULE(
+    any,
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));
+PD_REGISTER_SPMD_RULE(
+    frobenius_norm,
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));
+PD_REGISTER_SPMD_RULE(
+    max,
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));
+PD_REGISTER_SPMD_RULE(
+    min,
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));
+PD_REGISTER_SPMD_RULE(
+    prod,
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));
+PD_REGISTER_SPMD_RULE(
+    sum,
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
+    PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));
+
 }  // namespace distributed
 }  // namespace phi
diff --git a/test/auto_parallel/spmd_rules/test_reduction_rule.py b/test/auto_parallel/spmd_rules/test_reduction_rule.py
index f8069ee226583..ea8398d246fcc 100644
--- a/test/auto_parallel/spmd_rules/test_reduction_rule.py
+++ b/test/auto_parallel/spmd_rules/test_reduction_rule.py
@@ -13,13 +13,14 @@
 # limitations under the License.
 
 import unittest
+from collections import OrderedDict
 
-from paddle.distributed.auto_parallel.static.completion import get_spmd_rule
 from paddle.distributed.auto_parallel.static.dist_attribute import (
     DistTensorSpec,
     TensorDistAttr,
 )
 from paddle.distributed.fleet import auto
+from paddle.framework import core
 
 
 class TestReductionSPMDRule(unittest.TestCase):
@@ -28,7 +29,7 @@ class TestReductionSPMDRule(unittest.TestCase):
     """
 
     def setUp(self):
-        self.rule = get_spmd_rule("max")
+        self.rule = core.get_phi_spmd_rule("max")
 
         x_shape = [64, 32]
         process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3])
@@ -40,11 +41,7 @@ def setUp(self):
 
         self.out_dist_tensor_spec = DistTensorSpec(self.x_dist_tensor_spec)
 
-        self.attrs = {
-            'keep_dim': False,
-            'axis': [0],
-            'linearity': False,
-        }
+        self.attrs = OrderedDict([('axis', [0]), ('keep_dim', False)])
 
     def test_single_mesh_dim(self):
         # reduce on dim 0, keep_dim = false
@@ -53,7 +50,7 @@ def test_single_mesh_dim(self):
         self.attrs['axis'] = [0]
         self.x_dist_tensor_spec.set_dims_mapping([0, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -73,7 +70,7 @@ def test_single_mesh_dim(self):
         self.attrs['axis'] = [0]
         self.x_dist_tensor_spec.set_dims_mapping([0, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -89,7 +86,7 @@ def test_single_mesh_dim(self):
         self.attrs['axis'] = [1]
         self.x_dist_tensor_spec.set_dims_mapping([0, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -104,7 +101,7 @@ def test_single_mesh_dim(self):
         self.attrs['axis'] = [1]
         self.x_dist_tensor_spec.set_dims_mapping([0, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -119,7 +116,7 @@ def test_single_mesh_dim(self):
         self.attrs['axis'] = [0, 1]
         self.x_dist_tensor_spec.set_dims_mapping([0, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -135,7 +132,7 @@ def test_single_mesh_dim(self):
         self.attrs['axis'] = [0, 1]
         self.x_dist_tensor_spec.set_dims_mapping([0, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -156,7 +153,7 @@ def test_multi_mesh_dim(self):
         self.attrs['axis'] = [1, 2]
         self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -167,6 +164,7 @@ def test_multi_mesh_dim(self):
 
         self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1])
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0])
+        self.assertEqual(infered_output_dist_attrs[0]._is_partial(), False)
 
         # reduce on dim 1, 2, keep_dim = false
         # [-1, 0, 1] --> [-1, 0, 1], [-1], partial_on_dim:[0, 1]
@@ -174,7 +172,7 @@ def test_multi_mesh_dim(self):
         self.attrs['axis'] = [1, 2]
         self.x_dist_tensor_spec.set_dims_mapping([-1, 0, 1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -192,7 +190,7 @@ def test_multi_mesh_dim(self):
         self.attrs['axis'] = [1, 2]
         self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -207,7 +205,7 @@ def test_multi_mesh_dim(self):
         self.attrs['axis'] = [1, 2]
         self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -225,7 +223,7 @@ def test_multi_mesh_dim(self):
         self.attrs['axis'] = [1, 2]
         self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['axis'], self.attrs['keep_dim']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -243,7 +241,10 @@ def test_backward_single_mesh_dim(self):
         self.out_dist_tensor_spec.shape = [32]
         self.out_dist_tensor_spec.set_dims_mapping([-1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -262,7 +263,10 @@ def test_backward_single_mesh_dim(self):
         self.out_dist_tensor_spec.shape = [1, 32]
         self.out_dist_tensor_spec.set_dims_mapping([-1, -1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -277,7 +281,10 @@ def test_backward_single_mesh_dim(self):
         self.out_dist_tensor_spec.shape = [64]
         self.out_dist_tensor_spec.set_dims_mapping([0])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -292,7 +299,10 @@ def test_backward_single_mesh_dim(self):
         self.out_dist_tensor_spec.shape = [64, 1]
         self.out_dist_tensor_spec.set_dims_mapping([0, -1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -307,7 +317,10 @@ def test_backward_single_mesh_dim(self):
         self.out_dist_tensor_spec.shape = []
         self.out_dist_tensor_spec.set_dims_mapping([])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -322,7 +335,10 @@ def test_backward_single_mesh_dim(self):
         self.out_dist_tensor_spec.shape = [1, 1]
         self.out_dist_tensor_spec.set_dims_mapping([-1, -1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -343,7 +359,10 @@ def test_backward_multi_mesh_dim(self):
         self.out_dist_tensor_spec.shape = [96]
         self.out_dist_tensor_spec.set_dims_mapping([0])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -362,7 +381,10 @@ def test_backward_multi_mesh_dim(self):
         self.out_dist_tensor_spec.shape = [96]
         self.out_dist_tensor_spec.set_dims_mapping([-1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -377,7 +399,10 @@ def test_backward_multi_mesh_dim(self):
         self.out_dist_tensor_spec.shape = [96]
         self.out_dist_tensor_spec.set_dims_mapping([1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -392,13 +417,48 @@ def test_backward_multi_mesh_dim(self):
         self.out_dist_tensor_spec.shape = [96, 1, 1]
         self.out_dist_tensor_spec.set_dims_mapping([0, -1, -1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec], [self.out_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+
+        self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1])
+        self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1])
+
+    def test_backward_multi_mesh_dim_parital(self):
+        # reduction on dim 1, 2, keep_dim = true, partial_dim=[1]
+        # [0, -1, -1] --> [0, -1, -1], [0, -1, -1] (output --> input, output)
+        # output parital_dim: [1], input parital_dim: []
+        out_shape = [96, 1, 1]
+        process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]])
+
+        self.x_dist_tensor_spec.set_process_mesh(process_mesh)
+        self.x_dist_tensor_spec.shape = [96, 24, 48]
+        out_tensor_dist_attr = TensorDistAttr()
+        out_tensor_dist_attr.dims_mapping = [0, -1, -1]
+        out_tensor_dist_attr.process_mesh = process_mesh
+        out_tensor_dist_attr._set_partial_dims([1])
+        self.out_dist_tensor_spec = DistTensorSpec(
+            out_shape, out_tensor_dist_attr
+        )
+
+        self.attrs['keep_dim'] = True
+        self.attrs['axis'] = [1, 2]
+        result_dist_attrs = self.rule.infer_backward(
+            self.x_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['axis'],
+            self.attrs['keep_dim'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
 
         self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1])
         self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1])
+        self.assertEqual(infered_input_dist_attrs[0]._is_partial(), False)
 
 
 if __name__ == "__main__":

From 4920462600b26c1050c29ac1caafc3fac72362ba Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Wed, 20 Sep 2023 19:48:41 +0800
Subject: [PATCH 004/115] [Dy2St]Modify jit.load into Lazy Initialization Mode
 for backward program (#57240)

* [Dy2St]Modify jit.load into Lazy Initialization Mode for backward program

* fix is_test

* fix typo

* fix logic

* fix build scope logic
---
 .../eager/to_static/run_program_op_func.h     |  7 ++-
 .../eager/to_static/run_program_op_node.h     | 45 +++++++++++--------
 paddle/fluid/framework/executor_cache.cc      | 23 +++++-----
 paddle/fluid/framework/executor_cache.h       |  2 +-
 python/paddle/jit/translated_layer.py         | 22 ++++-----
 5 files changed, 55 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h
index f0ca7c1518b24..a3bb3a2879300 100644
--- a/paddle/fluid/eager/to_static/run_program_op_func.h
+++ b/paddle/fluid/eager/to_static/run_program_op_func.h
@@ -140,8 +140,11 @@ inline void run_program_ad_func(
   RunProgramAPI(
       x_tmp, params_tmp, out, step_scope, dout, require_any_grad, attrs);
   VLOG(2) << "start run run_program grad";
-
-  if (require_any_grad) {
+  auto is_test = false;
+  if (attrs.count("is_test")) {
+    is_test = PADDLE_GET_CONST(bool, attrs.at("is_test"));
+  }
+  if (!is_test && require_any_grad) {
     auto x_names =
         PADDLE_GET_CONST(std::vector<std::string>, attrs.at("x_names"));
 
diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h
index ebab84ccd1521..fd0d6563945a5 100644
--- a/paddle/fluid/eager/to_static/run_program_op_node.h
+++ b/paddle/fluid/eager/to_static/run_program_op_node.h
@@ -315,14 +315,16 @@ static void ShareTensorsFromScopeByValue(
 static void ShareTensorsFromScopeWithPartialBlock(
     const std::vector<Tensor *> &tensors,
     const paddle::framework::BlockDesc &forward_global_block,
-    const paddle::framework::BlockDesc &backward_global_block,
+    const paddle::framework::BlockDesc *backward_global_block,
     paddle::framework::Scope *scope) {
   for (size_t i = 0; i < tensors.size(); ++i) {
     auto &name = tensors[i]->name();
+    bool in_forward_block = forward_global_block.HasVar(name);
+    bool in_backward_block =
+        backward_global_block && backward_global_block->HasVar(name);
     if (name == paddle::framework::kEmptyVarName ||
         name == paddle::framework::kFakeVarName ||
-        (!forward_global_block.HasVar(name) &&
-         !backward_global_block.HasVar(name))) {
+        (!in_forward_block && !in_backward_block)) {
       VLOG(2) << "find tensor name is " << name << ", skip it!";
       continue;
     }
@@ -660,10 +662,16 @@ inline void RunProgramAPI(
 
   auto *forward_global_block = PADDLE_GET_CONST(
       paddle::framework::BlockDesc *, attrs.at("forward_global_block"));
-  auto *backward_global_block = PADDLE_GET_CONST(
-      paddle::framework::BlockDesc *, attrs.at("backward_global_block"));
   auto *forward_program = forward_global_block->Program();
-  auto *backward_program = backward_global_block->Program();
+
+  paddle::framework::BlockDesc *backward_global_block = nullptr;
+  paddle::framework::ProgramDesc *backward_program = nullptr;
+
+  if (!is_test) {
+    backward_global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *,
+                                             attrs.at("backward_global_block"));
+    backward_program = backward_global_block->Program();
+  }
 
   auto &interpretercore_info_cache =
       paddle::framework::InterpreterCoreInfoCache::Instance();
@@ -710,9 +718,12 @@ inline void RunProgramAPI(
               global_inner_scope);
     }
     // Step 3. get all eager gc vars
-    std::set<std::string> skip_eager_delete_vars =
-        paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
-            *backward_program);
+    std::set<std::string> skip_eager_delete_vars;
+    if (!is_test) {
+      skip_eager_delete_vars =
+          paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet(
+              *backward_program);
+    }
 
     // all out_vars are skip_eager_var
     skip_eager_delete_vars.insert(output_names.begin(), output_names.end());
@@ -765,19 +776,15 @@ inline void RunProgramAPI(
         1);
     interpreter_core->Run({});
   }
-
+  VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
   {
     paddle::platform::RecordEvent record_event(
         "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1);
     // Get Output
     details::ShareTensorsFromScopeWithPartialBlock(
-        out, *forward_global_block, *backward_global_block, global_inner_scope);
-    details::ShareTensorsFromScopeWithPartialBlock(dout,
-                                                   *forward_global_block,
-                                                   *backward_global_block,
-                                                   global_inner_scope);
-
-    VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front());
+        out, *forward_global_block, backward_global_block, global_inner_scope);
+    details::ShareTensorsFromScopeWithPartialBlock(
+        dout, *forward_global_block, backward_global_block, global_inner_scope);
 
     if (is_test || !require_any_grad) {
       VLOG(4) << "don't require any grad, set this scope can reused";
@@ -939,11 +946,11 @@ inline void RunProgramGradAPI(
     // Step 4. get outputs
     details::ShareTensorsFromScopeWithPartialBlock(x_grad,
                                                    *forward_global_block,
-                                                   *backward_global_block,
+                                                   backward_global_block,
                                                    global_inner_scope);
     details::ShareTensorsFromScopeWithPartialBlock(params_grad,
                                                    *forward_global_block,
-                                                   *backward_global_block,
+                                                   backward_global_block,
                                                    global_inner_scope);
     VLOG(4) << "after backward gc all vars";
     global_inner_scope->SetCanReused(true);
diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc
index 1044f785451e0..64d5ce24d20fe 100644
--- a/paddle/fluid/framework/executor_cache.cc
+++ b/paddle/fluid/framework/executor_cache.cc
@@ -356,7 +356,7 @@ std::shared_ptr<InterpreterCore> CreateNewIRInterpreterCoreInfoToCache(
 std::unique_ptr<::pir::Program> ConstructFowardIrProgram(
     const paddle::framework::BlockDesc *forward_global_block,
     const paddle::framework::BlockDesc *backward_global_block,
-    const std::vector<std::string> output_names,
+    const std::vector<std::string> &output_names,
     const std::vector<paddle::Tensor> &x,
     const std::vector<std::string> &x_names,
     const std::vector<paddle::Tensor> &params,
@@ -415,19 +415,21 @@ std::unique_ptr<::pir::Program> ConstructFowardIrProgram(
   }
 
   std::set<std::string> set_parameter_names;
-  for (auto op_desc : backward_global_block->Program()->Block(0).AllOps()) {
-    for (const auto &n : op_desc->Inputs()) {
-      const auto &input_var_names = n.second;
-      for (const auto &var_name : input_var_names) {
-        set_parameter_names.insert(var_name);
-      }
-    }
-  }
-
   for (auto &t : output_names) {
     set_parameter_names.insert(t);
   }
 
+  if (backward_global_block != nullptr) {
+    for (auto op_desc : backward_global_block->Program()->Block(0).AllOps()) {
+      for (const auto &n : op_desc->Inputs()) {
+        const auto &input_var_names = n.second;
+        for (const auto &var_name : input_var_names) {
+          set_parameter_names.insert(var_name);
+        }
+      }
+    }
+  }
+
   for (auto &name : set_parameter_names) {
     if (!set_output_names.count(name)) {
       continue;
@@ -443,7 +445,6 @@ std::unique_ptr<::pir::Program> ConstructFowardIrProgram(
     op_desc->SetInput("x", {name});
     op_desc->SetOutput("out", {"@EMPTY@"});
   }
-
   paddle::translator::ProgramTranslator program_translator(&local_program,
                                                            program.get());
 
diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h
index f55808175f09f..d30ed6396e65e 100644
--- a/paddle/fluid/framework/executor_cache.h
+++ b/paddle/fluid/framework/executor_cache.h
@@ -253,7 +253,7 @@ std::shared_ptr<InterpreterCore> CreateNewIRInterpreterCoreInfoToCache(
 std::unique_ptr<::pir::Program> ConstructFowardIrProgram(
     const paddle::framework::BlockDesc* forward_global_block,
     const paddle::framework::BlockDesc* backward_global_block,
-    const std::vector<std::string> output_names,
+    const std::vector<std::string>& output_names,
     const std::vector<paddle::Tensor>& x,
     const std::vector<std::string>& x_names,
     const std::vector<paddle::Tensor>& params,
diff --git a/python/paddle/jit/translated_layer.py b/python/paddle/jit/translated_layer.py
index a7f51c1a8c164..fce3211f23878 100644
--- a/python/paddle/jit/translated_layer.py
+++ b/python/paddle/jit/translated_layer.py
@@ -347,15 +347,11 @@ def __init__(self, program_desc):
         self._suffix_varname_dict = None
         # forward program
         self._infer_program_desc = self._preprocess(program_desc)
-        # forward + backward program
-        self._train_program_desc = self._append_backward_desc(
-            self._infer_program_desc
-        )
 
     # forward:
     @switch_to_static_graph
     def _create_forward_train_program(self):
-        whole_program = _build_program_by_desc(self._train_program_desc)
+        whole_program = _build_program_by_desc(self.train_program)
         end_op_index = self._infer_program_desc.block(0).op_size()
         if end_op_index > 0:
             return add_build_strategy_for(whole_program, 0, end_op_index)
@@ -369,7 +365,7 @@ def _forward_program_desc(self):
     # backward
     @switch_to_static_graph
     def _create_backward_train_program(self):
-        whole_program = _build_program_by_desc(self._train_program_desc)
+        whole_program = _build_program_by_desc(self.train_program)
         start_op_index = self._infer_program_desc.block(0).op_size() + len(
             self._output_descs
         )
@@ -389,9 +385,9 @@ def _backward_program_desc(self):
     def infer_program(self):
         return self._infer_program_desc
 
-    @property
+    @LazyInitialized
     def train_program(self):
-        return self._train_program_desc
+        return self._append_backward_desc(self._infer_program_desc)
 
     @property
     def forward_program(self):
@@ -1010,10 +1006,15 @@ def _run_dygraph(instance, input, program_holder):
             (
                 'forward_global_block',
                 forward_program.block(0),
-                'backward_global_block',
-                program_holder.backward_program.block(0),
             )
         )
+        if not instance._is_test:
+            attrs.extend(
+                (
+                    'backward_global_block',
+                    program_holder.backward_program.block(0),
+                )
+            )
 
     _legacy_C_ops.run_program(
         _valid_vars(input_vars),
@@ -1055,7 +1056,6 @@ def _run_static_graph(input, program_holder, trace_program):
         trace_program, exclude=param_var_names
     )
     trace_program.flush()
-    output_names = [var.name() for var in program_holder.output_descs]
     # append blocks from 'trace_program'
     _append_block(
         main_program,

From 70fe4b4961ce72adcf0a90532cd159e112feac58 Mon Sep 17 00:00:00 2001
From: risemeup1 <62429225+risemeup1@users.noreply.github.com>
Date: Wed, 20 Sep 2023 19:50:45 +0800
Subject: [PATCH 005/115] [gpups ci] (#52962)

* gpups information

* Update gpups_test.sh

* modify gpups,test=document_fix
---
 tools/gpups_test.sh | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh
index 86be766397652..31ad58a86456e 100644
--- a/tools/gpups_test.sh
+++ b/tools/gpups_test.sh
@@ -13,6 +13,20 @@
 # limitations under the License.
 
 
+function collect_failed_tests() {
+    for file in `ls $tmp_dir`; do
+        exit_code=0
+        grep -q 'The following tests FAILED:' $tmp_dir/$file||exit_code=$?
+        if [ $exit_code -ne 0 ]; then
+            failuretest=''
+        else
+            failuretest=`grep -A 10000 'The following tests FAILED:' $tmp_dir/$file | sed 's/The following tests FAILED://g'|sed '/^$/d'`
+            failed_test_lists="${failed_test_lists}
+            ${failuretest}"
+        fi
+    done
+}
+
 serial_list="^test_conv2d_op$|\
 ^test_conv2d_transpose_op$|\
 ^test_conv3d_op$"
@@ -48,7 +62,6 @@ parallel_list="^init_phi_test$|\
 ^test_dygraph_sharding_stage2_bf16$|\
 ^test_executor_feed_non_tensor$|\
 ^test_flash_attention$|\
-^test_flash_attention_deterministic$|\
 ^test_fused_adam_op$|\
 ^test_fused_attention_no_dropout$|\
 ^test_fused_attention_op$|\
@@ -93,16 +106,24 @@ parallel_list="^init_phi_test$|\
 ^test_top_k_v2_op$"
 
 cd ${work_dir}/build
-
+tmp_dir=`mktemp -d`
+tmpfile_rand=`date +%s%N`
+tmpfile=$tmp_dir/$tmpfile_rand"_"$i
 set +e
-ctest --output-on-failure -R "($parallel_list)" --timeout 120 -j4
+ctest --output-on-failure -R "($parallel_list)" --timeout 120 -j4 | tee -a $tmpfile; test ${PIPESTATUS[0]} -eq 0;
 EXIT_CODE_1=$?
 
-ctest --output-on-failure -R "($serial_list)" --timeout 120 -j1
+ctest --output-on-failure -R "($serial_list)" --timeout 120 -j1 | tee -a $tmpfile; test ${PIPESTATUS[0]} -eq 0;
 EXIT_CODE_2=$?
 set -e
 
 if [ "${EXIT_CODE_1}" != "0" ] || [ "${EXIT_CODE_2}" != "0" ];then
   echo "Sorry, some tests failed."
+  collect_failed_tests
+  rm -f $tmp_dir/*
+  echo "Summary Failed Tests... "
+  echo "========================================"
+  echo "The following tests FAILED: "
+  echo "${failuretest}" | sort -u
   exit 8
 fi

From 0cb7a2812829263dc5bab3597b7bd07127e81bd6 Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Wed, 20 Sep 2023 20:13:02 +0800
Subject: [PATCH 006/115] correct default_dtype for ones, zeros, linspace,
 logspace, eye, full (#57487)

---
 python/paddle/tensor/creation.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index d6cad4b8eca34..c3e814cc906d4 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -303,7 +303,7 @@ def linspace(start, stop, num, dtype=None, name=None):
 
     """
     if dtype is None:
-        dtype = 'float32'
+        dtype = paddle.get_default_dtype()
     tensor_num = num
     tensor_start = start
     tensor_stop = stop
@@ -434,7 +434,7 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None):
             [1.]
     """
     if dtype is None:
-        dtype = 'float32'
+        dtype = paddle.get_default_dtype()
     tensor_num = num
     tensor_start = start
     tensor_stop = stop
@@ -1010,7 +1010,7 @@ def ones(shape, dtype=None, name=None):
              [1. 1.]]
     """
     if dtype is None:
-        dtype = core.VarDesc.VarType.FP32
+        dtype = paddle.get_default_dtype()
     return fill_constant(value=1.0, shape=shape, dtype=dtype, name=name)
 
 
@@ -1094,7 +1094,7 @@ def zeros(shape, dtype=None, name=None):
              [0. 0.]]
     """
     if dtype is None:
-        dtype = 'float32'
+        dtype = paddle.get_default_dtype()
     return fill_constant(value=0.0, shape=shape, dtype=dtype, name=name)
 
 
@@ -1176,8 +1176,8 @@ def _check_attr(attr, message):
     _check_attr(num_rows, "num_rows")
 
     if dtype is None:
-        dtype = core.VarDesc.VarType.FP32
-    elif not isinstance(dtype, core.VarDesc.VarType):
+        dtype = paddle.get_default_dtype()
+    if not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
     if num_columns is not None:
         _check_attr(num_columns, "num_columns")
@@ -1270,7 +1270,7 @@ def full(shape, fill_value, dtype=None, name=None):
     """
 
     if dtype is None:
-        dtype = 'float32'
+        dtype = paddle.get_default_dtype()
 
     return fill_constant(shape=shape, dtype=dtype, value=fill_value, name=name)
 

From be7ae2c74d19fc0ea0c1e205478389b98c537595 Mon Sep 17 00:00:00 2001
From: Yiqun Liu <Xreki@users.noreply.github.com>
Date: Wed, 20 Sep 2023 21:13:00 +0800
Subject: [PATCH 007/115] Try to fix performance drop. (#57525)

---
 paddle/phi/kernels/gpu/flip_kernel.cu           |  7 ++++++-
 paddle/phi/kernels/gpu/index_put_grad_kernel.cu | 12 ++++++++++--
 paddle/phi/kernels/gpu/index_put_kernel.cu      |  6 +++++-
 paddle/phi/kernels/gpu/roll_kernel_impl.h       |  6 +++++-
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu
index f271eba26e0ab..71fdbcaaa68bb 100644
--- a/paddle/phi/kernels/gpu/flip_kernel.cu
+++ b/paddle/phi/kernels/gpu/flip_kernel.cu
@@ -40,7 +40,12 @@ __global__ void FlipCudaKernel(const T* in_data,
   int64_t cur_indices = idx;
   int64_t rem = 0;
   int64_t dst_offset = 0;
-  for (int i = 0; i < rank; ++i) {
+
+#pragma unroll
+  for (int i = 0; i < DDim::kMaxRank; ++i) {
+    if (i >= rank) {
+      break;
+    }
     int64_t temp = cur_indices;
     cur_indices = cur_indices / stride[i];
     rem = temp - cur_indices * stride[i];
diff --git a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu
index 7e584e5c10318..915c7f40fa2cb 100644
--- a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu
@@ -40,7 +40,11 @@ __global__ void SetZeroCudaKernel(int64_t** indices,
 
   int64_t cur_ix = 0;
   int64_t offset = 0;
-  for (int i = 0; i < rank; ++i) {
+#pragma unroll
+  for (int i = 0; i < DDim::kMaxRank; ++i) {
+    if (i >= rank) {
+      break;
+    }
     cur_ix = (static_cast<int64_t>(*(indices[i] + idx)));
     if (cur_ix < 0) {
       cur_ix += shape[i];
@@ -69,7 +73,11 @@ __global__ void IndexPutGradCudaKernel(
 
   int64_t cur_ix = 0;
   int64_t offset = 0;
-  for (int i = 0; i < rank; ++i) {
+#pragma unroll
+  for (int i = 0; i < DDim::kMaxRank; ++i) {
+    if (i >= rank) {
+      break;
+    }
     cur_ix = (static_cast<int64_t>(*(indices[i] + idx)));
     if (cur_ix < 0) {
       cur_ix += shape[i];
diff --git a/paddle/phi/kernels/gpu/index_put_kernel.cu b/paddle/phi/kernels/gpu/index_put_kernel.cu
index ccbd19aaba681..3af220ce16b31 100644
--- a/paddle/phi/kernels/gpu/index_put_kernel.cu
+++ b/paddle/phi/kernels/gpu/index_put_kernel.cu
@@ -41,7 +41,11 @@ __global__ void IndexPutCudaKernel(const T* x,
     return;
   }
   int64_t offset = 0;
-  for (int i = 0; i < rank; ++i) {
+#pragma unroll
+  for (int i = 0; i < DDim::kMaxRank; ++i) {
+    if (i >= rank) {
+      break;
+    }
     cur_ix = (static_cast<int64_t>(*(indices[i] + idx)));
     if (cur_ix < 0) {
       cur_ix += shape[i];
diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h
index 38e2a6ff669ad..c7ffcb2d5ca52 100644
--- a/paddle/phi/kernels/gpu/roll_kernel_impl.h
+++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h
@@ -40,7 +40,11 @@ __global__ void RollCudaKernel(const T* input,
   int64_t output_idx = idx;
   int64_t new_dim_idx = 0;
 
-  for (size_t i = 0; i < rank; i++) {
+#pragma unroll
+  for (size_t i = 0; i < DDim::kMaxRank; i++) {
+    if (i >= rank) {
+      break;
+    }
     new_dim_idx = (output_idx / strides[i]) % sizes[i] + shifts[i];
     if (new_dim_idx >= sizes[i]) {
       output_idx += (shifts[i] - sizes[i]) * strides[i];

From c5d0e0c6b6930f8e25d24bf9c1ff189657552726 Mon Sep 17 00:00:00 2001
From: Yuang Liu <liuyuang@baidu.com>
Date: Thu, 21 Sep 2023 07:24:41 +0800
Subject: [PATCH 008/115] sharding stage 2 main grad bug fix (#57537)

---
 python/paddle/distributed/sharding/group_sharded.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py
index 2bbc93259eaa8..350f6eff4d001 100644
--- a/python/paddle/distributed/sharding/group_sharded.py
+++ b/python/paddle/distributed/sharding/group_sharded.py
@@ -28,6 +28,9 @@
 from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import (
     GroupShardedScaler,
 )
+from paddle.distributed.fleet.utils.mix_precision_utils import (
+    MixPrecisionOptimizer,
+)
 from paddle.distributed.utils.log_utils import get_logger
 from paddle.optimizer import Optimizer
 
@@ -111,9 +114,10 @@ def group_sharded_parallel(
     assert isinstance(
         model, paddle.nn.Layer
     ), "The model must be the instance of paddle.nn.Layer."
-    assert isinstance(
-        optimizer, Optimizer
-    ), "The optimizer must be the instance of paddle.optimizer.Optimizer."
+    assert isinstance(optimizer, (MixPrecisionOptimizer, Optimizer)), (
+        "The optimizer must be the instance of paddle.optimizer.Optimizer "
+        "or MixPrecisionOptimizer for main grad."
+    )
     assert level in [
         'os',
         'os_g',

From 058b008e721e87c2f7b25079d49c66b47849d175 Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Thu, 21 Sep 2023 09:11:47 +0800
Subject: [PATCH 009/115] =?UTF-8?q?=E3=80=90pir=E3=80=91add=20all=20Slice?=
 =?UTF-8?q?=20newir=20test=20=20(#57529)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add reference of lbfgs

* add reference of lbfgs

* tmp

* split gen modify

* fix conflict

* add split

* fix bug

* fix bug

* test split

* add meta tensor

* refine code

* fix bug

* fix bug

* fix comflict

* Call _C_ops.sum in new ir

* modify concat kernel choose

* modify ci

* modify sum zero_dim optest

* modify split_with_num api

* modify split -1

* modify split test

* fix bug

* xxx

* delete extra modify

* add add_n

* tmp

* add split_with_num_grad

* expand first

* expand first

* modify split grad num bug

* modify ci

* modify ci

* clear code

* modify

* recover

* add add_n stop_gradient infer

* modify opreslut to value

* fix conflict

* recover to aviod conflict

* recover to aviod conflict

* modify opreslut to value

* recover complex tanh

* modify add_n optest

* skip bfp16

* modify split bf16

* fix conflict

* modify expand special case

* delete print

* code style

* slice optest pass

---------

Co-authored-by: zhangbo9674 <zhangbo54@baidu.com>
Co-authored-by: 0x45f <wangzhen45@baidu.com>
---
 .../pir/dialect/op_generator/op_build_gen.py  |  1 +
 test/legacy_test/test_slice_op.py             | 56 +++++++++++++------
 2 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py
index bfb20bb8e283d..33bb81e43bf64 100644
--- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py
@@ -19,6 +19,7 @@
     'SplitWithNumInferMeta',
     'ConcatInferMeta',
     'ReduceIntArrayAxisInferMeta',
+    'SliceRawInferMeta',
 }
 
 _PREPARE_DATA_WITH_VECTOR_INT64_MTTABLE_ATTRIBUTE = {'FrobeniusNormOp'}
diff --git a/test/legacy_test/test_slice_op.py b/test/legacy_test/test_slice_op.py
index 194e933e1d0ec..065251b246928 100644
--- a/test/legacy_test/test_slice_op.py
+++ b/test/legacy_test/test_slice_op.py
@@ -71,7 +71,11 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         self.check_grad(
-            ['Input'], 'Out', max_relative_error=0.006, check_prim=True
+            ['Input'],
+            'Out',
+            max_relative_error=0.006,
+            check_prim=True,
+            check_new_ir=True,
         )
 
 
@@ -157,7 +161,11 @@ def test_check_output(self):
 
     def test_check_grad_normal(self):
         self.check_grad(
-            ['Input'], 'Out', max_relative_error=0.006, check_prim=True
+            ['Input'],
+            'Out',
+            max_relative_error=0.006,
+            check_prim=True,
+            check_new_ir=True,
         )
 
 
@@ -195,10 +203,12 @@ def config(self):
         self.starts_infer = [-1, 0, -1]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad(
+            ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True
+        )
 
 
 # Situation 2: starts(list, have tensor), ends(list, no tensor)
@@ -238,10 +248,12 @@ def config(self):
         self.starts_infer = [1, -1, 2]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_dygraph=True, check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad(
+            ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True
+        )
 
 
 class TestSliceOp_decs_dim_5_starts_ListTensor(
@@ -289,10 +301,12 @@ def config(self):
         self.out = self.input[1, 0:3, 2:4, :]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad(
+            ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True
+        )
 
 
 # Situation 4: starts(tensor), ends(tensor)
@@ -325,10 +339,12 @@ def config(self):
         self.out = self.input[1:3, 0:3, 2:4, :]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad(
+            ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True
+        )
 
 
 # Situation 5: starts(tensor), ends(tensor)
@@ -362,10 +378,12 @@ def config(self):
         self.out = self.input[1, 0, 2:4, :]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad(
+            ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True
+        )
 
 
 # Situation 6: starts(tensor), ends(list, have tensor)
@@ -406,10 +424,12 @@ def config(self):
         self.ends_infer = [-1, 3, 4]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
+        self.check_grad(
+            ['Input'], 'Out', max_relative_error=0.006, check_new_ir=True
+        )
 
 
 class TestSliceOp_ZeroDim(OpTest):
@@ -448,10 +468,10 @@ def config(self):
         self.out = self.input[0:20, 1:3, 1:3]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out')
+        self.check_grad(['Input'], 'Out', check_new_ir=True)
 
 
 # Test CUDA float16
@@ -499,6 +519,7 @@ def test_check_grad_normal(self):
                 ['Input'],
                 'Out',
                 check_prim=True,
+                check_new_ir=True,
             )
 
 
@@ -546,6 +567,7 @@ def test_check_grad_normal(self):
                 'Out',
                 numeric_grad_delta=0.5,
                 check_prim=True,
+                check_new_ir=True,
             )
 
 
@@ -578,7 +600,7 @@ def test_check_output(self):
         self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['Input'], 'Out', check_prim=True)
+        self.check_grad(['Input'], 'Out', check_prim=True, check_new_ir=True)
 
 
 # Test python API

From 164abf27d2ae1d8e90691b26bc01789002535d46 Mon Sep 17 00:00:00 2001
From: Sonder <55493212+AndSonder@users.noreply.github.com>
Date: Thu, 21 Sep 2023 09:49:37 +0800
Subject: [PATCH 010/115] Support control flow for static build [Step 2:
 support conditional_block] (#56696)

* add conditional_block to OperatorBasesHandledInStaticBuild

* run op in FakeInitializeOutputsForOperatorBase

* add init_success judge

* fix build error

* fix

* add SetSubBlockCore func

* add PreStaticRun func

* add PreStaticRun to interpreter_base and new_ir_inter

* recover codes

* add PreStaticBuild and BlockCanBeStaticBuilt

* fix logic about RunPreStaticBuild

* change CreateOpFromOpDesc type

* fix build error

* fix build error

* remove IsOperatorBasesHandledInStaticBuild

* recover BlockCanBeStaticBuilt

* add logic about conditional_block run static build

* recover codes

* recover BlockCanBeStaticBuilt

* support static build condational block op when condational block is the last op in the block

* fix error

* fix logic about last op

* fit for sub block can't open static build

* add IsStaticBuild

* fix build error

* fit logic when sub block can't open static build

* close static build when sub_block don't support static_build

* recover third party

* add is_skil_fake_init logic

* set the backend of the lamb

* change start index

* add if conditional for cal is_skip_fake_init

* change name

* close static_build for test_conditional_block

* add static buiild support for conditional block in case of the output's dtype/place is changed but the following op is not use this output

* fix logic error

* fix timeout error

* fix

* remove useless codes

* fix

* fix

* fix build error

* move GetVarsInfo and RunPreStaticBuild from opeartor to static_build

* fix lamb backend registe

* fix build error

* fix build error

* remove lamp op test from new_ir_op_test_white_list

* fix

* move generating following_input_vars logic to static_build.cc

* remove HasInfo

* fix build error

* recover codes and turn off the flag
---
 .../interpreter/interpreter_util.cc           |  26 +-
 .../new_executor/interpreter/static_build.cc  | 222 ++++++++++++++++--
 .../new_executor/interpreter/static_build.h   |  38 ++-
 .../new_executor/interpreter_base_impl.h      |   6 +
 .../framework/new_executor/interpretercore.cc |   8 +
 .../framework/new_executor/interpretercore.h  |   5 +
 .../new_executor/new_ir_interpreter.cc        |   7 +
 .../new_executor/new_ir_interpreter.h         |   6 +
 .../new_executor/program_interpreter.cc       |  57 +++--
 .../new_executor/program_interpreter.h        |   6 +-
 paddle/phi/kernels/gpu/lamb_kernel.cu         |   2 +
 test/legacy_test/CMakeLists.txt               |   4 +
 test/white_list/new_ir_op_test_white_list     |   2 -
 13 files changed, 332 insertions(+), 57 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
index 67106932169a3..8015a50545e69 100644
--- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
@@ -527,11 +527,13 @@ platform::DeviceContext* ConstructDeviceContext(const OperatorBase* op,
   return default_dev_ctx;
 }
 
-void HandleOperatorBase(const platform::Place& place,
-                        std::shared_ptr<OperatorBase> op,
-                        OpFuncNode* op_func_node,
-                        Scope* scope,
-                        bool static_build) {
+void HandleOperatorBase(
+    const platform::Place& place,
+    std::shared_ptr<OperatorBase> op,
+    OpFuncNode* op_func_node,
+    Scope* scope,
+    bool static_build,
+    std::vector<std::shared_ptr<OperatorBase>> following_ops) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
   // input, output is prepared. set the other attributes.
@@ -542,7 +544,8 @@ void HandleOperatorBase(const platform::Place& place,
     if (OperatorBasesMustRunInStaticBuild.count(op->Type())) {
       op->Run(*scope, place);
     }
-    FakeInitializeOutputsForOperatorBase(*op, place, scope);
+
+    FakeInitializeOutputsForOperatorBase(*op, place, scope, following_ops);
   } else {
     op->Run(*scope, place);  // Run without data transformer.
   }
@@ -690,8 +693,15 @@ void BuildOpFuncList(const platform::Place& place,
       if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
         VLOG(4) << "HandleOperatorBase";
         // op is not a operatorwithkernel, so direcly run OperatorBase::Run()
-        HandleOperatorBase(
-            place, ops[i], &op_func_node, local_scope, static_build);
+
+        std::vector<std::shared_ptr<OperatorBase>> following_ops(
+            ops.begin() + i + 1, ops.end());
+        HandleOperatorBase(place,
+                           ops[i],
+                           &op_func_node,
+                           local_scope,
+                           static_build,
+                           following_ops);
         vec_func_list->emplace_back(op_func_node);
       } else {
         VLOG(4) << "OP is not null";
diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc b/paddle/fluid/framework/new_executor/interpreter/static_build.cc
index 69b4920050925..0f9bd3f387a92 100644
--- a/paddle/fluid/framework/new_executor/interpreter/static_build.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc
@@ -15,11 +15,18 @@
 #include "paddle/fluid/framework/new_executor/interpreter/static_build.h"
 
 #include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
+#include "paddle/fluid/framework/new_executor/standalone_executor.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/operators/reader/buffered_reader.h"
 
+#ifdef PADDLE_WITH_DNNL
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 // These Ops is OperatorBase, but we have been handle them in static build
-std::set<std::string> OperatorBasesHandledInStaticBuild = {"read"};
+std::set<std::string> OperatorBasesHandledInStaticBuild = {"read",
+                                                           "conditional_block"};
 
 std::set<std::string> OperatorBasesMustRunInStaticBuild = {
     "create_double_buffer_reader", "create_py_reader"};
@@ -53,11 +60,68 @@ namespace paddle {
 namespace framework {
 namespace interpreter {
 
+using InterpreterCore = framework::InterpreterCore;
+
+static VarMetaInfo GetVarMetaInfo(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  phi::DataType dtype = phi::DataType::UNDEFINED;
+  phi::Place place = phi::Place();
+  if (var == nullptr) {
+    return VarMetaInfo(name, dtype, place);
+  }
+
+  if (var->IsType<phi::DenseTensor>()) {
+    const phi::DenseTensor& tensor = var->Get<phi::DenseTensor>();
+    if (!UNLIKELY(!tensor.IsInitialized())) {
+      dtype = tensor.dtype();
+      place = tensor.place();
+    }
+  } else if (var->IsType<phi::SelectedRows>()) {
+    auto tensor = var->Get<phi::SelectedRows>().value();
+    if (!UNLIKELY(!tensor.IsInitialized())) {
+      dtype = tensor.dtype();
+      place = tensor.place();
+    }
+  }
+  return VarMetaInfo(name, dtype, place);
+}
+
+std::vector<VarMetaInfo> GetVarsInfo(const Scope* scope,
+                                     VariableNameMap var_map,
+                                     const OperatorBase& op) {
+  std::vector<VarMetaInfo> var_info;
+
+  const std::unordered_set<std::string>* no_need_buffer_vars = nullptr;
+  if (op.Info().NoNeedBufferVarsInferer()) {
+    no_need_buffer_vars = &(op.Info().NoNeedBufferVarsInferer()(
+        op.Inputs(), op.Outputs(), op.Attrs()));
+    if (no_need_buffer_vars->empty()) no_need_buffer_vars = nullptr;
+  }
+  for (auto it = var_map.begin(); it != var_map.end();) {
+    auto& var = *it;
+    bool is_no_need_buffer_var =
+        (no_need_buffer_vars && no_need_buffer_vars->count(var.first) > 0);
+    std::string var_name;
+    var_info.reserve(var_info.size() + var.second.size());
+    for (size_t i = 0; i < var.second.size(); ++i) {
+      auto var_name = var.second[i];
+      if (scope && is_no_need_buffer_var) {
+        var_info.emplace_back(GetVarMetaInfo(*scope, var_name));
+      } else {
+        var_info.emplace_back(var_name);
+      }
+    }
+    ++it;
+  }
+  return var_info;
+}
+
 bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) {
-  // in_black_list = (kernelCode >> 7) & 1
-  // is_operator_base = (kernelCode >> 6) & 1
-  // is_custom_op = (kernelCode >> 5) & 1
-  // use_mkldnn = (kernelCode >> 4) & 1
+  // in_black_list = (kernelCode >> 5) & 1
+  // is_operator_base = (kernelCode >> 4) & 1
+  // is_custom_op = (kernelCode >> 3) & 1
+  // use_mkldnn = (kernelCode >> 2) & 1
+  // sub_block_can_not_static_build = (kernelCode >> 1) & 1
   using KernelCode = int8_t;
   std::set<std::pair<std::string, KernelCode>> invalid_ops;
   for (auto& op : block.AllOps()) {
@@ -77,17 +141,22 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) {
       use_mkldnn = attr.index() == 1 ? PADDLE_GET_CONST(int, attr)
                                      : PADDLE_GET_CONST(bool, attr);
     }
-    bool has_structured_kernel =
-        phi::KernelFactory::Instance().HasStructuredKernel(op_type);
+
+    bool sub_block_can_not_static_build = false;
+    if (op->HasAttr("sub_block")) {
+      auto* sub_block =
+          PADDLE_GET_CONST(framework::BlockDesc*, op->GetAttr("sub_block"));
+      sub_block_can_not_static_build = !BlockCanBeStaticBuilt(*sub_block);
+    }
 
     KernelCode kernel_code = static_cast<KernelCode>(
-        (in_black_list << 7) + (is_operator_base << 6) + (is_custom_op << 5) +
-        (use_mkldnn << 4) + (has_structured_kernel << 2));
+        (in_black_list << 5) + (is_operator_base << 4) + (is_custom_op << 3) +
+        (use_mkldnn << 2) + (sub_block_can_not_static_build << 1));
     if (!OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) {
       if (in_black_list ||
           (is_operator_base &&
            !OperatorBasesHandledInStaticBuild.count(op_type)) ||
-          is_custom_op || use_mkldnn) {
+          is_custom_op || use_mkldnn || sub_block_can_not_static_build) {
         invalid_ops.insert(std::make_pair(op_type, kernel_code));
       }
     }
@@ -97,11 +166,12 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) {
     std::stringstream ss;
     ss << "The following OPs are unable to static build:\n";
     for (auto& item : invalid_ops) {
-      ss << item.first << " [in_black_list = " << (item.second >> 7 & 1)
-         << ", is_operator_base = " << (item.second >> 6 & 1)
-         << ", is_custom_op = " << (item.second >> 5 & 1)
-         << ", use_mkldnn = " << (item.second >> 4 & 1)
-         << (item.second >> 2 & 1) << "]\n";
+      ss << item.first << " [in_black_list = " << (item.second >> 6 & 1)
+         << ", is_operator_base = " << (item.second >> 5 & 1)
+         << ", is_custom_op = " << (item.second >> 4 & 1)
+         << ", use_mkldnn = " << (item.second >> 3 & 1)
+         << ", sub_block_can_not_static_build = " << (item.second >> 1 & 1)
+         << "]\n";
     }
     VLOG(1) << ss.str();
   }
@@ -318,9 +388,59 @@ void FakeInitializeTensorBase(const platform::DeviceContext& dev_ctx,
   }
 }
 
-void FakeInitializeOutputsForOperatorBase(const OperatorBase& op,
-                                          const phi::Place& place,
-                                          Scope* scope) {
+void RunPreStaticBuild(const framework::Scope& scope,
+                       const platform::Place& dev_place,
+                       const OperatorBase& op) {
+  auto* scope_var = scope.FindVar(op.Output("Scope"));
+  PADDLE_ENFORCE_NOT_NULL(
+      scope_var,
+      platform::errors::PreconditionNotMet(
+          "Expect Scope variable to be set in conditional_block_op, but "
+          "got a null Scope variable. Please set the Scope variable."));
+
+  auto* scopes = scope_var->GetMutable<std::vector<framework::Scope*>>();
+  scopes->resize(1);
+  scopes->front() = &scope.NewScope();
+
+  auto& cur_scope = *scopes->front();
+#ifdef PADDLE_WITH_DNNL
+  // Executor on being destroyed clears oneDNN cache and resets
+  // registered model data layout. This is unwanted for nested
+  // Executors (executors declared inside control ops)
+  platform::DontClearMKLDNNCache(dev_place);
+#endif
+  auto* block = op.Attr<framework::BlockDesc*>("sub_block");
+  VLOG(3) << "Conditional block.idx = " << block->ID()
+          << ", scope = " << &cur_scope;
+
+  auto& skip_vars =
+      op.Attr<std::vector<std::string>>("skip_eager_deletion_vars");
+
+  std::unique_ptr<InterpreterCore> core;
+  LOG_FIRST_N(INFO, 1)
+      << "[ControlFlow][ConditionalBlock] New Executor is Running.";
+
+  VLOG(10) << "[interpreterCore cache]" << core.get();
+  VLOG_IF(10, core) << platform::is_same_place(core->GetPlace(), dev_place);
+
+  framework::interpreter::ExecutionConfig execution_config;
+  execution_config.create_local_scope = false;
+  execution_config.used_for_control_flow_op = true;
+  execution_config.skip_gc_vars =
+      std::set<std::string>(skip_vars.begin(), skip_vars.end());
+
+  core.reset(
+      new InterpreterCore(dev_place, *block, &cur_scope, execution_config));
+
+  std::vector<paddle::framework::OpFuncNode> op_func_nodes;
+  core->Build({}, &op_func_nodes);
+}
+
+void FakeInitializeOutputsForOperatorBase(
+    const OperatorBase& op,
+    const phi::Place& place,
+    Scope* scope,
+    std::vector<std::shared_ptr<OperatorBase>> following_ops) {
   const std::string& op_type = op.Type();
   if (OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) {
     return;
@@ -329,7 +449,59 @@ void FakeInitializeOutputsForOperatorBase(const OperatorBase& op,
   phi::DeviceContext* dev_ctx =
       platform::DeviceContextPool::Instance().Get(place);
 
-  if (op_type == "read") {
+  if (op_type == "conditional_block") {
+    // Note(sonder): skip fake init for conditional_block when there is no
+    // op with kernel after it.
+    bool skip_fake_init = true;
+    std::unordered_set<std::string> following_input_vars;
+
+    for (size_t i = 0; i < following_ops.size(); ++i) {
+      if (dynamic_cast<framework::OperatorWithKernel*>(
+              following_ops[i].get()) != nullptr) {
+        VLOG(4) << "Find op with kernel after conditional_block : "
+                << following_ops[i]->Type();
+        skip_fake_init = false;
+        auto input_vars_info = GetVarsInfo(
+            scope, following_ops[i]->Inputs(), *following_ops[i].get());
+        for (auto& input_var_info : input_vars_info) {
+          following_input_vars.insert(input_var_info.name_);
+        }
+      }
+    }
+
+    if (skip_fake_init) {
+      return;
+    }
+
+    const std::vector<VarMetaInfo> out_var_info_before_build =
+        GetVarsInfo(scope, op.Outputs(), op);
+
+    RunPreStaticBuild(*scope, place, op);
+    const std::vector<VarMetaInfo> out_var_info_after_build =
+        GetVarsInfo(scope, op.Outputs(), op);
+
+    // Note(sonder): static_build is not supported if the output of
+    // conditional_block is changed after static build.
+    for (size_t i = 0; i < out_var_info_before_build.size(); ++i) {
+      // static build is supported in case of the output's dtype/place
+      // is changed but the following op is not use this output
+      if (out_var_info_before_build[i] != out_var_info_after_build[i]) {
+        auto var_name = out_var_info_before_build[i].name_;
+        if (following_input_vars.count(var_name)) {
+          PADDLE_THROW(phi::errors::PreconditionNotMet(
+              "The output %s s' dtype/place of conditional_block is "
+              "changed after static build. Befer static build, the "
+              "dtype is %s, place is %s. After static "
+              "build, the dtype is %s, place is %s.",
+              var_name,
+              out_var_info_before_build[i].dtype_,
+              out_var_info_before_build[i].place_,
+              out_var_info_after_build[i].dtype_,
+              out_var_info_after_build[i].place_));
+        }
+      }
+    }
+  } else if (op_type == "read") {
     const std::string& reader_name = op.Input("Reader");
     framework::ReaderHolder* reader =
         GET_DATA_SAFELY(scope->FindVar(reader_name), "Input", "Reader", "Read")
@@ -448,6 +620,18 @@ void FakeInitializeOutputsForFunctionKernel(
             if (beta1_pow->place() == beta2_pow->place()) {
               backend = phi::TransToPhiBackend(beta1_pow->place());
             }
+          } else if (op_type == "lamb") {
+            phi::TensorBase* beta1_pow = GetTensorFormVar(
+                runtime_ctx.inputs.find("Beta1Pow")->second.at(0));
+            phi::TensorBase* beta2_pow = GetTensorFormVar(
+                runtime_ctx.inputs.find("Beta2Pow")->second.at(0));
+            if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU &&
+                beta1_pow->place().GetType() == AllocationType::CPU &&
+                beta2_pow->place().GetType() == AllocationType::CPU) {
+              backend = phi::Backend::CPU;
+            } else {
+              backend = phi::TransToPhiBackend(dev_ctx.GetPlace());
+            }
           } else if (op_type == "reshape2") {
             phi::TensorBase* x =
                 GetTensorFormVar(runtime_ctx.inputs.find("X")->second.at(0));
diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.h b/paddle/fluid/framework/new_executor/interpreter/static_build.h
index e070f66b02549..302d612bc0311 100644
--- a/paddle/fluid/framework/new_executor/interpreter/static_build.h
+++ b/paddle/fluid/framework/new_executor/interpreter/static_build.h
@@ -23,11 +23,39 @@ namespace paddle {
 namespace framework {
 namespace interpreter {
 
+struct VarMetaInfo {
+  std::string name_;
+  phi::DataType dtype_;
+  phi::Place place_;
+
+  explicit VarMetaInfo(const std::string& name) : name_(name) {
+    dtype_ = phi::DataType::UNDEFINED;
+    place_ = phi::Place();
+  }
+
+  VarMetaInfo(const std::string& name,
+              const phi::DataType& dtype,
+              const platform::Place& place)
+      : name_(name), dtype_(dtype), place_(place) {}
+
+  bool operator==(const VarMetaInfo& other) const {
+    return name_ == other.name_ && dtype_ == other.dtype_ &&
+           place_ == other.place_;
+  }
+
+  bool operator!=(const VarMetaInfo& other) const {
+    return name_ != other.name_ || dtype_ != other.dtype_ ||
+           place_ != other.place_;
+  }
+};
+
 bool BlockCanBeStaticBuilt(const framework::BlockDesc& block);
 
-void FakeInitializeOutputsForOperatorBase(const OperatorBase& op,
-                                          const platform::Place& place,
-                                          Scope* scope);
+void FakeInitializeOutputsForOperatorBase(
+    const OperatorBase& op,
+    const phi::Place& place,
+    Scope* scope,
+    std::vector<std::shared_ptr<OperatorBase>> following_ops);
 
 void FakeInitializeOutputsForFunctionKernel(
     const framework::OperatorBase& op,
@@ -40,6 +68,10 @@ void FakeInitializeOutputsForStructureKernel(
     const framework::OpKernelType& op_kernel_type,
     ExecutionContext* execution_context);
 
+std::vector<VarMetaInfo> GetVarsInfo(const Scope* scope,
+                                     VariableNameMap var_map,
+                                     const OperatorBase& op);
+
 }  // namespace interpreter
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h
index 2c030ef1dc264..369216e0078c4 100644
--- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h
+++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h
@@ -97,6 +97,12 @@ class InterpreterBaseImpl {
   virtual std::shared_ptr<std::vector<size_t>> GetDependencyCount() const = 0;
 
   virtual bool IsSharedResultsBuild() const = 0;
+
+  virtual void Build(
+      const std::vector<std::string>& feed_names,
+      std::vector<paddle::framework::OpFuncNode>* op_func_nodes) = 0;
+
+  virtual bool IsStaticBuild() const = 0;
 };
 
 inline void SetDeviceId(const platform::Place& place) {
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index dc8110331a176..8e052d3b2685e 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -121,5 +121,13 @@ void InterpreterCore::SetOutputHooks(const std::vector<HookFunc>& hookfuncs) {
   impl_->SetOutputHooks(hookfuncs);
 }
 
+void InterpreterCore::Build(
+    const std::vector<std::string>& feed_names,
+    std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
+  impl_->Build(feed_names, op_func_nodes);
+}
+
+bool InterpreterCore::IsStaticBuild() const { return impl_->IsStaticBuild(); }
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 47f2d9c6a3378..d21bd9e1fc378 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -74,6 +74,11 @@ class InterpreterCore {
 
   void SetOutputHooks(const std::vector<HookFunc>& hookfuncs);
 
+  void Build(const std::vector<std::string>& feed_names,
+             std::vector<paddle::framework::OpFuncNode>* op_func_nodes);
+
+  bool IsStaticBuild() const;
+
  private:
   DISABLE_COPY_AND_ASSIGN(InterpreterCore);
 
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index 6b6cabb991382..55f70a573a1bc 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -1315,6 +1315,13 @@ void NewIRInterpreter::PreAnalysis() {
   VLOG(4) << "Done UpdateNcclOpNum";
 }
 
+void NewIRInterpreter::Build(
+    const std::vector<std::string>& feed_names,
+    std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "Build is not implemented in NewIRInterpreter."));
+}
+
 ::pir::Value NewIRInterpreter::GetValueByName(const std::string& var_name) {
   for (auto kv : value_2_var_name_) {
     if (kv.second == var_name) {
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
index cf5cb21ce81aa..c05eb6770b2ba 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
@@ -100,6 +100,12 @@ class NewIRInterpreter : public InterpreterBaseImpl {
   void CheckCUDAGraphBeforeRun(const std::vector<std::string>& feed_names);
   void PrepareForCUDAGraphCapture();
 
+  void Build(
+      const std::vector<std::string>& feed_names,
+      std::vector<paddle::framework::OpFuncNode>* op_func_nodes) override;
+
+  bool IsStaticBuild() const override { return static_build_; }
+
   // workqueue
   std::shared_ptr<interpreter::AsyncWorkQueue> GetWorkQueue();
 
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc
index a29e45515d894..1384a9fb487de 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -52,10 +52,6 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place,
       var_scope_(scope) {
   VLOG(4) << "ProgramInterpreter(): " << this << " on " << place_;
 
-  static_build_ = FLAGS_new_executor_static_build &&
-                  !FLAGS_new_executor_use_cuda_graph &&
-                  interpreter::BlockCanBeStaticBuilt(block);
-
   exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
   completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);
 
@@ -73,6 +69,10 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place,
   }
   var_scope_.SetLocalScope(local_scope_);
 
+  static_build_ = FLAGS_new_executor_static_build &&
+                  !FLAGS_new_executor_use_cuda_graph &&
+                  interpreter::BlockCanBeStaticBuilt(block);
+
   instruction_scheduling_priority_less = [this](size_t lhs, size_t rhs) {
     SchedulingPriority lhs_scheduling_priority =
         vec_instruction_[lhs].GetSchedulingPriority();
@@ -129,28 +129,10 @@ void ProgramInterpreter::RunImpl() {
 
 FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,
                                   bool need_fetch) {
-  SetDeviceId(place_);
-  CheckCUDAGraphBeforeRun(feed_names);
-
-#ifdef PADDLE_WITH_DNNL
-  platform::AttachPointerHashToMKLDNNKey(this, place_);
-#endif
+  std::vector<paddle::framework::OpFuncNode> op_func_nodes;
+  Build(feed_names, &op_func_nodes);
 
   if (!is_build_) {
-    LOG_FIRST_N(INFO, 1) << "New Executor is Running.";
-    paddle::framework::interpreter::BuildVariableScope(
-        block_, execution_config_, &var_scope_);
-
-    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
-    paddle::framework::interpreter::BuildOpFuncList(
-        place_,
-        block_,
-        execution_config_.skip_gc_vars,
-        &op_func_nodes,
-        &var_scope_,
-        execution_config_,
-        HasLocalScope(),
-        static_build_);
     SetFeedVarsInplaceSkip(feed_names);
     // convert vec func_list to graph
     Convert(&op_func_nodes);
@@ -189,6 +171,33 @@ FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,
   }
 }
 
+void ProgramInterpreter::Build(
+    const std::vector<std::string>& feed_names,
+    std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
+  SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
+
+#ifdef PADDLE_WITH_DNNL
+  platform::AttachPointerHashToMKLDNNKey(this, place_);
+#endif
+
+  if (!is_build_) {
+    LOG_FIRST_N(INFO, 1) << "New Executor is Running.";
+    paddle::framework::interpreter::BuildVariableScope(
+        block_, execution_config_, &var_scope_);
+
+    paddle::framework::interpreter::BuildOpFuncList(
+        place_,
+        block_,
+        execution_config_.skip_gc_vars,
+        op_func_nodes,
+        &var_scope_,
+        execution_config_,
+        HasLocalScope(),
+        static_build_);
+  }
+}
+
 FetchList ProgramInterpreter::Run(
     const std::vector<std::string>& feed_names,
     const std::vector<phi::DenseTensor>& feed_tensors) {
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h
index 27348d57fcd17..bef6385c211fb 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.h
+++ b/paddle/fluid/framework/new_executor/program_interpreter.h
@@ -48,6 +48,10 @@ class ProgramInterpreter : public InterpreterBaseImpl {
   paddle::framework::FetchList Run(const std::vector<std::string>& feed_names,
                                    bool need_fetch = true) override;
 
+  void Build(
+      const std::vector<std::string>& feed_names,
+      std::vector<paddle::framework::OpFuncNode>* op_func_nodes) override;
+
   void ShareWorkQueueFrom(InterpreterBaseImpl* src) override;
 
   void ShareBuildResultsFrom(const InterpreterBaseImpl& src) override;
@@ -92,7 +96,7 @@ class ProgramInterpreter : public InterpreterBaseImpl {
     force_evnets_to_wait_ = force_evnets_to_wait;
   }
 
-  bool IsStaticBuild() const { return static_build_; }
+  bool IsStaticBuild() const override { return static_build_; }
 
  private:
   // build graph
diff --git a/paddle/phi/kernels/gpu/lamb_kernel.cu b/paddle/phi/kernels/gpu/lamb_kernel.cu
index 220fa97a0e107..c1d1a812a881e 100644
--- a/paddle/phi/kernels/gpu/lamb_kernel.cu
+++ b/paddle/phi/kernels/gpu/lamb_kernel.cu
@@ -33,4 +33,6 @@ PD_REGISTER_KERNEL(lamb,
   kernel->OutputAt(3).SetDataType(phi::DataType::UNDEFINED);
   kernel->OutputAt(4).SetDataType(phi::DataType::UNDEFINED);
   kernel->OutputAt(5).SetDataType(phi::DataType::UNDEFINED);
+  kernel->OutputAt(3).SetBackend(phi::Backend::UNDEFINED);
+  kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED);
 }
diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt
index 5e000112784aa..9e7adef0a634f 100644
--- a/test/legacy_test/CMakeLists.txt
+++ b/test/legacy_test/CMakeLists.txt
@@ -1272,6 +1272,10 @@ set_tests_properties(
 set_tests_properties(
   test_cuda_graph_static_mode_error
   PROPERTIES ENVIRONMENT "FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=1")
+# In test_conditional_block, the sub block changes the dtype and place of the output variable.
+# The changed variable is used in the following op. Static build is not supported for this case.
+set_tests_properties(test_conditional_block
+                     PROPERTIES ENVIRONMENT "FLAGS_new_executor_static_build=0")
 
 # These UTs are to temporarily test static build for standalone_executor, will be removed after static build is enabled by default.
 set(STATIC_BUILD_TESTS
diff --git a/test/white_list/new_ir_op_test_white_list b/test/white_list/new_ir_op_test_white_list
index 613769ec5b657..b85c88fa6bb18 100644
--- a/test/white_list/new_ir_op_test_white_list
+++ b/test/white_list/new_ir_op_test_white_list
@@ -116,8 +116,6 @@ test_kron_op
 test_kthvalue_op
 test_label_smooth_op
 test_label_smooth_op_new_ir
-test_lamb_op
-test_lamb_op_static_build
 test_lerp_op
 test_lgamma_op
 test_linear_interp_v2_op

From 33d8ee204897a27ccbbb81a052b81cd1dbdf04fe Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Thu, 21 Sep 2023 09:55:35 +0800
Subject: [PATCH 011/115] [Pir] Support Run with feed_tensor (#57497)

* refine

* add flag

* add ut
---
 .../new_executor/new_ir_interpreter.cc        | 115 +++++++++++++++++-
 test/cpp/new_executor/CMakeLists.txt          |   1 +
 .../standalone_executor_new_ir_test.cc        |  81 ++++++++++++
 3 files changed, 195 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index 55f70a573a1bc..47823eb82b428 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -935,8 +935,108 @@ void NewIRInterpreter::ConstructEventForJitInput() {
 paddle::framework::FetchList NewIRInterpreter::Run(
     const std::vector<std::string>& feed_names,
     const std::vector<phi::DenseTensor>& feed_tensors) {
-  PADDLE_THROW(platform::errors::Unimplemented(
-      "Run with feed_tensors is not implemented in NewIRInterpreter."));
+  auto FeedInput = [&] {
+    VLOG(4) << "Feed inputs";
+    for (size_t i = 0; i < feed_names.size(); ++i) {
+      auto* feed_var = InnerScope()->FindVar(feed_names[i]);
+      PADDLE_ENFORCE_NOT_NULL(
+          feed_var,
+          platform::errors::NotFound("Variable %s should not be nullptr.",
+                                     feed_names[i]));
+
+      auto feed_tensor = feed_var->GetMutable<phi::DenseTensor>();
+      feed_tensor->ShareDataWith(feed_tensors[i]);
+      feed_tensor->set_lod(feed_tensors[i].lod());
+    }
+  };
+
+  SetDeviceId(place_);
+  CheckCUDAGraphBeforeRun(feed_names);
+
+#ifdef PADDLE_WITH_DNNL
+  platform::AttachPointerHashToMKLDNNKey(this, place_);
+#endif
+
+  FeedInput();
+
+  if (!is_build_) {
+    LOG_FIRST_N(INFO, 1) << "New Executor is BetaRunning.";
+    // Build
+    VLOG(4) << "Done BuildScope";
+    VLOG(4) << DebugValueInfo();
+
+    SolvePersisableVarNames();
+
+    VLOG(4) << "Parameter value include: ";
+    for (auto parameter : parameter_var_names_) {
+      VLOG(4) << "Parameter value: " << parameter;
+    }
+
+    BuildInstruction();
+    VLOG(4) << "Done BuildInstruction";
+
+    PreAnalysis();
+    VLOG(4) << "Done PreAnalysis";
+
+    // Run
+    if (FLAGS_enable_new_ir_in_executor_trace_run || nccl_op_num_ > 1 ||
+        ((execution_config_.used_for_jit || execution_config_.used_for_cinn) &&
+         (sync_op_num_ == 0))) {
+      LOG_FIRST_N(INFO, 1) << "New ir interpreter is running in BetaRun mode "
+                              "with trace version.";
+      TraceRunImpl();
+    } else {
+      LOG_FIRST_N(INFO, 1) << "New ir interpreter is running in BetaRun mode "
+                              "with multi thread version.";
+      MultiThreadRunImpl();
+    }
+
+    is_build_ = true;
+    is_shared_results_build_ = true;
+  } else {
+    if (FLAGS_enable_new_ir_in_executor_trace_run || nccl_op_num_ > 1 ||
+        ((execution_config_.used_for_jit || execution_config_.used_for_cinn) &&
+         (sync_op_num_ == 0))) {
+      TraceRunImpl();
+    } else {
+      MultiThreadRunImpl();
+    }
+  }
+
+  if (HasLocalScope()) {
+    ClearLoDTensorArrayInLocalScope();
+  }
+  // return Fetch Tensors
+  Scope* inner_scope = InnerScope();
+  if (FLAGS_enable_new_ir_in_executor) {
+    framework::FetchList fetch_res;
+
+    for (auto& var_name : fetch_var_names_) {
+      auto* var = inner_scope->FindVar(var_name);
+      VLOG(0) << "fetch " << var_name << "[" << var << "]";
+      fetch_res.push_back(var->Get<phi::DenseTensor>());
+    }
+
+    VLOG(4) << "get fetch list size: " << fetch_res.size();
+    return fetch_res;
+  } else {
+    auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
+    if (fetch_var) {
+      auto fetch_list =
+          std::move(*fetch_var->GetMutable<framework::FetchList>());
+#ifdef PADDLE_WITH_CUDA
+      if (platform::IsCUDAGraphCapturing()) {
+        PADDLE_ENFORCE_EQ(fetch_list.empty(),
+                          true,
+                          platform::errors::InvalidArgument(
+                              "Cannot fetch data when using CUDA Graph."));
+      }
+#endif
+      return fetch_list;
+    } else {
+      return {};
+    }
+  }
 }
 
 FetchList NewIRInterpreter::Run(const std::vector<std::string>& feed_names,
@@ -1252,6 +1352,16 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) {
     VLOG(4) << "begin to run op " << instr_node->Name();
     if (!instr_node->IsArtificial()) {
       instr_node->Run();
+
+      if (FLAGS_benchmark) {
+        instr_node->DeviceContext().Wait();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
+        VLOG(4) << "Operator(" << instr_node->Name()  // NOLINT
+                << "): context wait and get last error";
+#endif
+      }
+
       VLOG(4) << __func__ << " OP id:" << instr_node->Id()
               << " name:" << instr_node->Name() << " type:"
               << (instr_node->KernelType() == OpFuncType::kCpuSync
@@ -1260,6 +1370,7 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) {
                              ? "kGpuSync"
                              : "kGpuAsync"))
               << " runs on " << platform::GetCurrentThreadName();
+
       VLOG(4) << "done instruction node run";
       CheckGC(instr_node);
       VLOG(4) << "done CheckGC";
diff --git a/test/cpp/new_executor/CMakeLists.txt b/test/cpp/new_executor/CMakeLists.txt
index 00285e39f518b..af09520b12a54 100644
--- a/test/cpp/new_executor/CMakeLists.txt
+++ b/test/cpp/new_executor/CMakeLists.txt
@@ -10,6 +10,7 @@ if(NOT WIN32)
     pd_op_dialect
     pd_kernel_dialect
     pir
+    phi
     standalone_executor)
 endif()
 
diff --git a/test/cpp/new_executor/standalone_executor_new_ir_test.cc b/test/cpp/new_executor/standalone_executor_new_ir_test.cc
index d200b2a1052ed..eac996ffebe0f 100644
--- a/test/cpp/new_executor/standalone_executor_new_ir_test.cc
+++ b/test/cpp/new_executor/standalone_executor_new_ir_test.cc
@@ -97,6 +97,87 @@ TEST(StandaloneExecutor, run) {
   EXPECT_EQ(res3, true);
 }
 
+TEST(StandaloneExecutor, run_feed_tensor) {
+  pir::IrContext* ctx = pir::IrContext::Instance();
+  pir::Program program(ctx);
+
+  ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
+
+  pir::Builder builder = pir::Builder(ctx, program.block());
+
+  pir::OpInfo feed_op_info =
+      ctx->GetRegisteredOpInfo(paddle::dialect::FeedOp::name());
+
+  pir::Type fp32_dtype = pir::Float32Type::get(ctx);
+  phi::DDim dims = {1};
+  phi::DataLayout data_layout = phi::DataLayout::NCHW;
+  phi::LoD lod = {{0}};
+  size_t offset = 0;
+  pir::Type dense_tensor_dtype = paddle::dialect::DenseTensorType::get(
+      ctx, fp32_dtype, dims, data_layout, lod, offset);
+
+  pir::AttributeMap attr_map1;
+  attr_map1.insert(std::pair<std::string, pir::Attribute>(
+      "name", pir::StrAttribute::get(ctx, "x")));
+  attr_map1.insert(std::pair<std::string, pir::Attribute>(
+      "col", pir::Int32Attribute::get(ctx, 0)));
+  pir::Operation* feed_op1 =
+      pir::Operation::Create({}, attr_map1, {dense_tensor_dtype}, feed_op_info);
+  program.block()->push_back(feed_op1);
+
+  pir::AttributeMap attr_map2;
+  attr_map2.insert(std::pair<std::string, pir::Attribute>(
+      "name", pir::StrAttribute::get(ctx, "y")));
+  attr_map2.insert(std::pair<std::string, pir::Attribute>(
+      "col", pir::Int32Attribute::get(ctx, 0)));
+  pir::Operation* feed_op2 =
+      pir::Operation::Create({}, attr_map2, {dense_tensor_dtype}, feed_op_info);
+  program.block()->push_back(feed_op2);
+
+  builder.Build<paddle::dialect::AddOp>(feed_op1->result(0),
+                                        feed_op2->result(0));
+
+  auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(&program);
+
+  auto place = platform::CPUPlace();
+  Scope scope;
+  InterpreterCore test_core(place, {}, kernel_program->block(), &scope);
+
+  std::stringstream os;
+  os << reinterpret_cast<NewIRInterpreter*>(
+      const_cast<InterpreterBaseImpl*>(test_core.Impl()));
+  std::string out_name = os.str() + "_inner_var_2";
+  test_core.SetSkipGcVars({out_name});
+
+  phi::DenseTensorMeta meta(
+      phi::DataType::FLOAT32, dims, data_layout, lod, offset);
+  paddle::platform::DeviceContext* dev_ctx =
+      paddle::platform::DeviceContextPool::Instance().Get(
+          paddle::platform::CPUPlace());
+
+  phi::DenseTensor tensor_x;
+  tensor_x.set_meta(meta);
+  dev_ctx->Alloc(&tensor_x, phi::DataType::FLOAT32);
+  float* tensor_x_data = tensor_x.data<float>();
+  *tensor_x_data = 1.0;
+
+  phi::DenseTensor tensor_y;
+  tensor_y.set_meta(meta);
+  dev_ctx->Alloc(&tensor_y, phi::DataType::FLOAT32);
+  float* tensor_y_data = tensor_y.data<float>();
+  *tensor_y_data = 2.0;
+
+  test_core.Run({"x", "y"}, {tensor_x, tensor_y});
+
+  auto out_tensor =
+      test_core.local_scope() == nullptr
+          ? scope.FindVar(out_name)->Get<phi::DenseTensor>()
+          : test_core.local_scope()->FindVar(out_name)->Get<phi::DenseTensor>();
+
+  bool res0 = simple_cmp(out_tensor.data<float>()[0], 3.0);
+  EXPECT_EQ(res0, true);
+}
+
 TEST(StandaloneExecutor, run_inplace_sqrt) {
   pir::IrContext* ctx = pir::IrContext::Instance();
   pir::Program program((ctx));

From 2e5a6fbadef0fe215f08baba15dcecdf8039c7c6 Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Thu, 21 Sep 2023 10:12:09 +0800
Subject: [PATCH 012/115] [Pir] delete support mutable attribute for pow
 (#57503)

* refien

* fix bug

* fix

* refine
---
 .../fluid/pir/dialect/op_generator/api_gen.py |  8 +++++
 .../fluid/pir/dialect/op_generator/op_gen.py  |  8 +++++
 paddle/fluid/primitive/codegen/gen.py         |  2 +-
 test/legacy_test/test_activation_op.py        | 32 ++-----------------
 4 files changed, 20 insertions(+), 30 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py
index 5a3afdf2036a9..d7e74f72b652f 100644
--- a/paddle/fluid/pir/dialect/op_generator/api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py
@@ -125,6 +125,14 @@ def _parse_yaml(self, op_yaml_files, op_compat_yaml_file):
                 op_compat_item = op_compat_parser.get_compat(
                     op['forward']['name']
                 )
+
+            if (
+                op_compat_item is not None
+                and op_compat_item['op'] == "pow"
+                and 'scalar' in op_compat_item
+            ):
+                op_compat_item = op_compat_item.pop('scalar')
+
             op_info_items.append(OpInfoParser(op, op_compat_item))
         return op_info_items
 
diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py
index 62e746044776d..46949bcb547a7 100644
--- a/paddle/fluid/pir/dialect/op_generator/op_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py
@@ -858,6 +858,14 @@ def OpGenerator(
             and 'forward' in op
         ):
             op_compat_item = op_compat_parser.get_compat(op['forward']['name'])
+
+        if (
+            op_compat_item is not None
+            and op_compat_item['op'] == "pow"
+            and 'scalar' in op_compat_item
+        ):
+            op_compat_item = op_compat_item.pop('scalar')
+
         op_info_items[op['name']] = OpInfoParser(op, op_compat_item)
     # (3) CodeGen: Traverse op_info_items and generate
     ops_name_list = []  # all op class name store in this list
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index 0239f3d702e96..f9a920730967d 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -291,7 +291,7 @@ def extend_compat_info(apis, compats):
                 backward_apis.append(apis_dict[backward_op_name])
         support_tensor_attrs_names = []
         compat_attrs_data_type = {}
-        if 'scalar' in compat_item:
+        if 'scalar' in compat_item and compat_item['op'] != "pow":
             for attr_name, attr_info in compat_item['scalar'].items():
                 if (
                     'support_tensor' in attr_info
diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py
index 703cc4174d8f5..8b16ee5750eac 100644
--- a/test/legacy_test/test_activation_op.py
+++ b/test/legacy_test/test_activation_op.py
@@ -3588,33 +3588,7 @@ def init_shape(self):
         self.shape = []
 
 
-class TestPow_factor_tensor(TestActivation):
-    def setUp(self):
-        self.op_type = "pow"
-        self.python_api = paddle.pow
-        self.enable_cinn = False
-        self.init_dtype()
-
-        np.random.seed(1024)
-        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
-        out = np.power(x, 3)
-
-        self.inputs = {
-            'X': OpTest.np_dtype_to_base_dtype(x),
-            'FactorTensor': np.array([3.0]).astype(self.dtype),
-        }
-
-        self.attrs = {}
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out')
-
+class TestPow_API(TestActivation):
     def test_api(self):
         with static_guard():
             input = np.random.uniform(1, 2, [11, 17]).astype("float32")
@@ -4526,7 +4500,7 @@ def test_check_grad(self):
 create_test_act_fp16_class(TestLog1p)
 create_test_act_fp16_class(TestSquare)
 create_test_act_fp16_class(TestPow, check_prim=True)
-create_test_act_fp16_class(TestPow_factor_tensor)
+create_test_act_fp16_class(TestPow_API)
 create_test_act_fp16_class(TestSTanh)
 create_test_act_fp16_class(TestSoftplus)
 create_test_act_fp16_class(TestSoftsign)
@@ -4657,7 +4631,7 @@ def test_check_grad(self):
 create_test_act_bf16_class(TestLog1p)
 create_test_act_bf16_class(TestSquare)
 create_test_act_bf16_class(TestPow, check_prim=True)
-create_test_act_bf16_class(TestPow_factor_tensor)
+create_test_act_bf16_class(TestPow_API)
 create_test_act_bf16_class(TestSTanh)
 create_test_act_bf16_class(TestSoftplus)
 create_test_act_bf16_class(TestSoftsign)

From 00bd3aa99f33add638f567998b74e07323e2b2b9 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Thu, 21 Sep 2023 10:29:31 +0800
Subject: [PATCH 013/115] update get/set parameter (#57539)

---
 .../pir/dialect/operator/ir/api_builder.cc    | 11 ++++++++++
 .../pir/dialect/operator/ir/api_builder.h     |  6 +++++
 .../pir/dialect/operator/ir/manual_api.cc     | 21 ++++++------------
 .../pir/dialect/operator/ir/manual_api.h      |  4 +---
 paddle/fluid/pybind/ir.cc                     | 15 +++++++++++++
 .../fluid/pybind/manual_static_op_function.h  |  7 +-----
 python/paddle/base/data_feeder.py             |  2 +-
 python/paddle/base/executor.py                | 11 +++++-----
 python/paddle/ir/core.py                      | 22 +++++++++++++------
 python/paddle/nn/initializer/constant.py      |  7 +++++-
 python/paddle/nn/initializer/xavier.py        | 13 ++++++-----
 python/paddle/tensor/math.py                  |  2 --
 test/ir/new_ir/test_build_model.py            | 12 +++++-----
 13 files changed, 82 insertions(+), 51 deletions(-)

diff --git a/paddle/fluid/pir/dialect/operator/ir/api_builder.cc b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc
index 893c664b78b08..0662ced1cb40c 100644
--- a/paddle/fluid/pir/dialect/operator/ir/api_builder.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc
@@ -48,5 +48,16 @@ void APIBuilder::ResetInsertionPointToEnd() {
   builder_->SetInsertionPointToEnd(builder_->block());
 }
 
+pir::Parameter* APIBuilder::GetParameter(const std::string& name) const {
+  pir::Program* program = builder_->block()->GetParentOp()->GetParentProgram();
+  return program->GetParameter(name);
+}
+
+void APIBuilder::SetParameter(const std::string& name,
+                              std::unique_ptr<pir::Parameter>&& parameter) {
+  pir::Program* program = builder_->block()->GetParentOp()->GetParentProgram();
+  program->SetParameter(name, std::move(parameter));
+}
+
 }  // namespace dialect
 }  // namespace paddle
diff --git a/paddle/fluid/pir/dialect/operator/ir/api_builder.h b/paddle/fluid/pir/dialect/operator/ir/api_builder.h
index a06f529d2c5be..060102de4bde0 100644
--- a/paddle/fluid/pir/dialect/operator/ir/api_builder.h
+++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.h
@@ -17,6 +17,7 @@
 
 #include "paddle/pir/core/builder.h"
 #include "paddle/pir/core/macros.h"
+#include "paddle/pir/core/parameter.h"
 #include "paddle/pir/core/program.h"
 
 namespace paddle {
@@ -40,6 +41,11 @@ class APIBuilder {
 
   void ResetInsertionPointToEnd();
 
+  pir::Parameter* GetParameter(const std::string& name) const;
+
+  void SetParameter(const std::string& name,
+                    std::unique_ptr<pir::Parameter>&& parameter);
+
   std::shared_ptr<pir::Builder> GetBuilder() { return builder_; }
 
  private:
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
index ba8fc47744ed3..24e7a94b66650 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
@@ -18,7 +18,7 @@
 #include "paddle/fluid/pir/dialect/operator/ir/pd_api.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/pir/core/builtin_op.h"
-
+#include "paddle/pir/core/parameter.h"
 namespace paddle {
 namespace dialect {
 
@@ -46,25 +46,18 @@ pir::OpResult zeros_like(pir::Value x,
   return paddle::dialect::full_like(x, 0, dtype, place);
 }
 
-pir::OpResult get_parameter(const std::string& name,
-                            phi::DataType dtype,
-                            const std::vector<int64_t>& shape) {
-  phi::LoD lod;
-  size_t offset{0};
-  pir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get(
-      pir::IrContext::Instance(),
-      TransToIrDataType(dtype),
-      phi::DDim(shape.data(), shape.size()),
-      phi::DataLayout::UNDEFINED,
-      lod,
-      offset);
+pir::OpResult get_parameter(const std::string& name) {
+  pir::Parameter* param = APIBuilder::Instance().GetParameter(name);
   pir::GetParameterOp get_parameter_op =
       APIBuilder::Instance().GetBuilder()->Build<pir::GetParameterOp>(
-          name, out_dense_tensor_type);
+          name, param->type());
   return get_parameter_op.result(0);
 }
 
 void set_parameter(pir::Value parameter, const std::string& name) {
+  std::unique_ptr<pir::Parameter> param(
+      new pir::Parameter(nullptr, 0, parameter.type()));
+  APIBuilder::Instance().SetParameter(name, std::move(param));
   APIBuilder::Instance().GetBuilder()->Build<pir::SetParameterOp>(parameter,
                                                                   name);
 }
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h
index 7e5aba6fcbaa8..c919448f1ddb0 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_api.h
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h
@@ -32,9 +32,7 @@ pir::OpResult zeros_like(pir::Value x,
                          phi::DataType dtype = phi::DataType::UNDEFINED,
                          const Place& place = {});
 
-pir::OpResult get_parameter(const std::string& name,
-                            phi::DataType dtype,
-                            const std::vector<int64_t>& shape);
+pir::OpResult get_parameter(const std::string& name);
 
 void set_parameter(pir::Value parameter, const std::string& name);
 
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index db3faebb1985b..913d7d6f7aa80 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -153,6 +153,11 @@ void BindProgram(py::module *m) {
            [](const std::shared_ptr<Program> &self) {
              return self->parameters_num();
            })
+      .def("move_parameters_from",
+           [](const std::shared_ptr<Program> &self,
+              const std::shared_ptr<Program> &other) {
+             self->set_parameters(std::move(other->parameters()));
+           })
       .def(
           "global_block",
           [](std::shared_ptr<Program> self) { return self->block(); },
@@ -375,9 +380,19 @@ void BindOpOperand(py::module *m) {
 bool GetOpResultBoolAttr(const OpResult &self, const std::string &attr_name) {
   auto *defining_op = self.owner();
   if (defining_op->HasAttribute(attr_name)) {
+    PADDLE_ENFORCE(
+        defining_op->attribute(attr_name).isa<pir::ArrayAttribute>(),
+        paddle::platform::errors::InvalidArgument(
+            "%s: Callstack attributes of %s is not ArrayAttribute type",
+            attr_name));
     auto attrs = defining_op->attribute(attr_name)
                      .dyn_cast<pir::ArrayAttribute>()
                      .AsVector();
+    PADDLE_ENFORCE(attrs[self.index()].isa<pir::BoolAttribute>(),
+                   paddle::platform::errors::InvalidArgument(
+                       "The index %d in %s is not BoolAttribute type",
+                       self.index(),
+                       attr_name));
     return attrs[self.index()].dyn_cast<pir::BoolAttribute>().data();
   } else {
     return true;
diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h
index 68b9e22ec7f94..7c32b2ab1d4fa 100644
--- a/paddle/fluid/pybind/manual_static_op_function.h
+++ b/paddle/fluid/pybind/manual_static_op_function.h
@@ -35,13 +35,8 @@ static PyObject *static_api_get_parameter(PyObject *self,
     // Parse Attributes
     PyObject *name_obj = PyTuple_GET_ITEM(args, 0);
     std::string name = CastPyArg2String(name_obj, "name", 0);
-    PyObject *dtype_obj = PyTuple_GET_ITEM(args, 1);
-    phi::DataType dtype = CastPyArg2DataTypeDirectly(dtype_obj, "dtype", 1);
-    PyObject *shape_obj = PyTuple_GET_ITEM(args, 2);
-    phi::IntArray shape = CastPyArg2IntArray(shape_obj, "shape", 2);
     // Call ir static api
-    auto static_api_out =
-        paddle::dialect::get_parameter(name, dtype, shape.GetData());
+    auto static_api_out = paddle::dialect::get_parameter(name);
 
     return ToPyObject(static_api_out);
   } catch (...) {
diff --git a/python/paddle/base/data_feeder.py b/python/paddle/base/data_feeder.py
index 40154e1a0d429..78781a6856af1 100644
--- a/python/paddle/base/data_feeder.py
+++ b/python/paddle/base/data_feeder.py
@@ -47,7 +47,7 @@
 _PADDLE_NEW_IR_DTYPE_2_NUMPY_DTYPE = {
     core.DataType.BOOL: 'bool',
     core.DataType.FLOAT16: 'float16',
-    core.DataType.UINT16: 'uint16',
+    core.DataType.BFLOAT16: 'uint16',
     core.DataType.FLOAT32: 'float32',
     core.DataType.FLOAT64: 'float64',
     core.DataType.INT8: 'int8',
diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py
index 9ea3d566c824a..e5fddd15329e3 100755
--- a/python/paddle/base/executor.py
+++ b/python/paddle/base/executor.py
@@ -515,11 +515,12 @@ def _add_pir_fetch_ops(program, fetch_list, fetch_var_name):
     if not has_fetch_operations(
         global_block, fetch_list, fetch_var_name, fetch_op
     ):
-        for i, fetch_input in enumerate(fetch_list):
-            assert isinstance(
-                fetch_input, OpResult
-            ), "Wrong type for fetch_list[%s]: %s" % (i, type(fetch_input))
-            paddle._ir_ops.fetch(fetch_input, fetch_var_name + str(i), i)
+        with paddle.static.program_guard(program):
+            for i, fetch_input in enumerate(fetch_list):
+                assert isinstance(
+                    fetch_input, OpResult
+                ), "Wrong type for fetch_list[%s]: %s" % (i, type(fetch_input))
+                paddle._ir_ops.fetch(fetch_input, fetch_var_name + str(i), i)
 
 
 def _merge_tensors(tensor, micro_batch_num):
diff --git a/python/paddle/ir/core.py b/python/paddle/ir/core.py
index 0ce01ebb3f593..908319458ed39 100644
--- a/python/paddle/ir/core.py
+++ b/python/paddle/ir/core.py
@@ -251,6 +251,12 @@ def program_guard(main_program, startup_program=None):
             switch_startup_program(startup_program)
 
 
+class ParameterMeta:
+    def __init__(self, shape, dtype):
+        self.shape = shape
+        self.dtype = dtype
+
+
 def create_parameter(
     dtype,
     shape,
@@ -266,19 +272,21 @@ def create_parameter(
     op_result_name = unique_name.generate('parameter')
     startup_program = default_startup_program()
     main_program = default_main_program()
-
-    with program_guard(default_main_program()):
-        param = get_parameter(op_result_name, dtype, shape)
-        trainable = kwargs.get('trainable', True)
-        param.stop_gradient = not trainable
-        param.is_persistable = True
+    parameter_meta = ParameterMeta(shape, dtype)
 
     with program_guard(startup_program):
         initializer = kwargs['initializer']
         init_result = initializer(
-            param, param.get_defining_op().get_parent_block()
+            parameter_meta, startup_program.global_block()
         )
         init_result.is_persistable = True
         set_parameter(init_result, op_result_name)
 
+    main_program.move_parameters_from(startup_program)
+    with program_guard(default_main_program()):
+        param = get_parameter(op_result_name, dtype, shape)
+        trainable = kwargs.get('trainable', True)
+        param.stop_gradient = not trainable
+        param.is_persistable = True
+
     return param
diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py
index bc2baf08c9bb1..b4e9ee1df266a 100644
--- a/python/paddle/nn/initializer/constant.py
+++ b/python/paddle/nn/initializer/constant.py
@@ -58,7 +58,12 @@ def forward(self, var, block=None):
 
         assert isinstance(
             var,
-            (framework.Variable, framework.EagerParamBase, paddle.ir.OpResult),
+            (
+                framework.Variable,
+                framework.EagerParamBase,
+                paddle.ir.OpResult,
+                paddle.ir.core.ParameterMeta,
+            ),
         )
         assert isinstance(block, (framework.Block, paddle.ir.Block))
 
diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py
index 7f479111fba3d..40eb6a874c9da 100644
--- a/python/paddle/nn/initializer/xavier.py
+++ b/python/paddle/nn/initializer/xavier.py
@@ -88,12 +88,13 @@ def forward(self, var, block=None):
 
         block = self._check_block(block)
         assert isinstance(block, (framework.Block, paddle.ir.Block))
-        check_variable_and_dtype(
-            var,
-            "Out",
-            ["uint16", "float16", "float32", "float64"],
-            "xavier_init",
-        )
+        if not isinstance(var, paddle.ir.core.ParameterMeta):
+            check_variable_and_dtype(
+                var,
+                "Out",
+                ["uint16", "float16", "float32", "float64"],
+                "xavier_init",
+            )
 
         f_in, f_out = self._compute_fans(var)
 
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 5cdd91b075426..56c553bce797e 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -889,8 +889,6 @@ def divide(x, y, name=None):
     """
     if in_dynamic_or_pir_mode():
         return _C_ops.divide(x, y)
-    elif in_pir_mode():
-        return paddle._ir_ops.divide(x, y)
     else:
         return _elementwise_op(LayerHelper('elementwise_div', **locals()))
 
diff --git a/test/ir/new_ir/test_build_model.py b/test/ir/new_ir/test_build_model.py
index f356cfc24ffdf..a6ddae7c443ea 100644
--- a/test/ir/new_ir/test_build_model.py
+++ b/test/ir/new_ir/test_build_model.py
@@ -31,12 +31,12 @@ def test_basic_network(self):
             exe = paddle.static.Executor()
             x_feed = np.ones([4, 4], dtype=np.float32) * 10
             y_feed = np.ones([4, 4], dtype=np.float32) * 2
-            (sum_value,) = exe.run(
-                main_program,
-                feed={'x': x_feed, 'y': y_feed},
-                fetch_list=[sum_out],
-            )
-            self.assertEqual(sum_value, 5 * 4 * 4)
+        (sum_value,) = exe.run(
+            main_program,
+            feed={'x': x_feed, 'y': y_feed},
+            fetch_list=[sum_out],
+        )
+        self.assertEqual(sum_value, 5 * 4 * 4)
 
         main_program = paddle.static.Program()
         with paddle.static.program_guard(main_program):

From 47040ef6c6df4b95617a58636b3c13ab64112a5a Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Thu, 21 Sep 2023 10:31:27 +0800
Subject: [PATCH 014/115] add all cast newir test (#57527)

---
 test/legacy_test/test_cast_op.py | 42 ++++++++++++++++++++++++++------
 1 file changed, 34 insertions(+), 8 deletions(-)

diff --git a/test/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py
index e24eb6b44b631..47bc23d76f601 100644
--- a/test/legacy_test/test_cast_op.py
+++ b/test/legacy_test/test_cast_op.py
@@ -78,10 +78,16 @@ def setUp(self):
         self.public_python_api = cast_wrapper
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_grad(self):
-        self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True)
+        self.check_grad(
+            ['X'],
+            ['Out'],
+            check_prim=True,
+            only_check_prim=True,
+            check_new_ir=True,
+        )
 
 
 class TestCastOpFp32ToFp16(OpTest):
@@ -99,10 +105,16 @@ def setUp(self):
         self.public_python_api = cast_wrapper
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_grad(self):
-        self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True)
+        self.check_grad(
+            ['X'],
+            ['Out'],
+            check_prim=True,
+            only_check_prim=True,
+            check_new_ir=True,
+        )
 
 
 @unittest.skipIf(
@@ -128,10 +140,16 @@ def if_enable_cinn(self):
         self.enable_cinn = False
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_grad(self):
-        self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True)
+        self.check_grad(
+            ['X'],
+            ['Out'],
+            check_prim=True,
+            only_check_prim=True,
+            check_new_ir=True,
+        )
 
 
 @unittest.skipIf(
@@ -157,20 +175,28 @@ def if_enable_cinn(self):
         self.enable_cinn = False
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_grad(self):
-        self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True)
+        self.check_grad(
+            ['X'],
+            ['Out'],
+            check_prim=True,
+            only_check_prim=True,
+            check_new_ir=True,
+        )
 
 
 class TestCastOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program(), Program()):
             # The input type of cast_op must be Variable.
             x1 = base.create_lod_tensor(
                 np.array([[-1]]), [[1]], base.CPUPlace()
             )
             self.assertRaises(TypeError, paddle.cast, x1, 'int32')
+        paddle.disable_static()
 
 
 class TestCastOpEager(unittest.TestCase):

From 7bf03d344d53dd45ca23611d9de342e1e95c67d5 Mon Sep 17 00:00:00 2001
From: chen2016013 <111894720+chen2016013@users.noreply.github.com>
Date: Thu, 21 Sep 2023 10:43:39 +0800
Subject: [PATCH 015/115] [PIR] Print value info on python (#57471)

* fix bug

* rewrite __str__ in value and opresult to print info

* fix bug

* change as reviewed comments

* change as reviewed comments

* fix print str
---
 paddle/fluid/pybind/ir.cc        | 37 +++++++++++++++++++++++++++++++-
 paddle/pir/core/ir_printer.cc    |  5 +++++
 paddle/pir/core/value.h          |  2 ++
 test/ir/new_ir/test_ir_pybind.py | 10 ++++++++-
 4 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 913d7d6f7aa80..22fd0f40a36b5 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -27,6 +27,7 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/ir_adaptor/translator/translate.h"
 #include "paddle/fluid/ir_adaptor/translator/utils.h"
+#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h"
 #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
 #include "paddle/fluid/pir/dialect/operator/ir/api_builder.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
@@ -91,6 +92,20 @@ inline void SetProgramInt64Attr(std::shared_ptr<Program> program,
       attr_name, pir::Int64Attribute::get(pir::IrContext::Instance(), value));
 }
 
+std::string GetValueInfo(Value v) {
+  std::stringstream ss;
+  ss << "define_op_name=" << v.dyn_cast<OpResult>().owner()->name();
+  ss << ", index=" << v.dyn_cast<OpResult>().index();
+  ss << ", dtype=" << v.type();
+  if (v.type().isa<paddle::dialect::AllocatedDenseTensorType>()) {
+    ss << ", place="
+       << v.type()
+              .dyn_cast<paddle::dialect::AllocatedDenseTensorType>()
+              .place();
+  }
+  return ss.str();
+}
+
 void BindProgram(py::module *m) {
   py::class_<Program, std::shared_ptr<Program>> program(*m, "Program", R"DOC(
     Create Python Program. Program is an abstraction of model structure, divided into
@@ -353,7 +368,14 @@ void BindValue(py::module *m) {
              return self.impl() == other.Value::impl();
            })
       .def("__hash__",
-           [](const Value &self) { return std::hash<pir::Value>{}(self); });
+           [](const Value &self) { return std::hash<pir::Value>{}(self); })
+      .def("__str__", [](const Value &self) -> py::str {
+        std::ostringstream print_stream;
+        print_stream << "Value(";
+        print_stream << GetValueInfo(self);
+        print_stream << ")";
+        return print_stream.str();
+      });
 }
 
 void BindOpOperand(py::module *m) {
@@ -472,6 +494,19 @@ void BindOpResult(py::module *m) {
            })
       .def("__hash__",
            [](OpResult &self) { return std::hash<pir::Value>{}(self); })
+      .def("__str__",
+           [](OpResult &self) -> py::str {
+             std::ostringstream print_stream;
+             print_stream << "OpResult(";
+             print_stream << GetValueInfo(self);
+             if (GetOpResultBoolAttr(self, kAttrStopGradients)) {
+               print_stream << ", stop_gradient=True";
+             } else {
+               print_stream << ", stop_gradient=False";
+             }
+             print_stream << ")";
+             return print_stream.str();
+           })
       .def(
           "get_defining_op",
           [](const OpResult &self) -> pir::Operation * {
diff --git a/paddle/pir/core/ir_printer.cc b/paddle/pir/core/ir_printer.cc
index 52c49be812104..260d42e035e4d 100644
--- a/paddle/pir/core/ir_printer.cc
+++ b/paddle/pir/core/ir_printer.cc
@@ -317,6 +317,11 @@ void Operation::Print(std::ostream& os) {
   printer.PrintOperation(this);
 }
 
+void Value::Print(std::ostream& os) const {
+  IrPrinter printer(os);
+  printer.PrintValue(*this);
+}
+
 void Type::Print(std::ostream& os) const {
   BasicIrPrinter printer(os);
   printer.PrintType(*this);
diff --git a/paddle/pir/core/value.h b/paddle/pir/core/value.h
index 81a1717540e3d..00c7aa123746e 100644
--- a/paddle/pir/core/value.h
+++ b/paddle/pir/core/value.h
@@ -72,6 +72,8 @@ class IR_API Value {
 
   OpOperand first_use() const;
 
+  void Print(std::ostream &os) const;
+
   bool use_empty() const;
 
   bool HasOneUse() const;
diff --git a/test/ir/new_ir/test_ir_pybind.py b/test/ir/new_ir/test_ir_pybind.py
index 34aa4c90c873f..b9a6fb92ac548 100644
--- a/test/ir/new_ir/test_ir_pybind.py
+++ b/test/ir/new_ir/test_ir_pybind.py
@@ -103,6 +103,11 @@ def test_value(self):
         )
         # test value == opresult
         self.assertEqual(add_op.operands_source()[0], matmul_op.results()[0])
+        # test opresult print
+        self.assertTrue(
+            'dtype=pd_op.tensor<4x4xf32>'
+            in add_op.operands_source()[0].__str__()
+        )
         # test opresult == value
         self.assertEqual(
             add_op.operands()[0].source(), add_op.operands_source()[0]
@@ -110,10 +115,13 @@ def test_value(self):
         # test opresult == opresult
         self.assertEqual(add_op.operands()[0].source(), matmul_op.results()[0])
 
+        # test opresult print
         self.assertEqual(
             tanh_op.operands()[0].source().get_defining_op().name(), "pd_op.add"
         )
-
+        self.assertTrue(
+            'pd_op.tensor<4x4xf32>' in tanh_op.operands()[0].source().__str__()
+        )
         add_op.replace_all_uses_with(matmul_op.results())
         self.assertEqual(
             tanh_op.operands()[0].source().get_defining_op().name(),

From 3fd69fa01736459182576d5c1916766f0e287714 Mon Sep 17 00:00:00 2001
From: Ruibin Cheung <beinggod@foxmail.com>
Date: Thu, 21 Sep 2023 10:54:50 +0800
Subject: [PATCH 016/115] [NewComm] No.10 compatiable upgrade for
 distributed_fused_lamb op (#57424)

* [NewComm] No.10 compatiable upgrade for distributed_fused_lamb op

* fix
---
 .../optimizers/distributed_fused_lamb_op.cu   | 354 ++++++++++++++----
 .../phi/core/distributed/nccl_comm_context.cc |  17 +
 .../phi/core/distributed/nccl_comm_context.h  |  20 +-
 test/legacy_test/CMakeLists.txt               |   4 +-
 .../distributed_fused_lamb_test_base.py       |   5 +-
 ...est_distributed_fused_lamb_op_with_clip.py |  18 +
 ...buted_fused_lamb_op_with_gradient_merge.py |  17 +
 7 files changed, 359 insertions(+), 76 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index fdec898edbe91..a672f5ac99aa8 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -21,6 +21,7 @@
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/cuda_stream.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/utils/data_type.h"
@@ -28,6 +29,14 @@
 #include "paddle/phi/kernels/funcs/tensor_to_string.h"
 #include "paddle/utils/optional.h"
 
+#include "paddle/fluid/distributed/collective/utils.h"
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+#include "paddle/phi/core/flags.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
+#endif
+
 #ifdef __NVCC__
 #include "cub/cub.cuh"
 #include "math.h"  // NOLINT
@@ -48,6 +57,19 @@ using MasterT = typename phi::dtype::MPTypeTrait<T>::Type;
 using phi::funcs::FlattenToString;
 using phi::funcs::ToVector;
 
+static void CheckCommContextHasRingId(
+    const distributed::CommContextManager &comm_context_manager, int ring_id) {
+  PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)),
+                    true,
+                    paddle::platform::errors::InvalidArgument(
+                        "You choose to use new communication library by "
+                        "setting environment "
+                        "variable FLAGS_dynamic_static_unified_comm True. "
+                        "But ring_id(%d) is "
+                        "not found in comm_context_manager.",
+                        std::to_string(ring_id)));
+}
+
 template <typename T>
 static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) {
   static_assert(!std::is_same<T, void>::value, "T cannot be void.");
@@ -875,24 +897,68 @@ static void MultiTensorUpdateLambParamAndBetaPows(
 }
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-static bool CreatePreMulScaleOpIfSupported(ncclDataType_t dtype,
-                                           ncclComm_t comm,
-                                           const void *scale,
-                                           ncclRedOp_t *op) {
+static bool CreatePreMulScaleOpIfSupported(
+    ncclDataType_t dtype,
+    ncclComm_t comm,
+    const void *scale,
+    ncclRedOp_t *op,
+    distributed::NCCLCommContext *comm_ctx = nullptr) {
 #if NCCL_VERSION_CODE >= 21100
-  int ver;
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&ver));
-  if (ver >= 21100) {
-    VLOG(10) << "ncclRedOpCreatePreMulSum is supported.";
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum(
-        op, const_cast<void *>(scale), dtype, ncclScalarDevice, comm));
-    return true;
+  if (FLAGS_dynamic_static_unified_comm) {
+    PADDLE_ENFORCE_NOT_NULL(
+        comm_ctx,
+        phi::errors::InvalidArgument(
+            "You choose to use new communication library by "
+            "setting environment "
+            "variable FLAGS_dynamic_static_unified_comm True. "
+            "But parameter of comm_ctx should not be nullptr."));
+    int ver = comm_ctx->GetNcclVersion();
+    if (ver >= 21100) {
+      VLOG(10) << "ncclRedOpCreatePreMulSum is supported.";
+      comm_ctx->RedOpCreatePreMulSum(
+          op, const_cast<void *>(scale), dtype, ncclScalarDevice);
+      return true;
+    }
+  } else {
+    int ver;
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&ver));
+    if (ver >= 21100) {
+      VLOG(10) << "ncclRedOpCreatePreMulSum is supported.";
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum(
+          op, const_cast<void *>(scale), dtype, ncclScalarDevice, comm));
+      return true;
+    }
   }
 #endif
   VLOG(10) << "ncclRedOpCreatePreMulSum is not supported.";
   return false;
 }
 
+static void DestoryOpIfSupported(
+    ncclRedOp_t op,
+    ncclComm_t comm,
+    distributed::NCCLCommContext *comm_ctx = nullptr) {
+#if NCCL_VERSION_CODE >= 21100
+  VLOG(10) << "ncclRedOpDestroy starts";
+
+  if (FLAGS_dynamic_static_unified_comm) {
+    PADDLE_ENFORCE_NOT_NULL(
+        comm_ctx,
+        phi::errors::InvalidArgument(
+            "You choose to use new communication library by "
+            "setting environment "
+            "variable FLAGS_dynamic_static_unified_comm True. "
+            "But parameter of comm_ctx should not be nullptr."));
+    comm_ctx->RedOpDestroy(op);
+  } else {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, comm));
+  }
+  VLOG(10) << "ncclRedOpDestroy ends";
+
+#endif
+  VLOG(10) << "ncclRedOpDestroy is not supported.";
+}
+
 template <typename T1, typename T2>
 static void LaunchScaleKernel(const phi::GPUContext &dev_ctx,
                               const T1 *x,
@@ -922,7 +988,18 @@ static void NCCLSumWithScaleBase(const T *sendbuff,
                                  ncclComm_t comm,
                                  gpuStream_t stream,
                                  const phi::GPUContext &dev_ctx,
+                                 distributed::NCCLCommContext *comm_ctx,
                                  const T *scale = nullptr) {
+  if (FLAGS_dynamic_static_unified_comm) {
+    PADDLE_ENFORCE_NOT_NULL(
+        comm_ctx,
+        phi::errors::InvalidArgument(
+            "You choose to use new communication library by "
+            "setting environment "
+            "variable FLAGS_dynamic_static_unified_comm True. "
+            "But parameter of comm_ctx should not be nullptr."));
+  }
+
   static_assert(
       std::is_same<T, float>::value || std::is_same<T, dtype::float16>::value,
       "T must be either float32 or float16.");
@@ -943,8 +1020,8 @@ static void NCCLSumWithScaleBase(const T *sendbuff,
   ncclRedOp_t op = ncclSum;
   ncclDataType_t dtype =
       std::is_same<T, float>::value ? ncclFloat32 : ncclFloat16;
-  bool should_destroy_op =
-      scale && CreatePreMulScaleOpIfSupported(dtype, comm, scale, &op);
+  bool should_destroy_op = scale && CreatePreMulScaleOpIfSupported(
+                                        dtype, comm, scale, &op, comm_ctx);
   memory_utils::Buffer buffer(dev_ctx.GetPlace());
   if (scale && !should_destroy_op) {
     T *new_sendbuff = buffer.Alloc<T>(numel);
@@ -952,21 +1029,44 @@ static void NCCLSumWithScaleBase(const T *sendbuff,
     sendbuff = new_sendbuff;
   }
 
-  if (UseReduceScatter) {
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduceScatter(
-        sendbuff, recvbuff, recvcount, dtype, op, comm, stream));
+  if (comm_ctx) {
+    // Here assume comm_ctx->GetNcclComm() have higher priority than comm
+    if (UseReduceScatter) {
+      // TODO(BeingGod): NCCLCommContext::ReduceScatter only accept DenseTensor,
+      // but sendbuff or recvbuff maybe allocated by Buffer.
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          phi::dynload::ncclReduceScatter(sendbuff,
+                                          recvbuff,
+                                          recvcount,
+                                          dtype,
+                                          op,
+                                          comm_ctx->GetNcclComm(),
+                                          stream));
+    } else {
+      // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor,
+      // but sendbuff or recvbuff maybe allocated by Buffer.
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          phi::dynload::ncclAllReduce(sendbuff,
+                                      recvbuff,
+                                      recvcount,
+                                      dtype,
+                                      op,
+                                      comm_ctx->GetNcclComm(),
+                                      stream));
+    }
   } else {
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
-        sendbuff, recvbuff, recvcount, dtype, op, comm, stream));
+    if (UseReduceScatter) {
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduceScatter(
+          sendbuff, recvbuff, recvcount, dtype, op, comm, stream));
+    } else {
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
+          sendbuff, recvbuff, recvcount, dtype, op, comm, stream));
+    }
   }
 
-#if NCCL_VERSION_CODE >= 21100
   if (should_destroy_op) {
-    VLOG(10) << "ncclRedOpDestroy starts";
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, comm));
-    VLOG(10) << "ncclRedOpDestroy ends";
+    DestoryOpIfSupported(op, comm, comm_ctx);
   }
-#endif
 }
 
 template <typename T>
@@ -977,9 +1077,17 @@ static void NCCLReduceScatterWithScale(const T *sendbuff,
                                        ncclComm_t comm,
                                        gpuStream_t stream,
                                        const phi::GPUContext &dev_ctx,
+                                       distributed::NCCLCommContext *comm_ctx,
                                        const T *scale = nullptr) {
-  NCCLSumWithScaleBase<T, true>(
-      sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale);
+  NCCLSumWithScaleBase<T, true>(sendbuff,
+                                recvbuff,
+                                recvcount,
+                                nranks,
+                                comm,
+                                stream,
+                                dev_ctx,
+                                comm_ctx,
+                                scale);
 }
 
 template <typename T>
@@ -990,9 +1098,17 @@ static void NCCLAllReduceWithScale(const T *sendbuff,
                                    ncclComm_t comm,
                                    gpuStream_t stream,
                                    const phi::GPUContext &dev_ctx,
+                                   distributed::NCCLCommContext *comm_ctx,
                                    const T *scale = nullptr) {
-  NCCLSumWithScaleBase<T, false>(
-      sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale);
+  NCCLSumWithScaleBase<T, false>(sendbuff,
+                                 recvbuff,
+                                 recvcount,
+                                 nranks,
+                                 comm,
+                                 stream,
+                                 dev_ctx,
+                                 comm_ctx,
+                                 scale);
 }
 
 #endif
@@ -1643,26 +1759,71 @@ void DistributedFusedLambKernel(
   int64_t global_rank = 0, local_rank = 0;
   ncclComm_t global_comm = nullptr, local_comm = nullptr,
              external_comm = nullptr;
-  if (nranks > 1) {
-    auto *nccl_comm_handle =
-        paddle::platform::NCCLCommContext::Instance().Get(ring_ids[0], place);
-    global_comm = nccl_comm_handle->comm();
-    global_rank = nccl_comm_handle->rank();
+  paddle::platform::NCCLComm *nccl_comm_handle = nullptr,
+                             *local_nccl_comm_handle = nullptr;
+  distributed::NCCLCommContext *comm_ctx = nullptr, *local_comm_ctx = nullptr,
+                               *external_comm_ctx = nullptr;
+
+  const auto &comm_context_manager =
+      phi::distributed::CommContextManager::GetInstance();
+
+  if (FLAGS_dynamic_static_unified_comm) {
+    CheckCommContextHasRingId(comm_context_manager, ring_ids[0]);
+
+    comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
+        comm_context_manager.Get(std::to_string(ring_ids[0])));
+    PADDLE_ENFORCE_NE(comm_ctx,
+                      nullptr,
+                      paddle::platform::errors::Unavailable(
+                          "NCCLCommContext is nullptr, collective op should "
+                          "has ring_id attr."));
+
+    global_comm = comm_ctx->GetNcclComm();
+    global_rank = comm_ctx->GetRank();
     if (local_shard) {
-      auto *local_nccl_comm_handle =
-          paddle::platform::NCCLCommContext::Instance().Get(ring_ids[1], place);
-      local_comm = local_nccl_comm_handle->comm();
-      local_rank = local_nccl_comm_handle->rank();
+      CheckCommContextHasRingId(comm_context_manager, ring_ids[1]);
+
+      local_comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
+          comm_context_manager.Get(std::to_string(ring_ids[1])));
+      local_comm = local_comm_ctx->GetNcclComm();
+      local_rank = local_comm_ctx->GetRank();
       if (use_hierarchical_allreduce) {
-        external_comm = paddle::platform::NCCLCommContext::Instance()
-                            .Get(ring_ids[2], place)
-                            ->comm();
+        CheckCommContextHasRingId(comm_context_manager, ring_ids[2]);
+
+        external_comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
+            comm_context_manager.Get(std::to_string(ring_ids[2])));
+        external_comm = external_comm_ctx->GetNcclComm();
       }
     } else {
       local_comm = global_comm;
       local_rank = global_rank;
     }
+
+    VLOG(3) << "new comm_context_manager has ring_id " << ring_ids[0];
+  } else {
+    if (nranks > 1) {
+      nccl_comm_handle =
+          paddle::platform::NCCLCommContext::Instance().Get(ring_ids[0], place);
+      global_comm = nccl_comm_handle->comm();
+      global_rank = nccl_comm_handle->rank();
+      if (local_shard) {
+        local_nccl_comm_handle =
+            paddle::platform::NCCLCommContext::Instance().Get(ring_ids[1],
+                                                              place);
+        local_comm = local_nccl_comm_handle->comm();
+        local_rank = local_nccl_comm_handle->rank();
+        if (use_hierarchical_allreduce) {
+          external_comm = paddle::platform::NCCLCommContext::Instance()
+                              .Get(ring_ids[2], place)
+                              ->comm();
+        }
+      } else {
+        local_comm = global_comm;
+        local_rank = global_rank;
+      }
+    }
   }
+
   memory_utils::Buffer grad_norm_square_buffer(place);
   auto *fp32_square_grad_norm = grad_norm_square_buffer.Alloc<float>(2);
   memory_utils::Buffer cub_tmp_buffer(place);
@@ -1715,7 +1876,8 @@ void DistributedFusedLambKernel(
               num_devices,
               local_comm,
               stream,
-              dev_ctx);
+              dev_ctx,
+              local_comm_ctx);
           NCCLAllReduceWithScale(
               fp32_sum_grad + local_rank * fp32_numel_each_device,
               fp32_sum_grad + local_rank * fp32_numel_each_device,
@@ -1723,7 +1885,8 @@ void DistributedFusedLambKernel(
               nranks / num_devices,
               external_comm,
               stream,
-              dev_ctx);
+              dev_ctx,
+              external_comm_ctx);
 
           NCCLReduceScatterWithScale(
               fp16_grad_data,
@@ -1732,7 +1895,8 @@ void DistributedFusedLambKernel(
               num_devices,
               local_comm,
               stream,
-              dev_ctx);
+              dev_ctx,
+              local_comm_ctx);
           NCCLAllReduceWithScale(
               fp16_sum_grad + local_rank * fp16_numel_each_device,
               fp16_sum_grad + local_rank * fp16_numel_each_device,
@@ -1740,7 +1904,8 @@ void DistributedFusedLambKernel(
               nranks / num_devices,
               external_comm,
               stream,
-              dev_ctx);
+              dev_ctx,
+              external_comm_ctx);
         } else {
           NCCLAllReduceWithScale(fp32_grad_data,
                                  fp32_sum_grad,
@@ -1748,14 +1913,16 @@ void DistributedFusedLambKernel(
                                  nranks,
                                  global_comm,
                                  stream,
-                                 dev_ctx);
+                                 dev_ctx,
+                                 comm_ctx);
           NCCLAllReduceWithScale(fp16_grad_data,
                                  fp16_sum_grad,
                                  fp16_numel,
                                  nranks,
                                  global_comm,
                                  stream,
-                                 dev_ctx);
+                                 dev_ctx,
+                                 comm_ctx);
         }
         fp32_sum_grad += (local_rank * fp32_numel_each_device);
         fp16_sum_grad += (local_rank * fp16_numel_each_device);
@@ -1766,14 +1933,16 @@ void DistributedFusedLambKernel(
                                    nranks,
                                    global_comm,
                                    stream,
-                                   dev_ctx);
+                                   dev_ctx,
+                                   comm_ctx);
         NCCLReduceScatterWithScale(fp16_grad_data,
                                    fp16_sum_grad,
                                    fp16_numel_each_device,
                                    nranks,
                                    global_comm,
                                    stream,
-                                   dev_ctx);
+                                   dev_ctx,
+                                   comm_ctx);
       }
       // (2) Calculate the global grad norm
       GetSquareGradNorm(fp32_sum_grad,
@@ -1786,6 +1955,8 @@ void DistributedFusedLambKernel(
       VLOG(1) << "Grad square norm before all reduce: "
               << FlattenToString(fp32_square_grad_norm, 1, place);
       if (num_devices > 1) {
+        // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor,
+        // but fp32_square_grad_norm is allocated by Buffer.
         PADDLE_ENFORCE_GPU_SUCCESS(
             phi::dynload::ncclAllReduce(fp32_square_grad_norm,
                                         fp32_square_grad_norm,
@@ -1852,6 +2023,7 @@ void DistributedFusedLambKernel(
               local_comm,
               stream,
               dev_ctx,
+              local_comm_ctx,
               fp32_scale);
           NCCLAllReduceWithScale(
               fp32_sum_grad + local_rank * fp32_numel_each_device,
@@ -1860,8 +2032,8 @@ void DistributedFusedLambKernel(
               nranks / num_devices,
               external_comm,
               stream,
-              dev_ctx);
-
+              dev_ctx,
+              external_comm_ctx);
           NCCLReduceScatterWithScale(
               fp16_grad_data,
               fp16_sum_grad + local_rank * fp16_numel_each_device,
@@ -1870,6 +2042,7 @@ void DistributedFusedLambKernel(
               local_comm,
               stream,
               dev_ctx,
+              local_comm_ctx,
               fp16_scale);
           NCCLAllReduceWithScale(
               fp16_sum_grad + local_rank * fp16_numel_each_device,
@@ -1878,7 +2051,8 @@ void DistributedFusedLambKernel(
               nranks / num_devices,
               external_comm,
               stream,
-              dev_ctx);
+              dev_ctx,
+              external_comm_ctx);
         } else {
           NCCLAllReduceWithScale(fp32_grad_data,
                                  fp32_sum_grad,
@@ -1887,6 +2061,7 @@ void DistributedFusedLambKernel(
                                  global_comm,
                                  stream,
                                  dev_ctx,
+                                 comm_ctx,
                                  fp32_scale);
           NCCLAllReduceWithScale(fp16_grad_data,
                                  fp16_sum_grad,
@@ -1895,6 +2070,7 @@ void DistributedFusedLambKernel(
                                  global_comm,
                                  stream,
                                  dev_ctx,
+                                 comm_ctx,
                                  fp16_scale);
         }
         fp32_sum_grad += (local_rank * fp32_numel_each_device);
@@ -1907,6 +2083,7 @@ void DistributedFusedLambKernel(
                                    global_comm,
                                    stream,
                                    dev_ctx,
+                                   comm_ctx,
                                    fp32_scale);
         NCCLReduceScatterWithScale(fp16_grad_data,
                                    fp16_sum_grad,
@@ -1915,6 +2092,7 @@ void DistributedFusedLambKernel(
                                    global_comm,
                                    stream,
                                    dev_ctx,
+                                   comm_ctx,
                                    fp16_scale);
       }
       VLOG(1) << "FP32 HasNanInf after all reduce: "
@@ -1929,6 +2107,8 @@ void DistributedFusedLambKernel(
                          stream,
                          &cub_tmp_buffer);
       if (num_devices > 1) {
+        // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor,
+        // but fp32_square_grad_norm is allocated by Buffer.
         PADDLE_ENFORCE_GPU_SUCCESS(
             phi::dynload::ncclAllReduce(fp32_square_grad_norm,
                                         fp32_square_grad_norm,
@@ -1954,7 +2134,8 @@ void DistributedFusedLambKernel(
             num_devices,
             local_comm,
             stream,
-            dev_ctx);
+            dev_ctx,
+            local_comm_ctx);
         NCCLAllReduceWithScale(
             fp32_sum_grad + local_rank * fp32_numel_each_device,
             fp32_sum_grad + local_rank * fp32_numel_each_device,
@@ -1962,7 +2143,8 @@ void DistributedFusedLambKernel(
             nranks / num_devices,
             external_comm,
             stream,
-            dev_ctx);
+            dev_ctx,
+            external_comm_ctx);
         NCCLReduceScatterWithScale(
             fp16_grad_data,
             fp16_sum_grad + local_rank * fp16_numel_each_device,
@@ -1970,7 +2152,8 @@ void DistributedFusedLambKernel(
             num_devices,
             local_comm,
             stream,
-            dev_ctx);
+            dev_ctx,
+            local_comm_ctx);
         NCCLAllReduceWithScale(
             fp16_sum_grad + local_rank * fp16_numel_each_device,
             fp16_sum_grad + local_rank * fp16_numel_each_device,
@@ -1978,7 +2161,8 @@ void DistributedFusedLambKernel(
             nranks / num_devices,
             external_comm,
             stream,
-            dev_ctx);
+            dev_ctx,
+            external_comm_ctx);
       } else {
         NCCLAllReduceWithScale(fp32_grad_data,
                                fp32_sum_grad,
@@ -1986,14 +2170,16 @@ void DistributedFusedLambKernel(
                                nranks,
                                global_comm,
                                stream,
-                               dev_ctx);
+                               dev_ctx,
+                               comm_ctx);
         NCCLAllReduceWithScale(fp16_grad_data,
                                fp16_sum_grad,
                                fp16_numel,
                                nranks,
                                global_comm,
                                stream,
-                               dev_ctx);
+                               dev_ctx,
+                               comm_ctx);
       }
       fp32_sum_grad += (local_rank * fp32_numel_each_device);
       fp16_sum_grad += (local_rank * fp16_numel_each_device);
@@ -2004,14 +2190,16 @@ void DistributedFusedLambKernel(
                                  num_devices,
                                  global_comm,
                                  stream,
-                                 dev_ctx);
+                                 dev_ctx,
+                                 comm_ctx);
       NCCLReduceScatterWithScale(fp16_grad_data,
                                  fp16_sum_grad,
                                  fp16_numel_each_device,
                                  num_devices,
                                  global_comm,
                                  stream,
-                                 dev_ctx);
+                                 dev_ctx,
+                                 comm_ctx);
     }
     CheckHasNanInfGrad(fp32_sum_grad,
                        fp32_numel_each_device,
@@ -2021,6 +2209,8 @@ void DistributedFusedLambKernel(
                        stream,
                        &cub_tmp_buffer);
     if (num_devices > 1) {
+      // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor,
+      // but fp32_square_grad_norm is allocated by Buffer.
       PADDLE_ENFORCE_GPU_SUCCESS(
           phi::dynload::ncclAllReduce(fp32_square_grad_norm,
                                       fp32_square_grad_norm,
@@ -2165,6 +2355,8 @@ void DistributedFusedLambKernel(
           << FlattenToString(trust_ratio_div_square_norm, param_num, place);
   if (num_devices > 1) {
     if (use_master_param_norm) {
+      // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor,
+      // but param_square_norm is allocated by Buffer.
       PADDLE_ENFORCE_GPU_SUCCESS(
           phi::dynload::ncclAllReduce(param_square_norm + fp32_global_param_num,
                                       param_square_norm + fp32_global_param_num,
@@ -2174,6 +2366,8 @@ void DistributedFusedLambKernel(
                                       local_comm,
                                       stream));
     } else {
+      // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor,
+      // but trust_ratio_div_square_norm is allocated by Buffer.
       PADDLE_ENFORCE_GPU_SUCCESS(
           phi::dynload::ncclAllReduce(trust_ratio_div_square_norm,
                                       trust_ratio_div_square_norm,
@@ -2209,13 +2403,21 @@ void DistributedFusedLambKernel(
         beta2);
     if (num_devices > 1) {
       // ncclAllGather
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          phi::dynload::ncclAllGather(fp32_param_data + fp32_offset,
-                                      fp32_param_data,
-                                      fp32_numel_each_device,
-                                      ncclFloat32,
-                                      local_comm,
-                                      stream));
+      if (local_comm_ctx) {
+        auto send_buf = paddle::distributed::GetPartialTensor(
+            *fp32_param_out, fp32_offset, fp32_numel_each_device);
+        auto recv_buf = paddle::distributed::GetPartialTensor(
+            *fp32_param_out, 0, fp32_numel_each_device);
+        local_comm_ctx->AllGather(&recv_buf, send_buf, stream);
+      } else {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            phi::dynload::ncclAllGather(fp32_param_data + fp32_offset,
+                                        fp32_param_data,
+                                        fp32_numel_each_device,
+                                        ncclFloat32,
+                                        local_comm,
+                                        stream));
+      }
     }
 
     beta1_pow_data = nullptr;
@@ -2239,13 +2441,21 @@ void DistributedFusedLambKernel(
         beta2);
     if (num_devices > 1) {
       // ncclAllGather
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          phi::dynload::ncclAllGather(fp16_param_data + fp16_offset,
-                                      fp16_param_data,
-                                      fp16_numel_each_device,
-                                      ncclFloat16,
-                                      local_comm,
-                                      stream));
+      if (local_comm_ctx) {
+        auto send_buf = paddle::distributed::GetPartialTensor(
+            *fp16_param_out, fp16_offset, fp16_numel_each_device);
+        auto recv_buf = paddle::distributed::GetPartialTensor(
+            *fp16_param_out, 0, fp16_numel_each_device);
+        local_comm_ctx->AllGather(&recv_buf, send_buf, stream);
+      } else {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            phi::dynload::ncclAllGather(fp16_param_data + fp16_offset,
+                                        fp16_param_data,
+                                        fp16_numel_each_device,
+                                        ncclFloat16,
+                                        local_comm,
+                                        stream));
+      }
     }
   }
   VLOG(10) << "Update Param done";
diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc
index 90b6a4c447c92..bd49f0cff1708 100644
--- a/paddle/phi/core/distributed/nccl_comm_context.cc
+++ b/paddle/phi/core/distributed/nccl_comm_context.cc
@@ -33,8 +33,11 @@ NCCLCommContext::NCCLCommContext(int rank, int size, ncclUniqueId nccl_id)
     : CommContext(rank, size) {
   PADDLE_ENFORCE_GPU_SUCCESS(
       phi::dynload::ncclCommInitRank(&nccl_comm_, size_, nccl_id, rank_));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&nccl_version_));
 }
 
+int NCCLCommContext::GetNcclVersion() { return nccl_version_; }
+
 ncclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; }
 
 gpuStream_t NCCLCommContext::GetStream() { return dev_ctx_->stream(); }
@@ -228,5 +231,19 @@ void NCCLCommContext::GroupStart() {
 }
 void NCCLCommContext::GroupEnd() { NCCL_CHECK(phi::dynload::ncclGroupEnd()); }
 
+#if NCCL_VERSION_CODE >= 21100
+void NCCLCommContext::RedOpCreatePreMulSum(ncclRedOp_t* op,
+                                           void* scalar,
+                                           ncclDataType_t dtype,
+                                           ncclScalarResidence_t residence) {
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum(
+      op, scalar, dtype, residence, nccl_comm_));
+}
+
+void NCCLCommContext::RedOpDestroy(ncclRedOp_t op) {
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, nccl_comm_));
+}
+#endif
+
 }  // namespace distributed
 }  // namespace phi
diff --git a/paddle/phi/core/distributed/nccl_comm_context.h b/paddle/phi/core/distributed/nccl_comm_context.h
index fdd45793a6387..b9fdce02f4b5f 100644
--- a/paddle/phi/core/distributed/nccl_comm_context.h
+++ b/paddle/phi/core/distributed/nccl_comm_context.h
@@ -40,7 +40,9 @@ namespace distributed {
 class NCCLCommContext final : public CommContext {
  public:
   NCCLCommContext(int rank, int size, ncclUniqueId nccl_id);
-  ~NCCLCommContext() {}
+  ~NCCLCommContext() override = default;
+
+  int GetNcclVersion();
 
   ncclComm_t GetNcclComm();
 
@@ -65,6 +67,7 @@ class NCCLCommContext final : public CommContext {
                  const phi::DenseTensor& in_tensor,
                  int root,
                  gpuStream_t stream);
+
   void Send(const phi::DenseTensor& in_tensor,
             const int64_t& count,
             const int& peer,
@@ -99,9 +102,24 @@ class NCCLCommContext final : public CommContext {
 
   void GroupEnd();
 
+#if NCCL_VERSION_CODE >= 21100
+  // Creates a new reduction operator which pre-multiplies input values by a
+  // given scalar locally before reducing them with peer values via summation.
+  void RedOpCreatePreMulSum(ncclRedOp_t* op,
+                            void* scalar,
+                            ncclDataType_t dtype,
+                            ncclScalarResidence_t residence);
+
+  // Destroys the reduction operator op. The operator must have been created by
+  // ncclRedOpCreatePreMul with the matching communicator comm.
+  void RedOpDestroy(ncclRedOp_t op);
+#endif
+
  private:
   DISABLE_COPY_AND_ASSIGN(NCCLCommContext);
 
+  int nccl_version_;
+
   ncclComm_t nccl_comm_;
 
   std::unique_ptr<phi::GPUContext> dev_ctx_;
diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt
index 9e7adef0a634f..e6a060c7369a9 100644
--- a/test/legacy_test/CMakeLists.txt
+++ b/test/legacy_test/CMakeLists.txt
@@ -1013,11 +1013,11 @@ set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu
                      PROPERTIES TIMEOUT 120)
 set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT
-                                                                         120)
+                                                                         240)
 set_tests_properties(test_distributed_fused_lamb_op_without_clip
                      PROPERTIES TIMEOUT 120)
 set_tests_properties(test_distributed_fused_lamb_op_with_gradient_merge
-                     PROPERTIES TIMEOUT 120)
+                     PROPERTIES TIMEOUT 240)
 set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120)
 set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300)
diff --git a/test/legacy_test/distributed_fused_lamb_test_base.py b/test/legacy_test/distributed_fused_lamb_test_base.py
index baffc7dd5e546..ea011becc9090 100644
--- a/test/legacy_test/distributed_fused_lamb_test_base.py
+++ b/test/legacy_test/distributed_fused_lamb_test_base.py
@@ -270,7 +270,10 @@ def setUpClass(cls):
         paddle.enable_static()
         paddle.set_flags({'FLAGS_cudnn_deterministic': True})
         _clip_by_global_norm_using_mp_type(True)
-        fleet.init(role_maker=get_role_maker())
+        if os.environ.get("FLAGS_dynamic_static_unified_comm") == "1":
+            paddle.distributed.collective._init_parallel_env("nccl")
+        else:
+            fleet.init(role_maker=get_role_maker())
 
     def config(self):
         clip_after_allreduce = bool(
diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py
index 671e11e7702fe..32ee6fd8b3958 100644
--- a/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py
+++ b/test/legacy_test/test_distributed_fused_lamb_op_with_clip.py
@@ -41,6 +41,7 @@ def run_test(
     max_global_norm=-1.0,
     gradient_merge_steps=1,
     use_master_acc_grad=True,
+    need_env={},
 ):
     temp_dir = tempfile.TemporaryDirectory()
     if not paddle.is_compiled_with_cuda():
@@ -54,6 +55,8 @@ def run_test(
         '-u',
         '-m',
         'paddle.distributed.launch',
+        '--devices',
+        '0,1',
         '--log_dir',
         log_dir,
         get_test_file(),
@@ -65,6 +68,7 @@ def run_test(
     os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm)
     os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps)
     os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0)
+    os.environ.update(need_env)
 
     touch_file_env = 'SUCCESS_TOUCH_FILE'
     touch_file_name = os.path.join(
@@ -87,6 +91,20 @@ def test_1(self):
     def test_2(self):
         run_test(clip_after_allreduce=False, max_global_norm=0.01)
 
+    def test_1_new_comm(self):
+        run_test(
+            clip_after_allreduce=True,
+            max_global_norm=0.01,
+            need_env={"FLAGS_dynamic_static_unified_comm": "1"},
+        )
+
+    def test_2_new_comm(self):
+        run_test(
+            clip_after_allreduce=False,
+            max_global_norm=0.01,
+            need_env={"FLAGS_dynamic_static_unified_comm": "1"},
+        )
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py b/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py
index 0c7096f5dae1a..f236be3a8d150 100644
--- a/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py
+++ b/test/legacy_test/test_distributed_fused_lamb_op_with_gradient_merge.py
@@ -33,6 +33,23 @@ def test_gm_with_fp16_acc_grad(self):
             use_master_acc_grad=False,
         )
 
+    def test_gm_new_comm(self):
+        run_test(
+            clip_after_allreduce=True,
+            max_global_norm=-1.0,
+            gradient_merge_steps=2,
+            need_env={"FLAGS_dynamic_static_unified_comm": "1"},
+        )
+
+    def test_gm_with_fp16_acc_grad_new_comm(self):
+        run_test(
+            clip_after_allreduce=True,
+            max_global_norm=-1.0,
+            gradient_merge_steps=2,
+            use_master_acc_grad=False,
+            need_env={"FLAGS_dynamic_static_unified_comm": "1"},
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From 892dee35a525f1c752f2cbeff1a72df38b569155 Mon Sep 17 00:00:00 2001
From: iLeGend <824040212@qq.com>
Date: Thu, 21 Sep 2023 10:56:43 +0800
Subject: [PATCH 017/115] [NewComm] No.2 compatiable upgrade for partial_recv
 op (#57548)

* [NewComm] No.2 compatiable upgrade for partial_recv op

* fix

* add header

* fix typo
---
 .../collective/partial_recv_op.cu.cc          | 91 +++++++++++++++----
 1 file changed, 74 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc
index 0c33ca7c25c32..2a6aea1c7a13a 100644
--- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc
+++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc
@@ -18,15 +18,21 @@ limitations under the License. */
 #include "paddle/fluid/distributed/collective/process_group.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+#include "paddle/phi/core/flags.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 
+#include "paddle/fluid/distributed/collective/utils.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
+
 namespace paddle {
 namespace operators {
 
 template <typename T, typename DeviceContext>
 class PartialRecvOpCUDAKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
 #if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \
     NCCL_VERSION_CODE >= 2703
     auto out = ctx.Output<phi::DenseTensor>("Out");
@@ -74,35 +80,86 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel<T> {
     auto map = distributed::ProcessGroupMapFromGid::getInstance();
     if (map->has(rid)) {
       // Use ProcessGroup
-      distributed::ProcessGroup *pg = map->get(rid);
+      distributed::ProcessGroup* pg = map->get(rid);
       auto task = pg->Recv(out, peer, offset, recv_numel, /*sync_op*/ true);
       task->Wait();
     } else {
       gpuStream_t stream = nullptr;
-      auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+      platform::NCCLComm* comm = nullptr;
+      phi::distributed::NCCLCommContext* comm_ctx = nullptr;
+
+      int nranks = 0;
+      int rank = 0;
+
+      const auto& comm_context_manager =
+          phi::distributed::CommContextManager::GetInstance();
+
+      if (FLAGS_dynamic_static_unified_comm) {
+        // Use New Communication Library
+        PADDLE_ENFORCE_EQ(
+            comm_context_manager.Has(std::to_string(rid)),
+            true,
+            platform::errors::InvalidArgument(
+                "You choose to use new communication library by "
+                "setting environment "
+                "variable FLAGS_dynamic_static_unified_comm True. "
+                "But ring_id(%d) is "
+                "not found in comm_context_manager.",
+                std::to_string(rid)));
+        comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
+            comm_context_manager.Get(std::to_string(rid)));
+        PADDLE_ENFORCE_NE(
+            comm_ctx,
+            nullptr,
+            platform::errors::Unavailable(
+                "NCCLCommContext is nullptr, collective op should "
+                "has ring_id attr."));
+
+        stream = comm_ctx->GetStream();
+        nranks = comm_ctx->GetSize();
+        rank = comm_ctx->GetRank();
+
+        VLOG(3) << "new comm_context_manager has ring_id " << rid;
+      } else {
+        comm = platform::NCCLCommContext::Instance().Get(rid, place);
+
+        stream = comm->stream();
+        nranks = comm->nranks();
+        rank = comm->rank();
+
+        VLOG(3) << "old NCCLCommContext has ring_id" << rid;
+      }
+
       if (ctx.Attr<bool>("use_calc_stream")) {
         // should ExecutionContext for calc stream.
         stream = ctx.cuda_device_context().stream();
-      } else {
-        stream = comm->stream();
       }
+
       PADDLE_ENFORCE_LT(peer,
-                        comm->nranks(),
+                        nranks,
                         platform::errors::InvalidArgument(
                             "The value of peer (%d) you set must "
-                            "be less than comm->nranks (%d).",
+                            "be less than nranks (%d).",
                             peer,
-                            comm->nranks()));
+                            nranks));
+
       ncclDataType_t dtype = platform::ToNCCLDataType(type);
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          platform::dynload::ncclRecv(out->data<T>() + offset,
-                                      recv_numel,
-                                      dtype,
-                                      peer,
-                                      comm->comm(),
-                                      stream));
-      VLOG(3) << "rank " << comm->rank() << " recv " << recv_numel
-              << " from offset[" << offset << "] from " << peer;
+
+      if (comm_ctx) {
+        auto recv_buf = distributed::GetPartialTensor(*out, offset, recv_numel);
+
+        comm_ctx->Recv(&recv_buf, recv_numel, peer, stream);
+      } else {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::ncclRecv(out->data<T>() + offset,
+                                        recv_numel,
+                                        dtype,
+                                        peer,
+                                        comm->comm(),
+                                        stream));
+      }
+      VLOG(3) << "rank " << rank << " recv " << recv_numel << " from offset["
+              << offset << "] from " << peer;
     }
 #else
     PADDLE_THROW(platform::errors::Unavailable(

From 431a791a2c7626dcc669efba9bd77a880c625123 Mon Sep 17 00:00:00 2001
From: xuxinyi389 <104957571+xuxinyi389@users.noreply.github.com>
Date: Thu, 21 Sep 2023 10:56:53 +0800
Subject: [PATCH 018/115] Enhanced tuple support I (#57469)

* bugs_fix:tuple_support

* bug_fixes

* bug_fixes

* bug_fixes

* bug_fixes

* bug_fixes

* bug_fixes
---
 python/paddle/nn/functional/common.py | 30 +++++----
 python/paddle/nn/layer/common.py      |  8 +--
 python/paddle/vision/ops.py           | 26 +++++---
 test/legacy_test/test_box_coder_op.py | 92 +++++++++++++++++++++------
 test/legacy_test/test_min_op.py       |  9 +++
 test/legacy_test/test_unfold_op.py    | 11 ++++
 6 files changed, 133 insertions(+), 43 deletions(-)

diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 5ef8e40d921b6..9b1da0dd36802 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -69,19 +69,19 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
     Parameters:
         x(Tensor):              4-D Tensor, input tensor of format [N, C, H, W],
                                   data type can be float32 or float64
-        kernel_sizes(int|list):   The size of convolution kernel, should be [k_h, k_w]
+        kernel_sizes(int|list|tuple):   The size of convolution kernel, should be [k_h, k_w]
                                   or an integer k treated as [k, k].
-        strides(int|list, optional):        The strides, should be [stride_h, stride_w]
+        strides(int|list|tuple, optional):        The strides, should be [stride_h, stride_w]
                                   or an integer stride treated as [sride, stride].
                                   For default, strides will be [1, 1].
-        paddings(int|list, optional):       The paddings of each dimension, should be
+        paddings(int|list|tuple, optional):       The paddings of each dimension, should be
                                   [padding_top, padding_left, padding_bottom, padding_right]
                                   or [padding_h, padding_w] or an integer padding.
                                   If [padding_h, padding_w] was given, it will expanded to
                                   [padding_h, padding_w, padding_h, padding_w]. If an integer
                                   padding was given, [padding, padding, padding, padding] will
                                   be used. For default, paddings will be [0, 0, 0, 0]
-        dilations(int|list, optional):      the dilations of convolution kernel, should be
+        dilations(int|list|tuple, optional):      the dilations of convolution kernel, should be
                                   [dilation_h, dilation_w], or an integer dilation treated as
                                   [dilation, dilation]. For default, it will be [1, 1].
         name(str, optional): The default value is None.
@@ -116,38 +116,42 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None):
     if isinstance(kernel_sizes, int):
         kernel_sizes = [kernel_sizes, kernel_sizes]
     else:
-        assert isinstance(kernel_sizes, list) and (
+        assert isinstance(kernel_sizes, (list, tuple)) and (
             len(kernel_sizes) == 2
-        ), "kernel_sizes should either be an integer or a list of two integers"
+        ), "kernel_sizes should either be an integer or a list/tuple of two integers"
+        kernel_sizes = list(kernel_sizes)
 
     if isinstance(strides, int):
         strides = [strides, strides]
     else:
-        assert isinstance(strides, list) and (
+        assert isinstance(strides, (list, tuple)) and (
             len(strides) == 2
-        ), "strides should either be an integer or a list of two integers"
+        ), "strides should either be an integer or a list/tuple of two integers"
+        strides = list(strides)
 
     if isinstance(dilations, int):
         dilations = [dilations, dilations]
     else:
-        assert isinstance(dilations, list) and (
+        assert isinstance(dilations, (list, tuple)) and (
             len(dilations) == 2
-        ), "dilations should either be an integer or a list of two integers"
+        ), "dilations should either be an integer or a list/tuple of two integers"
+        dilations = list(dilations)
 
     if isinstance(paddings, int):
         paddings = [paddings] * 4
-    elif isinstance(paddings, list):
+    elif isinstance(paddings, (list, tuple)):
+        paddings = list(paddings)
         if len(paddings) == 2:
             paddings = paddings * 2
         elif len(paddings) == 4:
             pass
         else:
             raise ValueError(
-                "paddings should either be an integer or a list of 2 or 4 integers"
+                "paddings should either be an integer or a list/tuple of 2 or 4 integers"
             )
     else:
         raise ValueError(
-            "Unexpected type of paddings, it should be either an integer or a list"
+            "Unexpected type of paddings, it should be either an integer or a list/tuple"
             "of 2 or 4 integers"
         )
 
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index db11591db5fe7..0c55895d21253 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -1551,17 +1551,17 @@ class Unfold(Layer):
 
 
     Parameters:
-        kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w]
+        kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w]
             or an integer k treated as [k, k].
-        strides(int|list, optional): The strides, should be [stride_h, stride_w]
+        strides(int|list|tuple, optional): The strides, should be [stride_h, stride_w]
             or an integer stride treated as [sride, stride]. For default, strides will be [1, 1].
-        paddings(int|list, optional): The paddings of each dimension, should be
+        paddings(int|list|tuple, optional): The paddings of each dimension, should be
             [padding_top, padding_left, padding_bottom, padding_right] or [padding_h, padding_w]
             or an integer padding. If [padding_h, padding_w] was given, it will expanded to
             [padding_h, padding_w, padding_h, padding_w]. If an integer padding was given,
             [padding, padding, padding, padding] will be used. For default,
             paddings will be [0, 0, 0, 0].
-        dilations(int|list, optional): The dilations of convolution kernel, should be
+        dilations(int|list|tuple, optional): The dilations of convolution kernel, should be
             [dilation_h, dilation_w], or an integer dilation treated as [dilation, dilation].
             For default, it will be [1, 1].
         name(str, optional): The default value is None. Normally there is no need for user to
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index 677fd7602bcfa..d38f81a57ede9 100755
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -616,10 +616,10 @@ def box_coder(
             left top coordinate of the anchor box, if the input is image feature
             map, they are close to the origin of the coordinate system.
             [xmax, ymax] is the right bottom coordinate of the anchor box.
-        prior_box_var (List|Tensor|None): prior_box_var supports three types
+        prior_box_var (Tensor|List|tuple|None): prior_box_var supports four types
             of input. One is Tensor with shape [M, 4] which holds M group and
-            data type is float32 or float64. The second is list consist of
-            4 elements shared by all boxes and data type is float32 or float64.
+            data type is float32 or float64. The second is list or tuple consist
+            of 4 elements shared by all boxes and data type is float32 or float64.
             Other is None and not involved in calculation.
         target_box (Tensor): This input can be a 2-D LoDTensor with shape
             [N, 4] when code_type is 'encode_center_size'. This input also can
@@ -685,7 +685,11 @@ def box_coder(
                 axis,
                 [],
             )
-        elif isinstance(prior_box_var, list):
+        elif isinstance(prior_box_var, (list, tuple)):
+            prior_box_var = list(prior_box_var)
+            assert (
+                len(prior_box_var) == 4
+            ), "Input prior_box_var must be Variable or list|tuple with 4 elements."
             output_box = _C_ops.box_coder(
                 prior_box,
                 None,
@@ -696,9 +700,10 @@ def box_coder(
                 prior_box_var,
             )
         else:
-            raise TypeError("Input prior_box_var must be Variable or list")
+            raise TypeError(
+                "Input prior_box_var must be Variable or list|tuple"
+            )
         return output_box
-
     else:
         check_variable_and_dtype(
             prior_box, 'prior_box', ['float32', 'float64'], 'box_coder'
@@ -720,10 +725,15 @@ def box_coder(
         }
         if isinstance(prior_box_var, Variable):
             inputs['PriorBoxVar'] = prior_box_var
-        elif isinstance(prior_box_var, list):
+        elif isinstance(prior_box_var, (list, tuple)):
             attrs['variance'] = prior_box_var
+            assert (
+                len(attrs['variance']) == 4
+            ), "Input prior_box_var must be Variable or list|tuple with 4 elements."
         else:
-            raise TypeError("Input prior_box_var must be Variable or list")
+            raise TypeError(
+                "Input prior_box_var must be Variable or list|tuple"
+            )
         helper.append_op(
             type="box_coder",
             inputs=inputs,
diff --git a/test/legacy_test/test_box_coder_op.py b/test/legacy_test/test_box_coder_op.py
index 7221fb2ba73f6..72ef401aa5fb7 100644
--- a/test/legacy_test/test_box_coder_op.py
+++ b/test/legacy_test/test_box_coder_op.py
@@ -372,27 +372,30 @@ def setUp(self):
 
     def test_dygraph_with_static(self):
         paddle.enable_static()
-        prior_box = paddle.static.data(
-            name='prior_box', shape=[80, 4], dtype='float32'
-        )
-        prior_box_var = paddle.static.data(
-            name='prior_box_var', shape=[80, 4], dtype='float32'
-        )
-        target_box = paddle.static.data(
-            name='target_box', shape=[20, 80, 4], dtype='float32'
-        )
+        exe = paddle.static.Executor()
+        main = paddle.static.Program()
+        startup = paddle.static.Program()
+        with paddle.static.program_guard(main, startup):
+            prior_box = paddle.static.data(
+                name='prior_box', shape=[80, 4], dtype='float32'
+            )
+            prior_box_var = paddle.static.data(
+                name='prior_box_var', shape=[80, 4], dtype='float32'
+            )
+            target_box = paddle.static.data(
+                name='target_box', shape=[20, 80, 4], dtype='float32'
+            )
 
-        boxes = paddle.vision.ops.box_coder(
-            prior_box=prior_box,
-            prior_box_var=prior_box_var,
-            target_box=target_box,
-            code_type="decode_center_size",
-            box_normalized=False,
-        )
+            boxes = paddle.vision.ops.box_coder(
+                prior_box=prior_box,
+                prior_box_var=prior_box_var,
+                target_box=target_box,
+                code_type="decode_center_size",
+                box_normalized=False,
+            )
 
-        exe = paddle.static.Executor()
         boxes_np = exe.run(
-            paddle.static.default_main_program(),
+            main,
             feed={
                 'prior_box': self.prior_box_np,
                 'prior_box_var': self.prior_box_var_np,
@@ -419,6 +422,59 @@ def test_dygraph_with_static(self):
         paddle.enable_static()
 
 
+class TestBoxCoderSupporttuple(unittest.TestCase):
+    def setUp(self):
+        np.random.seed(678)
+        self.prior_box_np = np.random.random((80, 4)).astype('float32')
+        self.target_box_np = np.random.random((20, 80, 4)).astype('float32')
+
+    def test_support_tuple(self):
+        paddle.enable_static()
+        exe = paddle.static.Executor()
+        main = paddle.static.Program()
+        startup = paddle.static.Program()
+        with paddle.static.program_guard(main, startup):
+            prior_box = paddle.static.data(
+                name='prior_box', shape=[80, 4], dtype='float32'
+            )
+            target_box = paddle.static.data(
+                name='target_box', shape=[20, 80, 4], dtype='float32'
+            )
+
+            boxes = paddle.vision.ops.box_coder(
+                prior_box=prior_box,
+                prior_box_var=(1, 2, 3, 4),
+                target_box=target_box,
+                code_type="decode_center_size",
+                box_normalized=False,
+            )
+
+        boxes_np = exe.run(
+            main,
+            feed={
+                'prior_box': self.prior_box_np,
+                'target_box': self.target_box_np,
+            },
+            fetch_list=[boxes],
+        )[0]
+
+        paddle.disable_static()
+        prior_box_dy = paddle.to_tensor(self.prior_box_np)
+        target_box_dy = paddle.to_tensor(self.target_box_np)
+
+        boxes_dy = paddle.vision.ops.box_coder(
+            prior_box=prior_box_dy,
+            prior_box_var=(1, 2, 3, 4),
+            target_box=target_box_dy,
+            code_type="decode_center_size",
+            box_normalized=False,
+        )
+        boxes_dy_np = boxes_dy.numpy()
+
+        np.testing.assert_allclose(boxes_np, boxes_dy_np)
+        paddle.enable_static()
+
+
 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()
diff --git a/test/legacy_test/test_min_op.py b/test/legacy_test/test_min_op.py
index 7de7108d7d1ad..e24471b20dca8 100644
--- a/test/legacy_test/test_min_op.py
+++ b/test/legacy_test/test_min_op.py
@@ -83,6 +83,15 @@ def test_imperative_api(self):
         z_expected = np.array(np.min(np_x, axis=0))
         self.assertEqual((np_z == z_expected).all(), True)
 
+    def test_support_tuple(self):
+        paddle.disable_static()
+        np_x = np.array([10, 10]).astype('float64')
+        x = paddle.to_tensor(np_x)
+        z = paddle.min(x, axis=(0,))
+        np_z = z.numpy()
+        z_expected = np.array(np.min(np_x, axis=0))
+        self.assertEqual((np_z == z_expected).all(), True)
+
 
 class TestOutDtype(unittest.TestCase):
     def test_min(self):
diff --git a/test/legacy_test/test_unfold_op.py b/test/legacy_test/test_unfold_op.py
index 8a7f2aaf199f3..ef8174256e5cb 100644
--- a/test/legacy_test/test_unfold_op.py
+++ b/test/legacy_test/test_unfold_op.py
@@ -144,6 +144,17 @@ def test_check_output(self):
     def test_check_grad(self):
         self.check_grad(['X'], 'Y')
 
+    def test_support_tuple(self):
+        paddle.disable_static()
+        x = paddle.randn((10, 3, 64, 64))
+        paddle.nn.functional.unfold(x, 3, (1, 1), 1, 1)
+        paddle.nn.functional.unfold(x, 3, 1, (1, 1), 1)
+        paddle.nn.functional.unfold(x, 3, 1, 1, (1, 1))
+        out1 = paddle.nn.functional.unfold(x, 3, (1, 1), (1, 1), (1, 1))
+        out2 = paddle.nn.functional.unfold(x, (3, 3), (1, 1), (1, 1), (1, 1))
+        self.assertTrue(np.allclose(out1.numpy(), out2.numpy()))
+        paddle.enable_static()
+
 
 class TestUnfoldFP16Op(TestUnfoldOp):
     def init_dtype(self):

From 20893b0b10df7602c597fcfc920eaec015701860 Mon Sep 17 00:00:00 2001
From: Ligoml <39876205+Ligoml@users.noreply.github.com>
Date: Thu, 21 Sep 2023 10:57:05 +0800
Subject: [PATCH 019/115] Update CI api_docs_approval (#57542)

* Don't Merge

* make conflict

* reset

* updata check_api_approvals.sh
---
 tools/check_api_approvals.sh | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 3989a0cceff1b..5f05b3cf6f080 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -43,22 +43,18 @@ api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/flu
 if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then
     echo_line="You must have one RD (XiaoguangHu01, jeff41404, lanxianghit or qingqing01) approval for API change.\n"
     echo_line="${echo_line} and one TPM approval for API change: \n"
-    echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general APIs.\n"
-    echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related APIs.\n"
-    echo_line="${echo_line} leiqing1/LeiQing for inference related APIs.\n"
+    echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, Ligoml/LiMengLiu for general APIs.\n"
 
     check_approval 1 XiaoguangHu01 jeff41404 lanxianghit qingqing01
-    check_approval 1 jzhang533 sunzhongkai588 dingjiaweiww Ligoml liuTINA0907 leiqing1
+    check_approval 1 jzhang533 sunzhongkai588 Ligoml
 fi
 
 api_doc_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.doc  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.doc`
 if [ "$api_doc_spec_diff" != "" ]; then
     echo_line="You must have  one TPM approval for API documents change: \n"
-    echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general API docs.\n"
-    echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related API docs.\n"
-    echo_line="${echo_line} leiqing1/LeiQing for inference related API docs.\n"
+    echo_line="${echo_line} jzhang533/ZhangJun, sunzhongkai588/SunZhongKai, Ligoml/LiMengLiu for general API docs.\n"
 
-    check_approval 1 jzhang533 sunzhongkai588 dingjiaweiww Ligoml liuTINA0907 leiqing1
+    check_approval 1 jzhang533 sunzhongkai588 Ligoml
 fi
 
 api_yaml_diff=`python ${PADDLE_ROOT}/tools/check_api_yaml_same.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec ${BRANCH} ${PADDLE_ROOT}`

From 69fa09a223fbcbd668099d425655f141dc5c1883 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= <Xs1580802568@gmail.com>
Date: Thu, 21 Sep 2023 10:59:19 +0800
Subject: [PATCH 020/115] add API for ir_compare and move it from namespace
 optim to ir_utils (#57531)

---
 .../auto_schedule/search_space/search_state.cc |  7 +++----
 .../auto_schedule/search_space/search_state.h  |  4 ++--
 paddle/cinn/ir/test/ir_compare_test.cc         | 18 +++++++-----------
 paddle/cinn/ir/utils/ir_compare.cc             |  8 ++++++++
 paddle/cinn/ir/utils/ir_compare.h              |  6 ++++++
 paddle/cinn/ir/utils/ir_visitor.cc             |  3 +--
 6 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/paddle/cinn/auto_schedule/search_space/search_state.cc b/paddle/cinn/auto_schedule/search_space/search_state.cc
index 96ace0f505d7f..c16bf62840291 100644
--- a/paddle/cinn/auto_schedule/search_space/search_state.cc
+++ b/paddle/cinn/auto_schedule/search_space/search_state.cc
@@ -133,11 +133,10 @@ bool SearchStateEqual::operator()(const SearchState& lhs,
   // compare exprs size firstly
   if (lhs_exprs.size() != rhs_exprs.size()) return false;
 
-  // compare every expr one by one with ir::IrEqualVisitor
+  // compare every expr one by one with ir::ir_utils::IrEqualVisitor
   for (int i = 0; i < lhs_exprs.size(); ++i) {
-    ir::IrEqualVisitor compartor(
-        /*allow_name_suffix_diff=*/true);  // ignore suffix difference in name
-    if (!compartor.Compare(lhs_exprs[i], rhs_exprs[i])) return false;
+    if (!ir::ir_utils::IRCompare(lhs_exprs[i], rhs_exprs[i], true))
+      return false;
   }
   return true;
 }
diff --git a/paddle/cinn/auto_schedule/search_space/search_state.h b/paddle/cinn/auto_schedule/search_space/search_state.h
index 7991fb9540188..b3f45c5cd746c 100644
--- a/paddle/cinn/auto_schedule/search_space/search_state.h
+++ b/paddle/cinn/auto_schedule/search_space/search_state.h
@@ -70,8 +70,8 @@ struct SearchStateHash {
   size_t operator()(const SearchState& s) const;
 };
 
-// SearchStateHash equal functor, use ir::IrEqualVisitor to compare their AST
-// struct and fields
+// SearchStateHash equal functor, use ir::ir_utils::IrEqualVisitor to compare
+// their AST struct and fields
 struct SearchStateEqual {
   bool operator()(const SearchState& lhs, const SearchState& rhs) const;
 };
diff --git a/paddle/cinn/ir/test/ir_compare_test.cc b/paddle/cinn/ir/test/ir_compare_test.cc
index a1bca0cd5373f..cc9ce438221a2 100644
--- a/paddle/cinn/ir/test/ir_compare_test.cc
+++ b/paddle/cinn/ir/test/ir_compare_test.cc
@@ -23,7 +23,7 @@
 
 namespace cinn {
 namespace ir {
-
+namespace ir_utils {
 TEST(TestIrCompare, SingleFunction) {
   Target target = common::DefaultHostTarget();
 
@@ -128,20 +128,16 @@ TEST(TestIrCompare, SingleFunction) {
   ASSERT_EQ(func2_str, utils::GetStreamCnt(funcs_2.front()));
   ASSERT_EQ(func3_str, utils::GetStreamCnt(funcs_3.front()));
 
-  IrEqualVisitor compartor;
   // they are different at the name of root ScheduleBlock
-  ASSERT_TRUE(compartor.Compare(funcs_1.front(), funcs_2.front()));
+  ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_2.front()));
   // compare with itself
-  ASSERT_TRUE(compartor.Compare(funcs_1.front(), funcs_1.front()));
-  IrEqualVisitor compartor_allow_suffix_diff(true);
+  ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_1.front()));
   // they are euqal if allowing suffix of name different
-  ASSERT_TRUE(
-      compartor_allow_suffix_diff.Compare(funcs_1.front(), funcs_2.front()));
+  ASSERT_TRUE(IRCompare(funcs_1.front(), funcs_2.front(), true));
 
-  ASSERT_FALSE(compartor.Compare(funcs_1.front(), funcs_3.front()));
-  ASSERT_FALSE(
-      compartor_allow_suffix_diff.Compare(funcs_1.front(), funcs_3.front()));
+  ASSERT_FALSE(IRCompare(funcs_1.front(), funcs_3.front()));
+  ASSERT_FALSE(IRCompare(funcs_1.front(), funcs_3.front(), true));
 }
-
+}  // namespace ir_utils
 }  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_compare.cc b/paddle/cinn/ir/utils/ir_compare.cc
index c303262d04fbd..87324be608048 100644
--- a/paddle/cinn/ir/utils/ir_compare.cc
+++ b/paddle/cinn/ir/utils/ir_compare.cc
@@ -22,6 +22,8 @@
 namespace cinn {
 namespace ir {
 
+namespace ir_utils {
+
 bool IrEqualVisitor::Compare(const Expr& lhs, const Expr& rhs) {
   if (lhs.get() == rhs.get()) {  // the same object, including both are null
     return true;
@@ -358,5 +360,11 @@ bool IrEqualVisitor::Visit(const ScheduleBlockRealize* lhs, const Expr* other) {
          Compare(lhs->schedule_block, rhs->schedule_block);
 }
 
+bool IRCompare(const Expr& lhs, const Expr& rhs, bool allow_name_suffix_diff) {
+  IrEqualVisitor ir_equal_visitor(allow_name_suffix_diff);
+  return ir_equal_visitor.Compare(lhs, rhs);
+}
+
+}  // namespace ir_utils
 }  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_compare.h b/paddle/cinn/ir/utils/ir_compare.h
index 9e4b335857b98..d41e6db0441a7 100644
--- a/paddle/cinn/ir/utils/ir_compare.h
+++ b/paddle/cinn/ir/utils/ir_compare.h
@@ -20,6 +20,7 @@
 
 namespace cinn {
 namespace ir {
+namespace ir_utils {
 
 // Determine whether two ir AST trees are euqal by comparing their struct and
 // fields of each node through dfs visitor
@@ -47,5 +48,10 @@ class IrEqualVisitor : public IRVisitorRequireReImpl<bool, const Expr*> {
   bool allow_name_suffix_diff_ = false;
 };
 
+bool IRCompare(const Expr& lhs,
+               const Expr& rhs,
+               bool allow_name_suffix_diff = false);
+
+}  // namespace ir_utils
 }  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_visitor.cc b/paddle/cinn/ir/utils/ir_visitor.cc
index 9ef6a78df1fcd..f55259be2c641 100644
--- a/paddle/cinn/ir/utils/ir_visitor.cc
+++ b/paddle/cinn/ir/utils/ir_visitor.cc
@@ -23,8 +23,7 @@ namespace ir {
 
 bool operator==(Expr a, Expr b) {
   if (a.get() == b.get()) return true;
-  IrEqualVisitor cmp;
-  return cmp.Compare(a, b);
+  return ir_utils::IRCompare(a, b);
 }
 
 bool operator!=(Expr a, Expr b) { return !(a == b); }

From b718b1be52e67f72974de7db42fc0fecf070ac18 Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Thu, 21 Sep 2023 11:00:31 +0800
Subject: [PATCH 021/115] pir support pixel unshuffle op (#57521)

---
 paddle/fluid/operators/pixel_unshuffle_op.cc | 105 -------------------
 paddle/phi/api/yaml/backward.yaml            |   9 ++
 paddle/phi/api/yaml/op_compat.yaml           |   7 ++
 paddle/phi/api/yaml/ops.yaml                 |   9 ++
 paddle/phi/ops/compat/pixel_unshuffle_sig.cc |  30 ------
 test/legacy_test/test_pixel_unshuffle.py     |   4 +-
 6 files changed, 27 insertions(+), 137 deletions(-)
 delete mode 100644 paddle/fluid/operators/pixel_unshuffle_op.cc
 delete mode 100644 paddle/phi/ops/compat/pixel_unshuffle_sig.cc

diff --git a/paddle/fluid/operators/pixel_unshuffle_op.cc b/paddle/fluid/operators/pixel_unshuffle_op.cc
deleted file mode 100644
index 52b7452d7a8cc..0000000000000
--- a/paddle/fluid/operators/pixel_unshuffle_op.cc
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/backward.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace operators {
-
-class PixelUnshuffleOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-};
-
-class PixelUnshuffleOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>), "
-             "the input feature data of PixelUnshuffleOp, the layout is "
-             "[N, C, H, W] or [N, H, W, C].");
-    AddOutput("Out",
-              "(Tensor, default Tensor<float>), the output of "
-              "PixelUnshuffleOp. The layout is [N, C*factor^2, H/factor, "
-              "W/factor] or [N, H/factor, W/factor, C*factor^2].");
-    AddAttr<int>("downscale_factor",
-                 "the factor to decrease spatial resolution by.")
-        .SetDefault(1);
-    AddAttr<std::string>(
-        "data_format",
-        "An optional string from: \"NHWC\", \"NCHW\". "
-        "Defaults to \"NHWC\", Specify the data format of the input data.")
-        .SetDefault("NCHW");
-
-    AddComment(R"DOC(
-    Pixel Unshuffle operator
-    This operator rearranges elements in a tensor of shape :math:`(*, C, H, W)`
-    to a tensor of shape :math:`(*, C\times r^2, H / r, W / r)`.
-
-    This operation is the reversion of PixelShuffle operation.
-
-    Please refer to the paper:
-    `Real-Time Single Image and Video Super-Resolution Using an Efficient
-    Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_
-    by Shi et. al (2016) for more details.
-    )DOC");
-  }
-};
-
-template <typename T>
-class PixelUnshuffleGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("pixel_unshuffle_grad");
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetAttrMap(this->Attrs());
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-  }
-};
-
-class PixelUnshuffleGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(pixel_unshuffle,
-                            PixelUnshuffleInferShapeFunctor,
-                            PD_INFER_META(phi::PixelUnshuffleInferMeta));
-
-REGISTER_OPERATOR(pixel_unshuffle,
-                  ops::PixelUnshuffleOp,
-                  ops::PixelUnshuffleOpMaker,
-                  ops::PixelUnshuffleGradOpMaker<paddle::framework::OpDesc>,
-                  ops::PixelUnshuffleGradOpMaker<paddle::imperative::OpBase>,
-                  PixelUnshuffleInferShapeFunctor);
-
-DECLARE_INFER_SHAPE_FUNCTOR(pixel_unshuffle_grad,
-                            PixelUnshuffleGradInferShapeFunctor,
-                            PD_INFER_META(phi::PixelUnshuffleGradInferMeta));
-
-REGISTER_OPERATOR(pixel_unshuffle_grad,
-                  ops::PixelUnshuffleGradOp,
-                  PixelUnshuffleGradInferShapeFunctor);
diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml
index b6eeb5e07005c..2f48bb80478e6 100644
--- a/paddle/phi/api/yaml/backward.yaml
+++ b/paddle/phi/api/yaml/backward.yaml
@@ -1646,6 +1646,15 @@
   kernel :
     func : pixel_shuffle_grad
 
+- backward_op : pixel_unshuffle_grad
+  forward : pixel_unshuffle (Tensor x, int downscale_factor=1, str data_format="NCHW") -> Tensor(out)
+  args : (Tensor out_grad, int downscale_factor, str data_format)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : PixelUnshuffleGradInferMeta
+  kernel :
+    func : pixel_unshuffle_grad
+
 - backward_op : poisson_grad
   forward : poisson (Tensor x) -> Tensor(out)
   args : (Tensor out_grad)
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 31125b8df0ce7..8a85147a66da0 100755
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -2103,6 +2103,13 @@
   outputs :
     out : Out
 
+- op : pixel_unshuffle
+  backward : pixel_unshuffle_grad
+  inputs :
+    x : X
+  outputs :
+    out : Out
+
 - op : poisson
   inputs :
     x : X
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index 4e67144ba8a89..c93f94c2b3320 100644
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -1934,6 +1934,15 @@
     func : pixel_shuffle
   backward : pixel_shuffle_grad
 
+- op : pixel_unshuffle
+  args : (Tensor x, int downscale_factor=1, str data_format="NCHW")
+  output : Tensor
+  infer_meta :
+    func : PixelUnshuffleInferMeta
+  kernel :
+    func : pixel_unshuffle
+  backward : pixel_unshuffle_grad
+
 - op : poisson
   args : (Tensor x)
   output : Tensor
diff --git a/paddle/phi/ops/compat/pixel_unshuffle_sig.cc b/paddle/phi/ops/compat/pixel_unshuffle_sig.cc
deleted file mode 100644
index 6c983c1e24c28..0000000000000
--- a/paddle/phi/ops/compat/pixel_unshuffle_sig.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature PixelUnshuffleGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx UNUSED) {
-  return KernelSignature("pixel_unshuffle_grad",
-                         {"Out@GRAD"},
-                         {"downscale_factor", "data_format"},
-                         {"X@GRAD"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(pixel_unshuffle_grad,
-                           phi::PixelUnshuffleGradOpArgumentMapping);
diff --git a/test/legacy_test/test_pixel_unshuffle.py b/test/legacy_test/test_pixel_unshuffle.py
index ec6ce803d1277..eb2c287b3f886 100644
--- a/test/legacy_test/test_pixel_unshuffle.py
+++ b/test/legacy_test/test_pixel_unshuffle.py
@@ -69,8 +69,8 @@ def pixel_unshuffle_np(x, down_factor, data_format="NCHW"):
 
 
 def pixel_unshuffle_wrapper(x, downscale_factor, data_format):
-    return paddle._legacy_C_ops.pixel_unshuffle(
-        x, "downscale_factor", downscale_factor, "data_format", data_format
+    return paddle.nn.functional.pixel_unshuffle(
+        x, downscale_factor, data_format
     )
 
 

From c4dd10935231f0cf4253225e912a195435dd2d2b Mon Sep 17 00:00:00 2001
From: Zhenghai Zhang <65210872+ccsuzzh@users.noreply.github.com>
Date: Thu, 21 Sep 2023 11:07:18 +0800
Subject: [PATCH 022/115] [CodeStyle][task 39] enable isort in
 `python/paddle/base` (part1) (#57413)

* enable isort rule in python/paddle/base

* fix bug

* fix bug

* fix bug
---
 pyproject.toml                                |  3 +-
 python/paddle/base/backward.py                | 19 ++++----
 python/paddle/base/compiler.py                | 19 ++++----
 python/paddle/base/data_feed_desc.py          |  3 +-
 python/paddle/base/data_feeder.py             | 13 +++---
 python/paddle/base/dataset.py                 |  6 ++-
 python/paddle/base/default_scope_funcs.py     |  3 +-
 python/paddle/base/dygraph/base.py            | 16 ++++---
 python/paddle/base/dygraph/math_op_patch.py   | 10 ++---
 .../base/dygraph/tensor_patch_methods.py      | 34 +++++++-------
 python/paddle/base/dygraph/tracer.py          |  3 +-
 python/paddle/base/dygraph_utils.py           |  3 +-
 python/paddle/base/executor.py                | 33 ++++++--------
 .../incubate/checkpoint/auto_checkpoint.py    | 11 ++---
 python/paddle/base/initializer.py             |  3 +-
 python/paddle/base/io.py                      |  1 +
 python/paddle/base/layer_helper.py            | 11 ++---
 python/paddle/base/layer_helper_base.py       |  9 ++--
 python/paddle/base/layers/io.py               |  6 +--
 .../base/layers/layer_function_generator.py   | 13 +++---
 python/paddle/base/layers/math_op_patch.py    | 11 +++--
 python/paddle/base/lod_tensor.py              |  3 +-
 python/paddle/base/multiprocess_utils.py      |  7 ++-
 python/paddle/base/param_attr.py              |  2 +-
 python/paddle/base/reader.py                  | 44 +++++++++----------
 python/paddle/base/trainer_desc.py            |  2 +-
 python/paddle/base/trainer_factory.py         | 26 ++++++-----
 python/paddle/base/unique_name.py             |  3 +-
 python/paddle/base/variable_index.py          | 24 +++++-----
 python/paddle/base/wrapped_decorator.py       |  3 +-
 30 files changed, 172 insertions(+), 172 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e11ab2108c2be..8dd98b65873aa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,8 +16,9 @@ skip = ["build", "third_party", "__init__.py"]
 extend_skip_glob = [
     # These files do not need to be formatted,
     # see .flake8 for more details
-    "python/paddle/base/**",
     "python/paddle/utils/gast/**",
+    "python/paddle/base/core.py",
+    "python/paddle/base/framework.py",
 ]
 
 [tool.ruff]
diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py
index 563e423e0c7ea..1f3f67a98b640 100755
--- a/python/paddle/base/backward.py
+++ b/python/paddle/base/backward.py
@@ -12,23 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .proto import framework_pb2
-
-from paddle.base import framework as framework
-from paddle.base import program_guard
-from . import core
 import collections
 import copy
 import logging
-from . import unique_name
-from . import log_helper
-import paddle.base
-from .data_feeder import check_type
+import re
 import warnings
-
 from collections.abc import Sequence
 
-import re
+import paddle.base
+from paddle.base import framework as framework
+from paddle.base import program_guard
+
+from . import core, log_helper, unique_name
+from .data_feeder import check_type
+from .proto import framework_pb2
 
 __all__ = [
     'append_backward',
diff --git a/python/paddle/base/compiler.py b/python/paddle/base/compiler.py
index 69ae6f1d31344..3ee939920dc2b 100644
--- a/python/paddle/base/compiler.py
+++ b/python/paddle/base/compiler.py
@@ -14,9 +14,9 @@
 
 import sys
 import warnings
-from . import framework
-from .framework import cuda_places, cpu_places, xpu_places
-from . import core
+
+from . import core, framework
+from .framework import cpu_places, cuda_places, xpu_places
 
 __all__ = [
     'CompiledProgram',
@@ -399,10 +399,11 @@ def convert_concrete_program(
         """
         Convert the ConcreteProgram to IPUConcreteProgram.
         """
-        from ..base.dygraph.base import switch_to_static_graph
+        import paddle
+
         from ..base import backward
+        from ..base.dygraph.base import switch_to_static_graph
         from ..base.framework import device_guard
-        import paddle
 
         inputs = concrete_program.inputs
         outputs = concrete_program.outputs
@@ -508,14 +509,12 @@ def patch_program_cache(ipu_strategy):
         Returns:
             None
         """
+        from paddle.jit.dy2static import logging_utils
+        from paddle.jit.dy2static.partial_program import partial_program_from
         from paddle.jit.dy2static.program_translator import (
+            MAX_TRACED_PROGRAM_COUNT,
             CacheKey,
             ProgramCache,
-            MAX_TRACED_PROGRAM_COUNT,
-        )
-        from paddle.jit.dy2static import logging_utils
-        from paddle.jit.dy2static.partial_program import (
-            partial_program_from,
         )
 
         old_getter = ProgramCache.__getitem__
diff --git a/python/paddle/base/data_feed_desc.py b/python/paddle/base/data_feed_desc.py
index 8aa69890f1933..de1b00d090bb1 100644
--- a/python/paddle/base/data_feed_desc.py
+++ b/python/paddle/base/data_feed_desc.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.base.proto import data_feed_pb2
 from google.protobuf import text_format
 
+from paddle.base.proto import data_feed_pb2
+
 __all__ = ['DataFeedDesc']
 
 
diff --git a/python/paddle/base/data_feeder.py b/python/paddle/base/data_feeder.py
index 78781a6856af1..52ed983ffa729 100644
--- a/python/paddle/base/data_feeder.py
+++ b/python/paddle/base/data_feeder.py
@@ -12,20 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from . import core
-import numpy as np
-import warnings
 import struct
+import warnings
+
+import numpy as np
 
+from ..ir import OpResult
+from . import core
 from .framework import (
     Variable,
+    _cpu_num,
+    _cuda_ids,
     default_main_program,
     in_dygraph_mode,
     in_pir_mode,
 )
-from .framework import _cpu_num, _cuda_ids
-
-from ..ir import OpResult
 
 __all__ = ['DataFeeder']
 
diff --git a/python/paddle/base/dataset.py b/python/paddle/base/dataset.py
index 533fb69a6621b..961a392349707 100644
--- a/python/paddle/base/dataset.py
+++ b/python/paddle/base/dataset.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 """This is definition of dataset class, which is high performance IO."""
 
-from paddle.base.proto import data_feed_pb2
 from google.protobuf import text_format
-from . import core
+
+from paddle.base.proto import data_feed_pb2
+
 from ..utils import deprecated
+from . import core
 
 __all__ = ['DatasetFactory', 'InMemoryDataset', 'QueueDataset']
 
diff --git a/python/paddle/base/default_scope_funcs.py b/python/paddle/base/default_scope_funcs.py
index 80cfe40db57ad..992714e6cd409 100644
--- a/python/paddle/base/default_scope_funcs.py
+++ b/python/paddle/base/default_scope_funcs.py
@@ -26,9 +26,10 @@
 invoked in a new local scope.
 """
 
-import paddle.base.core
 import threading
 
+import paddle.base.core
+
 __tl_scope__ = threading.local()
 
 __all__ = [
diff --git a/python/paddle/base/dygraph/base.py b/python/paddle/base/dygraph/base.py
index 7edb748026d84..d85fc8ca25bf7 100644
--- a/python/paddle/base/dygraph/base.py
+++ b/python/paddle/base/dygraph/base.py
@@ -11,20 +11,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
-import decorator
 import inspect
 import sys
+import warnings
+
+import decorator
 import numpy as np
-from paddle.base import core
-from paddle.base import framework
+
+import paddle
+from paddle.base import core, framework
 from paddle.base.framework import global_var
 from paddle.base.multiprocess_utils import CleanupFuncRegistrar
-from .tracer import Tracer
+
 from ..data_feeder import convert_dtype
-import warnings
 from ..framework import _get_paddle_place
-import paddle
+from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
+from .tracer import Tracer
 
 __all__ = [
     'no_grad',
diff --git a/python/paddle/base/dygraph/math_op_patch.py b/python/paddle/base/dygraph/math_op_patch.py
index 9448d7d9de9dd..5972b545f93e2 100644
--- a/python/paddle/base/dygraph/math_op_patch.py
+++ b/python/paddle/base/dygraph/math_op_patch.py
@@ -12,15 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .. import core
-from ..framework import (
-    convert_np_dtype_to_dtype_,
-)
-from .. import framework
-
 import numpy as np
+
 from paddle import _C_ops, _legacy_C_ops
 
+from .. import core, framework
+from ..framework import convert_np_dtype_to_dtype_
+
 _supported_int_dtype_ = [
     core.VarDesc.VarType.UINT8,
     core.VarDesc.VarType.INT8,
diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py
index 8026884c34fc8..4f1b138abaae4 100644
--- a/python/paddle/base/dygraph/tensor_patch_methods.py
+++ b/python/paddle/base/dygraph/tensor_patch_methods.py
@@ -13,33 +13,33 @@
 # limitations under the License.
 
 import inspect
-import numpy as np
-import warnings
 import sys
+import warnings
+
+import numpy as np
 
 import paddle
-from .. import framework
-from ..framework import convert_np_dtype_to_dtype_
-from .. import core
-from .. import unique_name
+import paddle.profiler as profiler
+import paddle.utils.deprecated as deprecated
+from paddle import _C_ops
+from paddle.base.data_feeder import (
+    _PADDLE_DTYPE_2_NUMPY_DTYPE,
+    convert_uint16_to_float,
+)
+from paddle.profiler.utils import in_profiler_mode
+
+from .. import core, framework, unique_name
 from ..framework import (
-    Variable,
+    EagerParamBase,
     Parameter,
+    Variable,
     _getitem_static,
-    _setitem_static,
     _setitem_impl_,
-    EagerParamBase,
+    _setitem_static,
+    convert_np_dtype_to_dtype_,
 )
 from .base import switch_to_static_graph
 from .math_op_patch import monkey_patch_math_tensor
-from paddle.base.data_feeder import (
-    convert_uint16_to_float,
-    _PADDLE_DTYPE_2_NUMPY_DTYPE,
-)
-import paddle.utils.deprecated as deprecated
-import paddle.profiler as profiler
-from paddle.profiler.utils import in_profiler_mode
-from paddle import _C_ops
 
 _grad_scalar = None
 
diff --git a/python/paddle/base/dygraph/tracer.py b/python/paddle/base/dygraph/tracer.py
index 35cbe88f91f87..4df9517073c66 100644
--- a/python/paddle/base/dygraph/tracer.py
+++ b/python/paddle/base/dygraph/tracer.py
@@ -13,9 +13,8 @@
 # limitations under the License.
 
 
-from paddle.base import core
-from paddle.base import framework
 from paddle import _C_ops, _legacy_C_ops
+from paddle.base import core, framework
 
 name_mapping = {
     "graph_send_recv": {
diff --git a/python/paddle/base/dygraph_utils.py b/python/paddle/base/dygraph_utils.py
index 655a5f4f8b773..926c4680017ce 100644
--- a/python/paddle/base/dygraph_utils.py
+++ b/python/paddle/base/dygraph_utils.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .framework import dygraph_only
 from paddle import _legacy_C_ops
 
+from .framework import dygraph_only
+
 
 @dygraph_only
 def _append_activation_in_dygraph(
diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py
index e5fddd15329e3..0921d7b79d14b 100755
--- a/python/paddle/base/executor.py
+++ b/python/paddle/base/executor.py
@@ -12,36 +12,31 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import logging
 import os
 import sys
 import warnings
-import numpy as np
+from functools import lru_cache
 
-from . import set_flags, get_flags
-from .framework import Program, default_main_program
+import numpy as np
 
 from ..ir import OpResult
-from .wrapped_decorator import signature_safe_contextmanager
+from . import compiler, core, framework, get_flags, set_flags, unique_name
 from .data_feeder import convert_dtype
-from .framework import Variable, Operator, in_pir_mode
-
 from .framework import (
-    convert_np_dtype_to_dtype_,
+    Operator,
+    Program,
+    Variable,
     _apply_pass,
+    convert_np_dtype_to_dtype_,
+    default_main_program,
+    in_pir_mode,
     paddle_type_to_proto_type,
 )
-
-from . import core
-from . import unique_name
-from . import compiler
-from .trainer_factory import TrainerFactory
-from .trainer_factory import FetchHandlerMonitor
-import copy
-from . import framework
 from .incubate.checkpoint import auto_checkpoint as acp
-
-from functools import lru_cache
+from .trainer_factory import FetchHandlerMonitor, TrainerFactory
+from .wrapped_decorator import signature_safe_contextmanager
 
 __all__ = ['Executor', 'global_scope', 'scope_guard']
 
@@ -614,8 +609,8 @@ def _to_str(var):
 
 
 def _prepare_fleet_executor():
-    from ..distributed.fleet.proto import fleet_executor_desc_pb2
     from ..distributed.backup_env import getenv_or_backup
+    from ..distributed.fleet.proto import fleet_executor_desc_pb2
 
     trainer_endpoints_str = getenv_or_backup("PADDLE_TRAINER_ENDPOINTS", "")
     trainer_endpoints = trainer_endpoints_str.split(',')
@@ -945,7 +940,7 @@ def _get_program_and_executor(self, cached_data):
             # print(f"Program after convert:\n {inner_program}", flush=True)
         else:
             build_strategy = None
-            from paddle.incubate.autograd import prim_enabled, prim2orig
+            from paddle.incubate.autograd import prim2orig, prim_enabled
 
             if prim_enabled() and program == default_main_program():
                 prim2orig()
diff --git a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py
index 23239b692c975..e8f75f3a4ed55 100644
--- a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py
+++ b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py
@@ -12,16 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-import logging
 import json
+import logging
 import os
+import sys
 import time
 from threading import current_thread
 
-from paddle.base import unique_name, compiler
-from .checkpoint_saver import SerializableBase, CheckpointSaver, PaddleModel
-from paddle.base.framework import in_dygraph_mode, Program
+from paddle.base import compiler, unique_name
+from paddle.base.framework import Program, in_dygraph_mode
+
+from .checkpoint_saver import CheckpointSaver, PaddleModel, SerializableBase
 
 g_train_epoch_range = None
 g_checker = None
diff --git a/python/paddle/base/initializer.py b/python/paddle/base/initializer.py
index 3902281721eac..7443e63b13e52 100644
--- a/python/paddle/base/initializer.py
+++ b/python/paddle/base/initializer.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .data_feeder import check_type
 import paddle
 
+from .data_feeder import check_type
+
 __all__ = ['set_global_initializer']
 
 _global_weight_initializer_ = None
diff --git a/python/paddle/base/io.py b/python/paddle/base/io.py
index 89468f88648e8..a2c7d02ede349 100644
--- a/python/paddle/base/io.py
+++ b/python/paddle/base/io.py
@@ -15,6 +15,7 @@
 import logging
 
 from paddle.base.log_helper import get_logger
+
 from . import reader
 from .reader import *
 
diff --git a/python/paddle/base/layer_helper.py b/python/paddle/base/layer_helper.py
index e6be93e777b75..312eaf67a3320 100644
--- a/python/paddle/base/layer_helper.py
+++ b/python/paddle/base/layer_helper.py
@@ -13,18 +13,19 @@
 # limitations under the License.
 
 import copy
+
 import paddle
+
+from . import unique_name
+from .dygraph_utils import _append_activation_in_dygraph
 from .framework import (
     Parameter,
+    _global_flags,
     dtype_is_floating,
     in_dygraph_mode,
-    _global_flags,
 )
-from . import unique_name
-from .param_attr import ParamAttr
-
 from .layer_helper_base import LayerHelperBase
-from .dygraph_utils import _append_activation_in_dygraph
+from .param_attr import ParamAttr
 
 
 class LayerHelper(LayerHelperBase):
diff --git a/python/paddle/base/layer_helper_base.py b/python/paddle/base/layer_helper_base.py
index b7bc6c6b8585e..6c047c08766fe 100644
--- a/python/paddle/base/layer_helper_base.py
+++ b/python/paddle/base/layer_helper_base.py
@@ -13,21 +13,22 @@
 # limitations under the License.
 
 import copy
+
 import numpy as np
+
 import paddle
 
+from . import core, unique_name
 from .framework import (
     Variable,
+    _current_expected_place,
     default_main_program,
     default_startup_program,
     in_dygraph_mode,
     in_pir_mode,
-    _current_expected_place,
 )
-from . import unique_name
+from .initializer import _global_bias_initializer, _global_weight_initializer
 from .param_attr import ParamAttr, WeightNormParamAttr
-from . import core
-from .initializer import _global_weight_initializer, _global_bias_initializer
 
 __all__ = ['LayerHelperBase']
 
diff --git a/python/paddle/base/layers/io.py b/python/paddle/base/layers/io.py
index d4aa7734aee6f..51f5b10fe0618 100644
--- a/python/paddle/base/layers/io.py
+++ b/python/paddle/base/layers/io.py
@@ -14,13 +14,9 @@
 
 from .. import core
 from ..executor import global_scope
-from ..framework import (
-    default_main_program,
-    default_startup_program,
-)
+from ..framework import default_main_program, default_startup_program
 from ..unique_name import generate as unique_name
 
-
 __all__ = []
 
 
diff --git a/python/paddle/base/layers/layer_function_generator.py b/python/paddle/base/layers/layer_function_generator.py
index 1b1b85d00ea42..bd11a412ffc5b 100644
--- a/python/paddle/base/layers/layer_function_generator.py
+++ b/python/paddle/base/layers/layer_function_generator.py
@@ -13,21 +13,22 @@
 # limitations under the License.
 
 import re
-import warnings
 import string
-
+import warnings
 from io import StringIO
-from ..proto import framework_pb2
+
+from paddle import _C_ops, _legacy_C_ops
+
+from ..data_feeder import check_variable_and_dtype
 from ..framework import (
     OpProtoHolder,
     Variable,
-    core,
     convert_np_dtype_to_dtype_,
+    core,
     in_dygraph_mode,
 )
 from ..layer_helper import LayerHelper
-from ..data_feeder import check_variable_and_dtype
-from paddle import _C_ops, _legacy_C_ops
+from ..proto import framework_pb2
 
 __all__ = [
     'generate_layer_fn',
diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py
index 06f384eae23d1..53f35939b1f3a 100644
--- a/python/paddle/base/layers/math_op_patch.py
+++ b/python/paddle/base/layers/math_op_patch.py
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import warnings
 import inspect
+import warnings
+
+from paddle.base.dygraph.base import in_to_static_mode
 
 from .. import core
-from ..framework import Variable, unique_name, static_only
+from ..framework import Variable, static_only, unique_name
 from .layer_function_generator import OpProtoHolder
-from paddle.base.dygraph.base import in_to_static_mode
 
 _supported_int_dtype_ = [
     core.VarDesc.VarType.BOOL,
@@ -354,9 +355,7 @@ def pop(self, *args):
         Returns:
             Variable: self[index]
         """
-        from paddle.jit.dy2static.convert_operators import (
-            _run_paddle_pop,
-        )
+        from paddle.jit.dy2static.convert_operators import _run_paddle_pop
 
         if self.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
             raise TypeError(
diff --git a/python/paddle/base/lod_tensor.py b/python/paddle/base/lod_tensor.py
index 96e18ec8f3bde..4be41d5cc6adc 100644
--- a/python/paddle/base/lod_tensor.py
+++ b/python/paddle/base/lod_tensor.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import numpy as np
+
 from . import core
 from .data_feeder import DataToLoDTensorConverter
-import numpy as np
 
 __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
 
diff --git a/python/paddle/base/multiprocess_utils.py b/python/paddle/base/multiprocess_utils.py
index b763446930fdb..8d18db0bb3ea8 100644
--- a/python/paddle/base/multiprocess_utils.py
+++ b/python/paddle/base/multiprocess_utils.py
@@ -12,14 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-import signal
 import atexit
+import queue
+import signal
+import sys
 
 from . import core
 
-import queue
-
 # multi-process worker check indices queue interval, avoid
 # hanging in subprocess data loading
 MP_STATUS_CHECK_INTERVAL = 5.0
diff --git a/python/paddle/base/param_attr.py b/python/paddle/base/param_attr.py
index a17432fcc3df2..674c4ad4328c5 100644
--- a/python/paddle/base/param_attr.py
+++ b/python/paddle/base/param_attr.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 import paddle
-from paddle.regularizer import WeightDecayRegularizer
 from paddle.base.data_feeder import check_type
+from paddle.regularizer import WeightDecayRegularizer
 
 __all__ = [
     'ParamAttr',
diff --git a/python/paddle/base/reader.py b/python/paddle/base/reader.py
index 63b97ee2bd495..c3a65721db275 100644
--- a/python/paddle/base/reader.py
+++ b/python/paddle/base/reader.py
@@ -12,44 +12,44 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from . import core
+import logging
+import multiprocessing
+import queue
 import sys
-import numpy as np
 import threading
+import warnings
+
+import numpy as np
+
 import paddle
+from paddle.base.framework import _set_expected_place
 
+from . import core
+from .data_feeder import BatchedTensorProvider, DataFeeder
+from .executor import global_scope
 from .framework import (
     Program,
-    program_guard,
+    _current_expected_place,
+    _get_paddle_place,
+    _get_paddle_place_list,
     default_main_program,
     default_startup_program,
     in_dygraph_mode,
-    _current_expected_place,
+    program_guard,
 )
-from .executor import global_scope
-from .data_feeder import DataFeeder, BatchedTensorProvider
+from .layers.io import (
+    __create_unshared_decorated_reader__,
+    _copy_reader_var_,
+    monkey_patch_reader_methods,
+)
+from .multiprocess_utils import _cleanup  # noqa: F401
+from .multiprocess_utils import multiprocess_queue_set  # noqa: F401
 from .multiprocess_utils import (
-    multiprocess_queue_set,  # noqa: F401
     CleanupFuncRegistrar,
     _cleanup_mmap,
-    _cleanup,  # noqa: F401
     _set_SIGCHLD_handler,
 )
-from .layers.io import (
-    monkey_patch_reader_methods,
-    _copy_reader_var_,
-    __create_unshared_decorated_reader__,
-)
 from .unique_name import UniqueNameGenerator
-from .framework import _get_paddle_place, _get_paddle_place_list
-from paddle.base.framework import _set_expected_place
-import logging
-import warnings
-
-### Dygraph DataLoader configs ###
-import multiprocessing
-
-import queue
 
 # NOTE: [ avoid hanging & failed quickly ] These value is used in getting data from another process
 QUEUE_GET_TIMEOUT = 60
diff --git a/python/paddle/base/trainer_desc.py b/python/paddle/base/trainer_desc.py
index 48cc427ac8e7e..f64530ec02353 100644
--- a/python/paddle/base/trainer_desc.py
+++ b/python/paddle/base/trainer_desc.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 """Definition of trainers."""
 
-import sys
 import os
+import sys
 
 __all__ = [
     'TrainerDesc',
diff --git a/python/paddle/base/trainer_factory.py b/python/paddle/base/trainer_factory.py
index cf197fab524e0..75351872d73d6 100644
--- a/python/paddle/base/trainer_factory.py
+++ b/python/paddle/base/trainer_factory.py
@@ -13,33 +13,35 @@
 # limitations under the License.
 """Definition of TrainerFactory."""
 
+import logging
 import threading
 import time
-import logging
+
 import numpy as np
+
 from paddle.base.log_helper import get_logger
 
 local_logger = get_logger(
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
 )
 
-from .trainer_desc import (  # noqa: F401
-    MultiTrainer,
-    DistMultiTrainer,
-    PipelineTrainer,
-    HeterXpuTrainer,
-    PSGPUTrainer,
-    HeterPipelineTrainer,
-)
 from .device_worker import (  # noqa: F401
-    Hogwild,
-    DownpourSGD,
     DownpourLite,
-    Section,
+    DownpourSGD,
     DownpourSGDOPT,
     HeterSection,
+    Hogwild,
+    Section,
 )
 from .framework import Variable
+from .trainer_desc import (  # noqa: F401
+    DistMultiTrainer,
+    HeterPipelineTrainer,
+    HeterXpuTrainer,
+    MultiTrainer,
+    PipelineTrainer,
+    PSGPUTrainer,
+)
 
 __all__ = ["TrainerFactory", "FetchHandlerMonitor"]
 
diff --git a/python/paddle/base/unique_name.py b/python/paddle/base/unique_name.py
index 745675767f150..c240273da890d 100644
--- a/python/paddle/base/unique_name.py
+++ b/python/paddle/base/unique_name.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import collections
+
 from .wrapped_decorator import signature_safe_contextmanager
 
 __all__ = ['generate', 'switch', 'guard']
@@ -121,7 +122,7 @@ def generate(key):
 # NOTE(zhiqiu): use c++ unique_name_generator in dygraph mode,
 # in order to keep name consistency.
 def generate_with_ignorable_key(key):
-    from .framework import in_dygraph_mode, _dygraph_tracer
+    from .framework import _dygraph_tracer, in_dygraph_mode
 
     if in_dygraph_mode():
         return _dygraph_tracer()._generate_unique_name()
diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py
index 1b3039c5a8cbe..dcc87b74ea658 100644
--- a/python/paddle/base/variable_index.py
+++ b/python/paddle/base/variable_index.py
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import itertools
+import warnings
+
 import numpy as np
-from . import unique_name
-from . import core
+
 import paddle
-import warnings
-import itertools
 
+from . import core, unique_name
 
 MAX_INTEGER = 2**31 - 1
 
@@ -370,9 +371,7 @@ def _setitem_for_tensor_array(var, item, value):
         not paddle.in_dynamic_mode()
     ), "setitem for tensor_array must be called in static graph mode."
     if isinstance(item, (Variable, int)):
-        from paddle.jit.dy2static.variable_trans_func import (
-            to_static_variable,
-        )
+        from paddle.jit.dy2static.variable_trans_func import to_static_variable
         from paddle.tensor import array_write
 
         item = paddle.cast(to_static_variable(item), dtype='int64')
@@ -388,7 +387,8 @@ def _setitem_for_tensor_array(var, item, value):
 
 def _setitem_impl_(var, item, value):
     from paddle.base import core
-    from .framework import default_main_program, Variable
+
+    from .framework import Variable, default_main_program
 
     if var.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
         return _setitem_for_tensor_array(var, item, value)
@@ -572,9 +572,7 @@ def _setitem_impl_(var, item, value):
 
     if not paddle.in_dynamic_mode():
         # map var to the new output
-        from paddle.jit.dy2static.program_translator import (
-            ProgramTranslator,
-        )
+        from paddle.jit.dy2static.program_translator import ProgramTranslator
 
         ProgramTranslator.get_instance()._inplace_map.add(
             cur_block.program, var.desc.id(), output
@@ -601,8 +599,8 @@ def set_value_for_bool_tensor(var, item, value):
             )
 
     def idx_not_empty(var, item, value):
-        from .framework import Variable
         from ..tensor import gather_nd, scatter_nd_add
+        from .framework import Variable
 
         if not isinstance(value, Variable):
             value = paddle.assign(value).cast(var.dtype)
@@ -826,7 +824,7 @@ def _setitem_static(x, indices, values):
         indices(int|slice|None|Tensor|List|Tuple...): Indices, used to indicate the position of the element to be fetched.
         values(Tensor|Number|Ndarray): values to be assigned to the x.
     """
-    from .framework import default_main_program, Variable
+    from .framework import Variable, default_main_program
 
     if x.type == paddle.base.core.VarDesc.VarType.LOD_TENSOR_ARRAY:
         return _setitem_for_tensor_array(x, indices, values)
diff --git a/python/paddle/base/wrapped_decorator.py b/python/paddle/base/wrapped_decorator.py
index 7e7dbff65611e..1567bb0d4c55c 100644
--- a/python/paddle/base/wrapped_decorator.py
+++ b/python/paddle/base/wrapped_decorator.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import decorator
 import contextlib
 
+import decorator
+
 __all__ = ['wrap_decorator', 'signature_safe_contextmanager']
 
 

From eccee58b71d66c041b7c6c2554f1b83976eb4d9b Mon Sep 17 00:00:00 2001
From: Ghost Screaming <mofengshenjieII@163.com>
Date: Thu, 21 Sep 2023 11:10:14 +0800
Subject: [PATCH 023/115] [AutoParallel] Support new communication library for
 hogwild_worker, graph_helper, data_norm_op and margin_cross_entropy_op.
 (#57519)

---
 paddle/fluid/framework/hogwild_worker.cc      |  69 +++++++--
 paddle/fluid/framework/ir/graph_helper.cc     |  17 +-
 paddle/fluid/operators/data_norm_op.cu        | 115 +++++++++++---
 .../operators/margin_cross_entropy_op.cu      | 145 +++++++++++++-----
 .../core/distributed/comm_context_manager.cc  |  14 ++
 .../core/distributed/comm_context_manager.h   |   8 +
 6 files changed, 292 insertions(+), 76 deletions(-)

diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc
index cc2c70506a34c..e638fbcb8a54d 100644
--- a/paddle/fluid/framework/hogwild_worker.cc
+++ b/paddle/fluid/framework/hogwild_worker.cc
@@ -22,6 +22,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/lodtensor_printer.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
+#include "paddle/phi/core/flags.h"
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
+#endif
 
 #if defined PADDLE_WITH_PSCORE
 #include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
@@ -30,7 +37,6 @@ limitations under the License. */
 #if defined(PADDLE_WITH_GLOO)
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif
-#include "paddle/phi/core/flags.h"
 
 PHI_DECLARE_bool(enable_exit_when_partial_worker);
 
@@ -152,16 +158,59 @@ bool HogwildWorker::CheckBatchNum(int flag) {
   }
   g_barrier.wait();
   float *stat_ptr = sync_stat_.data<float>();
-  auto comm =
-      platform::NCCLCommContext::Instance().Get(0, place_.GetDeviceId());
+  int nranks = 0;
+  int ring_id = 0;
+  platform::NCCLComm *comm = nullptr;
+  const auto &comm_context_manager =
+      phi::distributed::CommContextManager::GetInstance();
+  phi::distributed::NCCLCommContext *comm_ctx = nullptr;
+  if (FLAGS_dynamic_static_unified_comm) {
+    PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)),
+                      true,
+                      platform::errors::InvalidArgument(
+                          "You choose to use new communication library by "
+                          "setting environment "
+                          "variable FLAGS_dynamic_static_unified_comm True. "
+                          "But ring_id(%d) is "
+                          "not found in comm_context_manager.",
+                          std::to_string(ring_id)));
+    comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
+        comm_context_manager.Get(std::to_string(ring_id)));
+    PADDLE_ENFORCE_NE(comm_ctx,
+                      nullptr,
+                      platform::errors::Unavailable(
+                          "NCCLCommContext is nullptr, collective op should "
+                          "has ring_id attr."));
+    nranks = comm_ctx->GetSize();
+  } else {
+    comm = platform::NCCLCommContext::Instance().Get(ring_id,
+                                                     place_.GetDeviceId());
+    nranks = comm->nranks();
+  }
+
   auto stream = static_cast<phi::GPUContext *>(dev_ctx_)->stream();
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag],
-                                                              &stat_ptr[2],
-                                                              1,
-                                                              ncclFloat32,
-                                                              ncclProd,
-                                                              comm->comm(),
-                                                              stream));
+  if (comm_ctx) {
+    // comm_ctx->AllReduce only support allreduce on the whole tensor,
+    // single element is not supported now.
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        platform::dynload::ncclAllReduce(&stat_ptr[flag],
+                                         &stat_ptr[2],
+                                         1,
+                                         ncclFloat32,
+                                         ncclProd,
+                                         comm_ctx->GetNcclComm(),
+                                         stream));
+
+  } else {
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag],
+                                                                &stat_ptr[2],
+                                                                1,
+                                                                ncclFloat32,
+                                                                ncclProd,
+                                                                comm->comm(),
+                                                                stream));
+  }
+
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret,  // output
                                              &stat_ptr[2],
                                              sizeof(float),
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index b322e3f8bce28..5d7054721db53 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -23,10 +23,14 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_utils.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+#include "paddle/phi/core/flags.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 #include "paddle/fluid/platform/flags.h"
 PD_DECLARE_bool(convert_all_blocks);
@@ -564,9 +568,16 @@ void ReplaceAllReduceOp(const Node &node,
   all_reduce_op_desc.SetType("c_allreduce_sum");
   all_reduce_op_desc.SetInput("X", {all_reduce_var_name});
   all_reduce_op_desc.SetOutput("Out", {all_reduce_var_name});
-
-  int ring_id = platform::NCCLCommContext::Instance().GetRingId(
-      dynamic_cast<details::NCCLOpHandleBase *>(&op_handle)->GetComm());
+  int ring_id = -1;
+  if (FLAGS_dynamic_static_unified_comm) {
+    ring_id = phi::distributed::CommContextManager::GetInstance().GetRingId(
+        dynamic_cast<details::NCCLOpHandleBase *>(&op_handle)->GetComm());
+    VLOG(3) << "New CommContextManager gets ring_id: " << ring_id;
+  } else {
+    ring_id = platform::NCCLCommContext::Instance().GetRingId(
+        dynamic_cast<details::NCCLOpHandleBase *>(&op_handle)->GetComm());
+    VLOG(3) << "Old NCCLCommContext gets ring_id: " << ring_id;
+  }
   all_reduce_op_desc.SetAttr("ring_id", ring_id);
   all_reduce_op_desc.SetAttr("use_calc_stream", false);
   all_reduce_op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu
index a212bc0ee9478..509c067e24e42 100644
--- a/paddle/fluid/operators/data_norm_op.cu
+++ b/paddle/fluid/operators/data_norm_op.cu
@@ -21,6 +21,10 @@ limitations under the License. */
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+#include "paddle/phi/core/flags.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 
 namespace paddle {
@@ -213,31 +217,92 @@ class DataNormGradKernel<T, phi::GPUContext> : public framework::OpKernel<T> {
 
     if (need_sync_stats) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-      auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace());
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-          reinterpret_cast<const void *>(d_batch_size),
-          reinterpret_cast<void *>(d_batch_size),
-          C,
-          platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())),
-          ncclSum,
-          comm->comm(),
-          stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-          reinterpret_cast<const void *>(d_batch_sum),
-          reinterpret_cast<void *>(d_batch_sum),
-          C,
-          platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())),
-          ncclSum,
-          comm->comm(),
-          stream));
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-          reinterpret_cast<const void *>(d_batch_square_sum),
-          reinterpret_cast<void *>(d_batch_square_sum),
-          C,
-          platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())),
-          ncclSum,
-          comm->comm(),
-          stream));
+      int rid = 0;
+      platform::NCCLComm *comm = nullptr;
+      const auto &comm_context_manager =
+          phi::distributed::CommContextManager::GetInstance();
+      phi::distributed::NCCLCommContext *comm_ctx = nullptr;
+      if (FLAGS_dynamic_static_unified_comm) {
+        PADDLE_ENFORCE_EQ(
+            comm_context_manager.Has(std::to_string(rid)),
+            true,
+            platform::errors::InvalidArgument(
+                "You choose to use new communication library by "
+                "setting environment "
+                "variable FLAGS_dynamic_static_unified_comm True. "
+                "But ring_id(%d) is "
+                "not found in comm_context_manager.",
+                std::to_string(rid)));
+        comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
+            comm_context_manager.Get(std::to_string(rid)));
+        PADDLE_ENFORCE_NE(
+            comm_ctx,
+            nullptr,
+            platform::errors::Unavailable(
+                "NCCLCommContext is nullptr, collective op should "
+                "has ring_id attr."));
+      } else {
+        comm = paddle::platform::NCCLCommContext::Instance().Get(
+            rid, ctx.GetPlace());
+      }
+
+      if (comm_ctx) {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+            reinterpret_cast<const void *>(d_batch_size),
+            reinterpret_cast<void *>(d_batch_size),
+            C,
+            platform::ToNCCLDataType(
+                framework::TransToProtoVarType(x->dtype())),
+            ncclSum,
+            comm_ctx->GetNcclComm(),
+            stream));
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+            reinterpret_cast<const void *>(d_batch_sum),
+            reinterpret_cast<void *>(d_batch_sum),
+            C,
+            platform::ToNCCLDataType(
+                framework::TransToProtoVarType(x->dtype())),
+            ncclSum,
+            comm_ctx->GetNcclComm(),
+            stream));
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+            reinterpret_cast<const void *>(d_batch_square_sum),
+            reinterpret_cast<void *>(d_batch_square_sum),
+            C,
+            platform::ToNCCLDataType(
+                framework::TransToProtoVarType(x->dtype())),
+            ncclSum,
+            comm_ctx->GetNcclComm(),
+            stream));
+      } else {
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+            reinterpret_cast<const void *>(d_batch_size),
+            reinterpret_cast<void *>(d_batch_size),
+            C,
+            platform::ToNCCLDataType(
+                framework::TransToProtoVarType(x->dtype())),
+            ncclSum,
+            comm->comm(),
+            stream));
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+            reinterpret_cast<const void *>(d_batch_sum),
+            reinterpret_cast<void *>(d_batch_sum),
+            C,
+            platform::ToNCCLDataType(
+                framework::TransToProtoVarType(x->dtype())),
+            ncclSum,
+            comm->comm(),
+            stream));
+        PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+            reinterpret_cast<const void *>(d_batch_square_sum),
+            reinterpret_cast<void *>(d_batch_square_sum),
+            C,
+            platform::ToNCCLDataType(
+                framework::TransToProtoVarType(x->dtype())),
+            ncclSum,
+            comm->comm(),
+            stream));
+      }
       platform::GpuStreamSync(stream);
 #else
       PADDLE_THROW(platform::errors::PreconditionNotMet(
diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu
index d741bc5b42549..75ef56accb10b 100644
--- a/paddle/fluid/operators/margin_cross_entropy_op.cu
+++ b/paddle/fluid/operators/margin_cross_entropy_op.cu
@@ -30,6 +30,7 @@ namespace cub = hipcub;
 #include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h"
 
 #include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/core/visit_type.h"
@@ -39,6 +40,9 @@ namespace cub = hipcub;
 #include "paddle/fluid/distributed/collective/process_group.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+#include "paddle/phi/core/flags.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 #include "paddle/phi/backends/gpu/gpu_context.h"
 
@@ -87,21 +91,50 @@ void GetClassInterval(const gpuStream_t& stream,
     auto task = pg->AllReduce(in_tensor, out_tensor, opts);
     task->Wait();
   } else {
-    const auto& comm =
-        paddle::platform::NCCLCommContext::Instance().Get(rid, place);
+    paddle::platform::NCCLComm* comm = nullptr;
+    const auto& comm_context_manager =
+        phi::distributed::CommContextManager::GetInstance();
+    phi::distributed::NCCLCommContext* comm_ctx = nullptr;
+    if (FLAGS_dynamic_static_unified_comm) {
+      PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)),
+                        true,
+                        paddle::platform::errors::InvalidArgument(
+                            "You choose to use new communication library by "
+                            "setting environment "
+                            "variable FLAGS_dynamic_static_unified_comm True. "
+                            "But ring_id(%d) is "
+                            "not found in comm_context_manager.",
+                            std::to_string(rid)));
+      comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
+          comm_context_manager.Get(std::to_string(rid)));
+      PADDLE_ENFORCE_NE(comm_ctx,
+                        nullptr,
+                        paddle::platform::errors::Unavailable(
+                            "NCCLCommContext is nullptr, collective op should "
+                            "has ring_id attr."));
+    } else {
+      comm = paddle::platform::NCCLCommContext::Instance().Get(rid, place);
+    }
+
     // use global calculate stream
     const auto calcu_stream =
         static_cast<GPUContext*>(phi::DeviceContextPool::Instance().Get(place))
             ->stream();
-
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
-        num_classes_per_device_ptr,
-        num_classes_per_device_ptr,
-        num_classes_per_device.numel(),
-        phi::ToNCCLDataType(num_classes_per_device.dtype()),
-        ncclSum,
-        comm->comm(),
-        calcu_stream));
+    if (comm_ctx) {
+      comm_ctx->AllReduce(&num_classes_per_device,
+                          num_classes_per_device,
+                          ncclSum,
+                          calcu_stream);
+    } else {
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
+          num_classes_per_device_ptr,
+          num_classes_per_device_ptr,
+          num_classes_per_device.numel(),
+          phi::ToNCCLDataType(num_classes_per_device.dtype()),
+          ncclSum,
+          comm->comm(),
+          calcu_stream));
+    }
   }
 
   class_interval->Resize({nranks + 1});
@@ -238,7 +271,10 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
   const auto& place = dev_ctx.GetPlace();  // old code
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  paddle::platform::NCCLComm* comm;
+  paddle::platform::NCCLComm* comm = nullptr;
+  const auto& comm_context_manager =
+      phi::distributed::CommContextManager::GetInstance();
+  phi::distributed::NCCLCommContext* comm_ctx = nullptr;
   paddle::distributed::ProcessGroup* pg = nullptr;
   gpuStream_t stream;
   if (nranks > 1) {
@@ -247,8 +283,29 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
       // Use ProcessGroup
       pg = map->get(ring_id);
     } else {
-      comm = paddle::platform::NCCLCommContext::Instance().Get(ring_id, place);
-
+      if (FLAGS_dynamic_static_unified_comm) {
+        PADDLE_ENFORCE_EQ(
+            comm_context_manager.Has(std::to_string(ring_id)),
+            true,
+            paddle::platform::errors::InvalidArgument(
+                "You choose to use new communication library by "
+                "setting environment "
+                "variable FLAGS_dynamic_static_unified_comm True. "
+                "But ring_id(%d) is "
+                "not found in comm_context_manager.",
+                std::to_string(ring_id)));
+        comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
+            comm_context_manager.Get(std::to_string(ring_id)));
+        PADDLE_ENFORCE_NE(
+            comm_ctx,
+            nullptr,
+            paddle::platform::errors::Unavailable(
+                "NCCLCommContext is nullptr, collective op should "
+                "has ring_id attr."));
+      } else {
+        comm =
+            paddle::platform::NCCLCommContext::Instance().Get(ring_id, place);
+      }
       // use global calculate stream
       stream = static_cast<GPUContext*>(
                    phi::DeviceContextPool::Instance().Get(place))
@@ -361,14 +418,18 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
       auto task = pg->AllReduce(in_tensor, out_tensor, opts);
       task->Wait();
     } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          phi::dynload::ncclAllReduce(logits_max_buff,
-                                      logits_max_buff,
-                                      logits_max.numel(),
-                                      phi::ToNCCLDataType(logits_max.dtype()),
-                                      ncclMax,
-                                      comm->comm(),
-                                      stream));
+      if (comm_ctx) {
+        comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream);
+      } else {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            phi::dynload::ncclAllReduce(logits_max_buff,
+                                        logits_max_buff,
+                                        logits_max.numel(),
+                                        phi::ToNCCLDataType(logits_max.dtype()),
+                                        ncclMax,
+                                        comm->comm(),
+                                        stream));
+      }
     }
   }
 #endif
@@ -402,14 +463,18 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
       auto task = pg->AllReduce(in_tensor, out_tensor, opts);
       task->Wait();
     } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
-          sum_exp_logits_buff,
-          sum_exp_logits_buff,
-          sum_exp_logits.numel(),
-          phi::ToNCCLDataType(sum_exp_logits.dtype()),
-          ncclSum,
-          comm->comm(),
-          stream));
+      if (comm_ctx) {
+        comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream);
+      } else {
+        PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
+            sum_exp_logits_buff,
+            sum_exp_logits_buff,
+            sum_exp_logits.numel(),
+            phi::ToNCCLDataType(sum_exp_logits.dtype()),
+            ncclSum,
+            comm->comm(),
+            stream));
+      }
     }
   }
 #endif
@@ -460,14 +525,18 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
       auto task = pg->AllReduce(in_tensor, out_tensor, opts);
       task->Wait();
     } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          phi::dynload::ncclAllReduce(loss_ptr,
-                                      loss_ptr,
-                                      loss->numel(),
-                                      phi::ToNCCLDataType(loss->dtype()),
-                                      ncclSum,
-                                      comm->comm(),
-                                      stream));
+      if (comm_ctx) {
+        comm_ctx->AllReduce(loss, *loss, ncclSum, stream);
+      } else {
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            phi::dynload::ncclAllReduce(loss_ptr,
+                                        loss_ptr,
+                                        loss->numel(),
+                                        phi::ToNCCLDataType(loss->dtype()),
+                                        ncclSum,
+                                        comm->comm(),
+                                        stream));
+      }
     }
   }
 #endif
diff --git a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc
index e7931282724ab..342a86313bf3f 100644
--- a/paddle/phi/core/distributed/comm_context_manager.cc
+++ b/paddle/phi/core/distributed/comm_context_manager.cc
@@ -176,6 +176,20 @@ CommContext* CommContextManager::Get(const std::string& unique_comm_key) const {
   return id_to_comm_context_.at(unique_comm_key).get();
 }
 
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+int CommContextManager::GetRingId(const ncclComm_t& comm) const {
+  for (auto iter = id_to_comm_context_.begin();
+       iter != id_to_comm_context_.end();
+       ++iter) {
+    if (static_cast<phi::distributed::NCCLCommContext*>(iter->second.get())
+            ->GetNcclComm() == comm) {
+      return std::stoi(iter->first);
+    }
+  }
+  return -1;
+}
+#endif
+
 bool CommContextManager::Has(const std::string& unique_comm_key) const {
   return id_to_comm_context_.find(unique_comm_key) != id_to_comm_context_.end();
 }
diff --git a/paddle/phi/core/distributed/comm_context_manager.h b/paddle/phi/core/distributed/comm_context_manager.h
index e2cb298a0984b..dcbfaab55af90 100644
--- a/paddle/phi/core/distributed/comm_context_manager.h
+++ b/paddle/phi/core/distributed/comm_context_manager.h
@@ -22,6 +22,10 @@
 #include "paddle/phi/core/distributed/comm_context.h"
 #include "paddle/phi/core/macros.h"
 
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/phi/backends/gpu/forwards.h"
+#endif
+
 namespace phi {
 namespace distributed {
 
@@ -44,6 +48,10 @@ class CommContextManager {
 
   CommContext* Get(const std::string& unique_comm_key) const;
 
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+  int GetRingId(const ncclComm_t& comm) const;
+#endif
+
   bool Has(const std::string& unique_comm_key) const;
 
   static void SetDeviceId(int dev_id);

From b1536e78833f22d1833cfb1171c3e6cb364e7a09 Mon Sep 17 00:00:00 2001
From: iLeGend <824040212@qq.com>
Date: Thu, 21 Sep 2023 11:11:08 +0800
Subject: [PATCH 024/115] [NewComm] No.9 compatiable upgrade for
 fused_attention op (#57560)

* [NewComm] No.9 compatiable upgrade for fused_attention op

* fix error

* fix error
---
 .../operators/fused/fused_attention_utils.h   | 50 +++++++++++++++++--
 test/legacy_test/test_fused_attention_op.py   | 10 ++++
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_attention_utils.h b/paddle/fluid/operators/fused/fused_attention_utils.h
index 26cab895f0dfc..c059a194d0ea5 100644
--- a/paddle/fluid/operators/fused/fused_attention_utils.h
+++ b/paddle/fluid/operators/fused/fused_attention_utils.h
@@ -18,8 +18,13 @@
 #include "paddle/fluid/distributed/collective/process_group_nccl.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+#include "paddle/phi/core/flags.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 
+#include "paddle/fluid/distributed/collective/utils.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
 #include "paddle/phi/core/errors.h"
 
 namespace phi {
@@ -47,11 +52,46 @@ static void AllReduce(phi::DenseTensor &tensor,  // NOLINT
     auto place = dev_ctx.GetPlace();
     void *recvbuff =
         dev_ctx.template Alloc<T>(&tensor, tensor.numel() * sizeof(T));
-    auto comm =
-        paddle::platform::NCCLCommContext::Instance().Get(ring_id, place);
-    auto stream = dev_ctx.stream();
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
-        sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream));
+    gpuStream_t stream = nullptr;
+    paddle::platform::NCCLComm *comm = nullptr;
+    phi::distributed::NCCLCommContext *comm_ctx = nullptr;
+
+    const auto &comm_context_manager =
+        phi::distributed::CommContextManager::GetInstance();
+
+    if (FLAGS_dynamic_static_unified_comm) {
+      // Use New Communication Library
+      PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)),
+                        true,
+                        paddle::platform::errors::InvalidArgument(
+                            "You choose to use new communication library by "
+                            "setting environment "
+                            "variable FLAGS_dynamic_static_unified_comm True. "
+                            "But ring_id(%d) is "
+                            "not found in comm_context_manager.",
+                            std::to_string(ring_id)));
+      comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
+          comm_context_manager.Get(std::to_string(ring_id)));
+      PADDLE_ENFORCE_NE(comm_ctx,
+                        nullptr,
+                        paddle::platform::errors::Unavailable(
+                            "NCCLCommContext is nullptr, collective op should "
+                            "has ring_id attr."));
+
+      stream = comm_ctx->GetStream();
+      VLOG(3) << "new comm_context_manager has ring_id" << ring_id;
+    } else {
+      comm = paddle::platform::NCCLCommContext::Instance().Get(ring_id, place);
+
+      stream = dev_ctx.stream();
+      VLOG(3) << "old NCCLCommContext has ring_id " << ring_id;
+    }
+    if (comm_ctx) {
+      comm_ctx->AllReduce(&tensor, tensor, ncclSum, stream);
+    } else {
+      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
+          sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream));
+    }
   }
 #else
   PADDLE_THROW(phi::errors::Unimplemented(
diff --git a/test/legacy_test/test_fused_attention_op.py b/test/legacy_test/test_fused_attention_op.py
index af734c96d19d8..0e012659f95f6 100644
--- a/test/legacy_test/test_fused_attention_op.py
+++ b/test/legacy_test/test_fused_attention_op.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import unittest
 
 import numpy as np
@@ -31,6 +32,7 @@
 
 class TestFusedAttentionOp(OpTest):
     def setUp(self):
+        self.with_new_comm()
         self.config()
         self.generate_input_data()
 
@@ -79,6 +81,9 @@ def setUp(self):
         paddle.set_default_dtype(self.x_type)
         self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")
 
+    def with_new_comm(self):
+        os.environ["FLAGS_dynamic_static_unified_comm"] = "0"
+
     def config(self):
         self.x_type = np.float32
         self.attn_mask_type = np.float64
@@ -350,6 +355,11 @@ def test_fused_attention_op(self):
         )
 
 
+class TestFusedAttentionOpWithNewComm(TestFusedAttentionOp):
+    def with_new_comm(self):
+        os.environ["FLAGS_dynamic_static_unified_comm"] = "1"
+
+
 class TestFusedAttentionOpBiasIsNone(TestFusedAttentionOp):
     def config(self):
         super().config()

From 6e9143181a8c4ba7253be9690f198cec8326e5a4 Mon Sep 17 00:00:00 2001
From: ooo oo <106524776+ooooo-create@users.noreply.github.com>
Date: Thu, 21 Sep 2023 11:12:43 +0800
Subject: [PATCH 025/115] [CodeStyle][task 11] enable Ruff F403 rule in
 `python/paddle/base/__init__.py` (#57501)

---
 pyproject.toml                         |  2 --
 python/paddle/base/__init__.py         | 47 +++++++++++++++++++++++---
 python/paddle/base/core.py             |  2 +-
 python/paddle/base/dygraph/__init__.py | 13 +++++--
 python/paddle/base/io.py               |  5 ++-
 5 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8dd98b65873aa..eca2770cb1b4d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -109,8 +109,6 @@ ignore = [
     "UP031",
     "C408",
     "UP030",
-    "F522",
-    "F403",
     "C405",
     "C417",
     "PLR0402",
diff --git a/python/paddle/base/__init__.py b/python/paddle/base/__init__.py
index 6eec276eee03d..acc6f9f51ae2f 100644
--- a/python/paddle/base/__init__.py
+++ b/python/paddle/base/__init__.py
@@ -34,17 +34,48 @@
 
 # import all class inside framework into base module
 from . import framework
-from .framework import *  # noqa: F403
+from .framework import (
+    Program,
+    default_startup_program,
+    default_main_program,
+    program_guard,
+    name_scope,
+    ipu_shard_guard,
+    set_ipu_shard,
+    cuda_places,
+    cpu_places,
+    xpu_places,
+    cuda_pinned_places,
+    in_dygraph_mode,
+    in_pir_mode,
+    in_dynamic_or_pir_mode,
+    is_compiled_with_cinn,
+    is_compiled_with_cuda,
+    is_compiled_with_rocm,
+    is_compiled_with_xpu,
+    Variable,
+    require_version,
+    device_guard,
+    set_flags,
+    get_flags,
+)
 
 # import all class inside executor into base module
 from . import executor
-from .executor import *  # noqa: F403
+from .executor import (
+    Executor,
+    global_scope,
+    scope_guard,
+)
 
 from . import data_feed_desc
-from .data_feed_desc import *  # noqa: F403
+from .data_feed_desc import DataFeedDesc
 
 from . import dataset
-from .dataset import *  # noqa: F403
+from .dataset import (
+    DatasetFactory,
+    InMemoryDataset,
+)
 
 from . import trainer_desc
 
@@ -72,7 +103,13 @@
 
 from . import unique_name
 from . import compiler
-from .compiler import *  # noqa: F403
+from .compiler import (
+    CompiledProgram,
+    ExecutionStrategy,
+    BuildStrategy,
+    IpuCompiledProgram,
+    IpuStrategy,
+)
 from paddle.base.layers.math_op_patch import monkey_patch_variable
 from .dygraph.base import enable_dygraph, disable_dygraph
 from .dygraph.tensor_patch_methods import monkey_patch_tensor
diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py
index df90a6ace8582..285a9f1b1a61b 100644
--- a/python/paddle/base/core.py
+++ b/python/paddle/base/core.py
@@ -278,7 +278,7 @@ def to_list(s):
     # assign tensor alias
     libpaddle.LoDTensor = libpaddle.Tensor
 
-    from .libpaddle import *
+    from .libpaddle import *  # noqa: F403
     from .libpaddle import (  # noqa: F401
         __doc__,
         __file__,
diff --git a/python/paddle/base/dygraph/__init__.py b/python/paddle/base/dygraph/__init__.py
index 6355ca337b9f8..2ac4df711681c 100644
--- a/python/paddle/base/dygraph/__init__.py
+++ b/python/paddle/base/dygraph/__init__.py
@@ -13,10 +13,19 @@
 # limitations under the License.
 
 from . import base
-from .base import *  # noqa: F403
+from .base import (
+    no_grad,
+    no_grad_,
+    grad,
+    guard,
+    enable_dygraph,
+    disable_dygraph,
+    enabled,
+    to_variable,
+)
 
 from . import tracer
-from .tracer import *  # noqa: F403
+from .tracer import Tracer
 
 
 __all__ = []
diff --git a/python/paddle/base/io.py b/python/paddle/base/io.py
index a2c7d02ede349..55f5c072f4e27 100644
--- a/python/paddle/base/io.py
+++ b/python/paddle/base/io.py
@@ -17,7 +17,10 @@
 from paddle.base.log_helper import get_logger
 
 from . import reader
-from .reader import *
+from .reader import (  # noqa: F401
+    PyReader,
+    DataLoader,
+)
 
 __all__ = reader.__all__
 

From 9650cf907fe3d574215e2949785075478096b8d9 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Thu, 21 Sep 2023 11:20:02 +0800
Subject: [PATCH 026/115] [PIR]Rename flags (#57496)

* rename flag

* fix py3 bugs

* modify demo code
---
 paddle/fluid/framework/feed_fetch_method.cc   |  2 +-
 .../new_executor/standalone_executor.cc       |  8 +++---
 .../tensor_operants_gen.py                    | 24 ++++++++---------
 paddle/phi/core/flags.cc                      |  6 ++---
 python/paddle/base/framework.py               |  8 +++---
 python/paddle/pir_utils.py                    | 26 +++++++++----------
 test/ir/new_ir/CMakeLists.txt                 |  2 +-
 test/ir/new_ir/test_ir_backward.py            |  6 ++---
 test/prim/new_ir_prim/CMakeLists.txt          |  2 +-
 9 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 1f2f645f97dc8..7a62b5563f30a 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "glog/logging.h"
 
 PHI_DECLARE_bool(enable_new_ir_in_executor);
-PHI_DECLARE_bool(enable_new_ir_api);
+PHI_DECLARE_bool(enable_pir_api);
 
 namespace phi {
 class DenseTensor;
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index 99b42bee8b73f..f06bee2c884e3 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -28,7 +28,7 @@
 #include "paddle/pir/pass/pass_manager.h"
 
 PHI_DECLARE_bool(enable_new_ir_in_executor);
-PHI_DECLARE_bool(enable_new_ir_api);
+PHI_DECLARE_bool(enable_pir_api);
 PHI_DECLARE_bool(new_ir_apply_inplace_pass);
 
 namespace paddle {
@@ -55,7 +55,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
     const std::string& job_type = job->Type();
     std::shared_ptr<ProgramDesc> program = nullptr;
     std::shared_ptr<::pir::Program> ir_program = nullptr;
-    if (FLAGS_enable_new_ir_api) {
+    if (FLAGS_enable_pir_api) {
       ir_program = plan_.IrProgram(job_type);
     } else {
       program = std::make_shared<ProgramDesc>(*(plan_.Program(job_type)));
@@ -69,7 +69,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
                                  micro_batch_id,
                                  micro_batch_num));
 
-    if (micro_batch_num > 1 && !FLAGS_enable_new_ir_api) {
+    if (micro_batch_num > 1 && !FLAGS_enable_pir_api) {
       SetColAttrForFeedFetchOps(program, micro_batch_num, micro_batch_id);
     }
 
@@ -80,7 +80,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
     // TODO(phlrain) we only support cpu for now
     if (FLAGS_enable_new_ir_in_executor) {
       std::shared_ptr<::pir::Program> base_program = ir_program;
-      if (!FLAGS_enable_new_ir_api) {
+      if (!FLAGS_enable_pir_api) {
         VLOG(6) << "begin to translate" << std::endl;
         base_program = paddle::TranslateLegacyProgramToProgram(*program);
       }
diff --git a/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py b/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py
index 783066f0fc906..0bc050f00d4a0 100644
--- a/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py
+++ b/paddle/fluid/prim/api/auto_code_generated/tensor_operants_gen.py
@@ -214,7 +214,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 #include "paddle/fluid/primitive/backend/backend.h"
 #include "paddle/fluid/primitive/type/lazy_tensor.h"
 
-PHI_DECLARE_bool(enable_new_ir_api);
+PHI_DECLARE_bool(enable_pir_api);
 
 """
 
@@ -227,7 +227,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 using LazyTensor = paddle::primitive::LazyTensor;
 
 Tensor StaticTensorOperants::add(const Tensor& x, const Scalar& y) {
-  if (FLAGS_enable_new_ir_api) {
+  if (FLAGS_enable_pir_api) {
     return paddle::primitive::backend::add<LazyTensor>(x, paddle::primitive::backend::full<LazyTensor>(x.shape(), y, x.dtype(), x.place()));
   } else {
     return paddle::prim::add<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
@@ -235,7 +235,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 }
 
 Tensor StaticTensorOperants::subtract(const Tensor& x, const Scalar& y) {
-  if (FLAGS_enable_new_ir_api) {
+  if (FLAGS_enable_pir_api) {
     return paddle::primitive::backend::subtract<LazyTensor>(x, paddle::primitive::backend::full<LazyTensor>(x.shape(), y, x.dtype(), x.place()));
   } else {
     return paddle::prim::subtract<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
@@ -243,7 +243,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 }
 
 Tensor StaticTensorOperants::multiply(const Tensor& x, const Scalar& y) {
-  if (FLAGS_enable_new_ir_api) {
+  if (FLAGS_enable_pir_api) {
     return paddle::primitive::backend::scale<LazyTensor>(x, y, 0.0f, true);
   } else {
     return paddle::prim::scale<DescTensor>(x, y, 0.0f, true);
@@ -251,7 +251,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 }
 
 Tensor StaticTensorOperants::divide(const Tensor& x, const Scalar& y) {
-  if (FLAGS_enable_new_ir_api) {
+  if (FLAGS_enable_pir_api) {
     return paddle::primitive::backend::divide<LazyTensor>(x, paddle::primitive::backend::full<LazyTensor>(x.shape(), y, x.dtype(), x.place()));
   } else {
     return paddle::prim::divide<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
@@ -259,7 +259,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 }
 
 Tensor StaticTensorOperants::add(const Scalar& x, const Tensor& y) {
-  if (FLAGS_enable_new_ir_api) {
+  if (FLAGS_enable_pir_api) {
     return paddle::primitive::backend::add<LazyTensor>(paddle::primitive::backend::full<LazyTensor>(y.shape(), x, y.dtype(), y.place()), y);
   } else {
     return paddle::prim::add<DescTensor>(paddle::prim::full<DescTensor>(y.shape(), x, y.dtype(), y.place()), y);
@@ -268,7 +268,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 
 
 Tensor StaticTensorOperants::subtract(const Scalar& x, const Tensor& y) {
-  if (FLAGS_enable_new_ir_api) {
+  if (FLAGS_enable_pir_api) {
     return paddle::primitive::backend::subtract<LazyTensor>(paddle::primitive::backend::full<LazyTensor>(y.shape(), x, y.dtype(), y.place()), y);
   } else {
     return paddle::prim::subtract<DescTensor>(paddle::prim::full<DescTensor>(y.shape(), x, y.dtype(), y.place()), y);
@@ -276,7 +276,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 }
 
 Tensor StaticTensorOperants::multiply(const Scalar& x, const Tensor& y) {
-  if (FLAGS_enable_new_ir_api) {
+  if (FLAGS_enable_pir_api) {
     return paddle::primitive::backend::scale<LazyTensor>(y, x, 0.0f, true);
   } else {
     return paddle::prim::scale<DescTensor>(y, x, 0.0f, true);
@@ -284,7 +284,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 }
 
 Tensor StaticTensorOperants::divide(const Scalar& x, const Tensor& y) {
-  if (FLAGS_enable_new_ir_api) {
+  if (FLAGS_enable_pir_api) {
     return paddle::primitive::backend::divide<LazyTensor>(paddle::primitive::backend::full<LazyTensor>(y.shape(), x, y.dtype(), y.place()), y);
   } else {
     return paddle::prim::divide<DescTensor>(paddle::prim::full<DescTensor>(y.shape(), x, y.dtype(), y.place()), y);
@@ -292,7 +292,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 }
 
 Tensor StaticTensorOperants::pow(const Tensor& x, const Tensor& y) {
-  if (FLAGS_enable_new_ir_api) {
+  if (FLAGS_enable_pir_api) {
     return paddle::primitive::backend::elementwise_pow<LazyTensor>(x, y);
   } else {
     return paddle::prim::elementwise_pow<DescTensor>(x, y);
@@ -300,7 +300,7 @@ class StaticTensorOperants : public TensorOperantsBase {
 }
 
 Tensor StaticTensorOperants::pow(const Tensor& x, const Scalar& y) {
-  if (FLAGS_enable_new_ir_api) {
+  if (FLAGS_enable_pir_api) {
     return paddle::primitive::backend::elementwise_pow<LazyTensor>(x, paddle::primitive::backend::full<LazyTensor>(x.shape(), y, x.dtype(), x.place()));
   } else {
     return paddle::prim::elementwise_pow<DescTensor>(x, paddle::prim::full<DescTensor>(x.shape(), y, x.dtype(), x.place()));
@@ -393,7 +393,7 @@ def gene_static_tensor_func_call(self):
         )
         static_func_parameters = self.get_func_args()
 
-        static_tensor_func_call = f"""if (FLAGS_enable_new_ir_api) {{
+        static_tensor_func_call = f"""if (FLAGS_enable_pir_api) {{
     return {backend_static_func_name}({static_func_parameters});
   }} else {{
     return {prim_static_func_name}({static_func_parameters});
diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc
index e02868d5e2c1b..ce03cdb3f4d69 100644
--- a/paddle/phi/core/flags.cc
+++ b/paddle/phi/core/flags.cc
@@ -1278,15 +1278,13 @@ PHI_DEFINE_EXPORTED_bool(enable_new_ir_in_executor,
 
 /**
  * Using new IR API in Python
- * Name: enable_new_ir_api
+ * Name: enable_pir_api
  * Since Version: 2.6.0
  * Value Range: bool, default=false
  * Example:
  * Note: If Ture, New IR API will be used in Python
  */
-PHI_DEFINE_EXPORTED_bool(enable_new_ir_api,
-                         false,
-                         "Enable new IR API in Python");
+PHI_DEFINE_EXPORTED_bool(enable_pir_api, false, "Enable new IR API in Python");
 
 /**
  * Using new IR in executor FLAG
diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py
index 0440af415a7d0..d3f17ea6435e9 100644
--- a/python/paddle/base/framework.py
+++ b/python/paddle/base/framework.py
@@ -162,8 +162,8 @@ def __init__(self):
         self._in_to_static_mode_ = False
         self._functional_dygraph_context_manager = None
         self._dygraph_tracer_ = _dygraph_tracer_
-        self._use_pir_api_ = get_flags("FLAGS_enable_new_ir_api")[
-            'FLAGS_enable_new_ir_api'
+        self._use_pir_api_ = get_flags("FLAGS_enable_pir_api")[
+            'FLAGS_enable_pir_api'
         ]
 
     def __str__(self):
@@ -340,8 +340,8 @@ def in_dynamic_or_pir_mode():
             >>> print(paddle.framework.in_dynamic_or_pir_mode())
             False
 
-            >>> paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True})
-            >>> print(paddle.framework.in_dynamic_or_pir_mode())
+            >>> with paddle.pir_utils.IrGuard():
+            ...     print(paddle.framework.in_dynamic_or_pir_mode())
             True
 
     """
diff --git a/python/paddle/pir_utils.py b/python/paddle/pir_utils.py
index a62fe6f61a924..9af825cfcd88b 100644
--- a/python/paddle/pir_utils.py
+++ b/python/paddle/pir_utils.py
@@ -19,11 +19,11 @@
 class IrGuard:
     def __init__(self):
         self.in_dygraph_outside = False
-        old_flag = paddle.base.framework.get_flags("FLAGS_enable_new_ir_api")
-        paddle.base.framework.set_flags({"FLAGS_enable_new_ir_api": False})
+        old_flag = paddle.base.framework.get_flags("FLAGS_enable_pir_api")
+        paddle.base.framework.set_flags({"FLAGS_enable_pir_api": False})
         paddle.base.framework.global_var._use_pir_api_ = False
-        if not paddle.base.framework.get_flags("FLAGS_enable_new_ir_api")[
-            "FLAGS_enable_new_ir_api"
+        if not paddle.base.framework.get_flags("FLAGS_enable_pir_api")[
+            "FLAGS_enable_pir_api"
         ]:
             self.old_Program = paddle.static.Program
             self.old_program_guard = paddle.base.program_guard
@@ -34,31 +34,31 @@ def __init__(self):
         else:
             raise RuntimeError(
                 "IrGuard only init when paddle.framework.in_pir_mode(): is false, \
-                please set FLAGS_enable_new_ir_api = false"
+                please set FLAGS_enable_pir_api = false"
             )
         paddle.base.framework.set_flags(old_flag)
         paddle.base.framework.global_var._use_pir_api_ = old_flag[
-            "FLAGS_enable_new_ir_api"
+            "FLAGS_enable_pir_api"
         ]
 
     def __enter__(self):
         self.in_dygraph_outside = paddle.base.framework.in_dygraph_mode()
         if self.in_dygraph_outside:
             paddle.enable_static()
-        paddle.framework.set_flags({"FLAGS_enable_new_ir_api": True})
+        paddle.framework.set_flags({"FLAGS_enable_pir_api": True})
         paddle.base.framework.global_var._use_pir_api_ = True
         self._switch_to_pir()
 
     def __exit__(self, exc_type, exc_val, exc_tb):
-        paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False})
+        paddle.framework.set_flags({"FLAGS_enable_pir_api": False})
         paddle.base.framework.global_var._use_pir_api_ = False
         self._switch_to_old_ir()
         if self.in_dygraph_outside:
             paddle.disable_static()
 
     def _switch_to_pir(self):
-        if paddle.base.framework.get_flags("FLAGS_enable_new_ir_api")[
-            "FLAGS_enable_new_ir_api"
+        if paddle.base.framework.get_flags("FLAGS_enable_pir_api")[
+            "FLAGS_enable_pir_api"
         ]:
             paddle.framework.set_flags(
                 {"FLAGS_enable_new_ir_in_executor": True}
@@ -76,8 +76,8 @@ def _switch_to_pir(self):
             )
 
     def _switch_to_old_ir(self):
-        if not paddle.base.framework.get_flags("FLAGS_enable_new_ir_api")[
-            "FLAGS_enable_new_ir_api"
+        if not paddle.base.framework.get_flags("FLAGS_enable_pir_api")[
+            "FLAGS_enable_pir_api"
         ]:
             paddle.framework.set_flags(
                 {"FLAGS_enable_new_ir_in_executor": False}
@@ -93,5 +93,5 @@ def _switch_to_old_ir(self):
         else:
             raise RuntimeError(
                 "IrGuard._switch_to_old_ir only work when paddle.framework.in_pir_mode() is false, \
-                please set FLAGS_enable_new_ir_api = false"
+                please set FLAGS_enable_pir_api = false"
             )
diff --git a/test/ir/new_ir/CMakeLists.txt b/test/ir/new_ir/CMakeLists.txt
index e213eaba4c53c..75587db97c088 100644
--- a/test/ir/new_ir/CMakeLists.txt
+++ b/test/ir/new_ir/CMakeLists.txt
@@ -15,7 +15,7 @@ foreach(target ${TEST_INTERP_CASES})
 endforeach()
 
 foreach(target ${TEST_IR_SYSTEM_CASES})
-  py_test_modules(${target} MODULES ${target} ENVS FLAGS_enable_new_ir_api=true)
+  py_test_modules(${target} MODULES ${target} ENVS FLAGS_enable_pir_api=true)
 endforeach()
 
 set_tests_properties(test_pd_inplace_pass PROPERTIES TIMEOUT 60)
diff --git a/test/ir/new_ir/test_ir_backward.py b/test/ir/new_ir/test_ir_backward.py
index acffcf4ee28d6..c604290d34cad 100644
--- a/test/ir/new_ir/test_ir_backward.py
+++ b/test/ir/new_ir/test_ir_backward.py
@@ -38,7 +38,7 @@ def get_ir_program_0():
 
 class TesBackward_1(unittest.TestCase):
     def tearDown(self) -> None:
-        paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False})
+        paddle.framework.set_flags({"FLAGS_enable_pir_api": False})
 
     def test_grad(self):
         newir_program = get_ir_program_0()
@@ -155,7 +155,7 @@ def get_ir_program_1():
 
 class TesBackward_2(unittest.TestCase):
     def tearDown(self) -> None:
-        paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False})
+        paddle.framework.set_flags({"FLAGS_enable_pir_api": False})
 
     def test_add_n(self):
         newir_program = get_ir_program_1()
@@ -231,7 +231,7 @@ def get_ir_program_2():
 
 class TestBackward_3(unittest.TestCase):
     def tearDown(self) -> None:
-        paddle.framework.set_flags({"FLAGS_enable_new_ir_api": False})
+        paddle.framework.set_flags({"FLAGS_enable_pir_api": False})
 
     def test_basic_network(self):
         newir_program = get_ir_program_2()
diff --git a/test/prim/new_ir_prim/CMakeLists.txt b/test/prim/new_ir_prim/CMakeLists.txt
index 1b37b432d2052..a36e905e0c9f4 100644
--- a/test/prim/new_ir_prim/CMakeLists.txt
+++ b/test/prim/new_ir_prim/CMakeLists.txt
@@ -3,7 +3,7 @@ set(TEST_PRIM_PURE_NEW_IR_CASES test_prim_program test_prim_simpnet
 
 foreach(target ${TEST_PRIM_PURE_NEW_IR_CASES})
   py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1
-                  FLAGS_enable_new_ir_api=true)
+                  FLAGS_enable_pir_api=true)
 endforeach()
 
 file(

From c882037892eaa80250a2e06b3f032326a1629661 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?=
 <39978853+zhoutianzi666@users.noreply.github.com>
Date: Thu, 21 Sep 2023 11:24:42 +0800
Subject: [PATCH 027/115] remove SetTensorDynamicRange in softmax (#57538)

---
 paddle/fluid/inference/tensorrt/convert/softmax_op.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
index 8e101075768e0..9aefd7fb28b39 100644
--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
@@ -84,8 +84,6 @@ class SoftMaxOpConverter : public OpConverter {
     }
     layer->setAxes(1 << axes);
 
-    // The trt will not run int for softmax.
-    engine_->SetTensorDynamicRange(input1, 1.0);
     auto output_name = op_desc.Output("Out")[0];
 
     // support 0 or 1 dims input

From 5be4e463cde24dec8cd0cb60833224022f24f90e Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Thu, 21 Sep 2023 12:13:30 +0800
Subject: [PATCH 028/115] [PIR]Fix arange op and assign op bug (#57494)

* fix arange kernel selected bug

* revert some code

* fix compile bug
---
 .../fluid/ir_adaptor/translator/op_translator.cc  |  4 ++--
 .../fluid/pir/transforms/pd_op_to_kernel_pass.cc  | 15 +++++++++++----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc
index e3eeaab4f7d48..b11101de616b8 100644
--- a/paddle/fluid/ir_adaptor/translator/op_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc
@@ -754,8 +754,8 @@ struct AssignValueOpTranscriber : public OpTranscriber {
         attribute_translator(attr_info_maps.at("dtype").type_name, legacy_attr);
     attribute_map["dtype"] = attr_dtype;
 
-    pir::Attribute attr_place =
-        dialect::PlaceAttribute::get(ctx, phi::CPUPlace());
+    pir::Attribute attr_place = dialect::PlaceAttribute::get(
+        ctx, phi::Place(phi::AllocationType::UNDEFINED));
     attribute_map["place"] = attr_place;
 
     int dtype = paddle::get<int>(op_desc.GetAttr("dtype"));
diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
index d77161992c311..79e6bbe71230e 100644
--- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
@@ -401,7 +401,8 @@ phi::DataType GetKernelDataTypeByYamlInfo(
 phi::Backend GetKernelBackendByYamlInfo(
     const pir::Operation* op,
     const std::unordered_map<pir::Value, pir::OpResult>& map_value_pair,
-    const dialect::OpYamlInfoParser* op_info_parser) {
+    const dialect::OpYamlInfoParser* op_info_parser,
+    const phi::Place& place) {
   auto& attr_map = op->attributes();
   auto& backend_info = op_info_parser->OpRuntimeInfo().kernel_key_backend;
   phi::Backend kernel_backend = phi::Backend::UNDEFINED;
@@ -465,6 +466,10 @@ phi::Backend GetKernelBackendByYamlInfo(
     }
   }
 
+  if (backend_info.size() > 0 && kernel_backend == phi::Backend::UNDEFINED) {
+    kernel_backend = paddle::experimental::ParseBackend(place);
+  }
+
   return kernel_backend;
 }
 
@@ -518,7 +523,7 @@ phi::KernelKey GetKernelKey(
     kernel_data_type =
         GetKernelDataTypeByYamlInfo(op, map_value_pair, op_info_parser);
     kernel_backend =
-        GetKernelBackendByYamlInfo(op, map_value_pair, op_info_parser);
+        GetKernelBackendByYamlInfo(op, map_value_pair, op_info_parser, place);
 
     // parse all the input tensor
     if (tensor_input_number == 0 || op->isa<paddle::dialect::Full_Op>()) {
@@ -550,7 +555,9 @@ phi::KernelKey GetKernelKey(
     }
   }
 
-  if (op->num_operands() > 0) {
+  if ((kernel_backend == phi::Backend::UNDEFINED ||
+       kernel_data_type == phi::DataType::UNDEFINED) &&
+      op->num_operands() > 0) {
     paddle::experimental::detail::KernelKeyParser kernel_key_parser;
 
     for (size_t i = 0; i < op->num_operands(); ++i) {
@@ -724,7 +731,7 @@ void HandleForSpecialOp(
     pir::IrContext* ctx,
     std::unordered_map<pir::Operation*, pir::Operation*>* map_op_pair,
     std::unordered_map<pir::Value, pir::OpResult>* map_value_pair) {
-  if (op_item->name() == "pd_op.if") {
+  if (op_item->isa<paddle::dialect::IfOp>()) {
     HandleForIfOp(place, op_item, block, ctx, map_op_pair, map_value_pair);
     return;
   }

From b13dcb85918bb467ebe557093e22bc2482479c93 Mon Sep 17 00:00:00 2001
From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com>
Date: Thu, 21 Sep 2023 14:13:23 +0800
Subject: [PATCH 029/115] support pir jit prim (#57561)

---
 .../jit/dy2static/newir_partial_program.py    | 32 +++++++---
 .../jit/dy2static/program_translator.py       | 60 +++++++++++++++++--
 test/prim/new_ir_prim/CMakeLists.txt          |  2 +-
 test/prim/new_ir_prim/test_prim_jit.py        | 58 ++++++++++++++++++
 4 files changed, 138 insertions(+), 14 deletions(-)
 create mode 100644 test/prim/new_ir_prim/test_prim_jit.py

diff --git a/python/paddle/jit/dy2static/newir_partial_program.py b/python/paddle/jit/dy2static/newir_partial_program.py
index 83cb5eed92534..c0da8f35c822a 100644
--- a/python/paddle/jit/dy2static/newir_partial_program.py
+++ b/python/paddle/jit/dy2static/newir_partial_program.py
@@ -642,11 +642,15 @@ def _insert_aggregation_ops_for_var(target_program, var):
     @switch_to_static_graph
     def _append_backward_desc(self, main_program):
         program = main_program
-        # if self._hooker:
-        # program = self._hooker.before_append_backward(program)
+
         targets = list(
             filter(lambda x: isinstance(x, OpResult), self._outputs.tolist())
         )
+        if self._hooker:
+            program, targets = self._hooker.before_append_backward(
+                program, targets
+            )
+            self._outputs = NestSequence(targets, need_check=True)
         inputs = list(
             filter(lambda x: isinstance(x, OpResult), self._inputs.tolist())
         )
@@ -676,11 +680,15 @@ def _append_backward_desc(self, main_program):
                     forward_outputs_grads.append(opres)
                     not_stop_gradient_num += 1
 
-            # TODO: add later.
-            # if self._hooker:
-            # program, start_idx = self._hooker.after_append_backward(
-            # program, start_idx
-            # )
+            if self._hooker:
+                (
+                    program,
+                    forward_end_idx,
+                    targets,
+                ) = self._hooker.after_append_backward(
+                    program, targets, forward_end_idx
+                )
+                self._outputs = NestSequence(targets, need_check=True)
 
             # TODO: add later
             # self.prepare_gradient_aggregation(
@@ -692,6 +700,8 @@ def _append_backward_desc(self, main_program):
         )
         hash_id = paddle.utils._hash_with_id(program, self)
         extra_info = self._program_extra_info.get(hash_id, {})
+        extra_info['forward_inputs'] = inputs
+        extra_info['forward_outputs'] = targets
         extra_info['forward_end_op_idx'] = forward_end_idx
         extra_info['forward_inputs_grads'] = list(
             map(mapping_op_result, grad_info_map)
@@ -791,8 +801,10 @@ def _get_forward_backward_program_form(
         forward_inputs_grads = self.get_program_extra(whole_program)[
             'forward_inputs_grads'
         ]
-        forward_inputs = self._inputs.tolist()
-        forward_outputs = self._outputs.tolist()
+        forward_inputs = self.get_program_extra(whole_program)['forward_inputs']
+        forward_outputs = self.get_program_extra(whole_program)[
+            'forward_outputs'
+        ]
         forward_outputs_grads = self.get_program_extra(whole_program)[
             'forward_outputs_grads'
         ]
@@ -947,9 +959,11 @@ def create_out(var_id):
                 tensor_type = paddle.dtype(8)  # SELECT ROW TENSOR
 
             # TODO(xiongkun): more elegent way to do it.
+
             ir_dtype_2_tensor_dtype = {
                 10: paddle.dtype(5),
             }
+
             out = core.eager.Tensor(
                 ir_dtype_2_tensor_dtype[int(var.dtype)],
                 var.shape,
diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py
index 592665596cfef..8eb118852a764 100644
--- a/python/paddle/jit/dy2static/program_translator.py
+++ b/python/paddle/jit/dy2static/program_translator.py
@@ -21,6 +21,7 @@
 import weakref
 
 import paddle.ir.core as ir_static
+from paddle import decomposition
 from paddle.base import core, framework
 from paddle.base.data_feeder import check_type
 from paddle.base.dygraph.base import (
@@ -42,6 +43,9 @@
     get_buffers,
     get_parameters,
 )
+from .newir_partial_program import (
+    PartialProgramLayerHook as PirPartialProgramLayerHook,
+)
 from .origin_info import (
     attach_origin_info,
     create_and_update_origin_info_map,
@@ -1473,6 +1477,46 @@ def __setattr__(self, key, value):
         return super().__setattr__(key, value)
 
 
+class PirPrimHooker(PirPartialProgramLayerHook):
+    def __init__(self, original_program, backend):
+        self.backend = backend
+        self.custom_vjps = set()
+        with backend_guard(self.backend):
+            if core._is_all_prim_enabled():
+                self.custom_vjps = {
+                    op.name()
+                    for op in original_program.global_block().ops
+                    if core.has_custom_vjp(op)
+                }
+
+    def before_append_backward(self, forward_program, src_vars):
+        with backend_guard(self.backend):
+            if core._is_fwd_prim_enabled():
+                dst_vars = decomposition.decompose(
+                    forward_program, src_vars, blacklist=self.custom_vjps
+                )
+            return forward_program, dst_vars
+
+    def after_append_backward(self, whole_program, src_vars, forward_end_idx):
+        with backend_guard(self.backend):
+            backward_length = (
+                len(whole_program.global_block().ops) - forward_end_idx
+            )
+            if core._is_fwd_prim_enabled() and len(self.custom_vjps) != 0:
+                # only process backward part of block
+                dst_vars = decomposition.decompose(whole_program, src_vars)
+            new_start_index = (
+                len(whole_program.global_block().ops) - backward_length
+            )
+            return whole_program, new_start_index, dst_vars
+
+    def after_infer(self, infer_program, src_vars):
+        with backend_guard(self.backend):
+            if core._is_fwd_prim_enabled():
+                dst_vars = decomposition.decompose(infer_program, src_vars)
+            return infer_program, dst_vars
+
+
 class ProgramCache:
     """
     Wrapper class for the program functions defined by dygraph function.
@@ -1530,7 +1574,10 @@ def _build_once(self, cache_key):
                 raise
 
         backend = cache_key.kwargs['backend']
-        if prim_or_cinn_is_enabled(cache_key.kwargs['build_strategy'], backend):
+        if (
+            prim_or_cinn_is_enabled(cache_key.kwargs['build_strategy'], backend)
+            and not use_pir_api()
+        ):
             for var in concrete_program.main_program.list_vars():
                 if var.type not in NO_SHAPE_VAR_TYPE and -1 in var.shape:
                     warnings.warn(
@@ -1553,9 +1600,14 @@ def _build_once(self, cache_key):
             )
         with backend_guard(backend):
             if core._is_fwd_prim_enabled():
-                partial_program.set_hooker(
-                    PrimHooker(concrete_program.main_program, backend)
-                )
+                if use_pir_api():
+                    partial_program.set_hooker(
+                        PirPrimHooker(concrete_program.main_program, backend)
+                    )
+                else:
+                    partial_program.set_hooker(
+                        PrimHooker(concrete_program.main_program, backend)
+                    )
         return concrete_program, partial_program
 
     def __getitem__(self, item):
diff --git a/test/prim/new_ir_prim/CMakeLists.txt b/test/prim/new_ir_prim/CMakeLists.txt
index a36e905e0c9f4..e1cbcd60f8ee4 100644
--- a/test/prim/new_ir_prim/CMakeLists.txt
+++ b/test/prim/new_ir_prim/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TEST_PRIM_PURE_NEW_IR_CASES test_prim_program test_prim_simpnet
-                                test_prim_custom_vjp)
+                                test_prim_custom_vjp test_prim_jit)
 
 foreach(target ${TEST_PRIM_PURE_NEW_IR_CASES})
   py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1
diff --git a/test/prim/new_ir_prim/test_prim_jit.py b/test/prim/new_ir_prim/test_prim_jit.py
new file mode 100644
index 0000000000000..72958eff9a1d7
--- /dev/null
+++ b/test/prim/new_ir_prim/test_prim_jit.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle.framework import core
+
+
+class TestDy2staticNewIR(unittest.TestCase):
+    def test_basic_network_backward(self):
+        core._set_prim_all_enabled(True)
+
+        def func(x):
+            x1 = paddle.mean(x)
+            out = paddle.nn.functional.gelu(x1, False)
+            return out
+
+        # ==== dygraph computation ====
+        static_func = paddle.jit.to_static(func)
+        x = paddle.randn((8, 16, 64))
+        x.stop_gradient = False
+        ref_out = func(x) * 2
+        ref_out.backward()
+        ref_grad = x.grad.numpy()
+        x.clear_gradient()
+
+        # ==== to static compuatation ====
+        out = static_func(x)
+        actual_out = out * 2
+        actual_out.backward()
+        actual_grad = x.grad
+        core._set_prim_all_enabled(True)
+
+        np.testing.assert_allclose(
+            ref_out, actual_out.numpy(), atol=1e-6, rtol=1e-6
+        )
+
+        np.testing.assert_allclose(
+            ref_grad, actual_grad.numpy(), atol=1e-6, rtol=1e-6
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

From e24119c3e6ac49486f83fcdafad0ae6844a7633a Mon Sep 17 00:00:00 2001
From: BiynXu <62832681+BiynXu@users.noreply.github.com>
Date: Thu, 21 Sep 2023 14:14:02 +0800
Subject: [PATCH 030/115] [Fix] fix multi device compile error (#57530)

Add device_id directory when dumping information.
Reduce threads during multi card compilation.
---
 paddle/cinn/backends/compiler.cc              | 41 ++++++++++++++-----
 paddle/cinn/backends/compiler.h               | 19 ++++++---
 paddle/cinn/hlir/framework/graph.cc           | 10 ++++-
 .../cinn/hlir/framework/parallel_compiler.cc  | 38 ++++++++++++-----
 .../cinn/hlir/framework/parallel_compiler.h   | 11 ++++-
 5 files changed, 88 insertions(+), 31 deletions(-)

diff --git a/paddle/cinn/backends/compiler.cc b/paddle/cinn/backends/compiler.cc
index 448bef2392a9f..a913a3de86692 100644
--- a/paddle/cinn/backends/compiler.cc
+++ b/paddle/cinn/backends/compiler.cc
@@ -45,7 +45,7 @@ using CompilationStatus = hlir::framework::CompilationStatus;
 static constexpr int DebugLogMaxLen = 30000;
 
 void CompilationInfoDumper::DumpLoweredFuncByGroupIndex(
-    const ir::LoweredFunc& lowered_func, const int gidx) {
+    const ir::LoweredFunc& lowered_func, const int gidx, const int device_id) {
   if (FLAGS_cinn_dump_group_lowered_func.empty() ||
       lowered_func.get() == nullptr) {
     return;
@@ -54,34 +54,42 @@ void CompilationInfoDumper::DumpLoweredFuncByGroupIndex(
   content << lowered_func;
   Dump(FLAGS_cinn_dump_group_lowered_func,
        gidx,
+       device_id,
        "lowered_function.txt",
        content.str());
 }
 
 void CompilationInfoDumper::DumpSourceCodeByGroupIndex(
-    const std::string& source_code, const int gidx) {
+    const std::string& source_code, const int gidx, const int device_id) {
   if (FLAGS_cinn_dump_group_source_code.empty()) {
     return;
   }
-  Dump(FLAGS_cinn_dump_group_source_code, gidx, "source_code.cu", source_code);
+  Dump(FLAGS_cinn_dump_group_source_code,
+       gidx,
+       device_id,
+       "source_code.cu",
+       source_code);
 }
 
 void CompilationInfoDumper::DumpPtxCodeByGroupIndex(
-    const std::string& source_ptx, const int gidx) {
+    const std::string& source_ptx, const int gidx, const int device_id) {
   if (FLAGS_cinn_dump_group_ptx.empty()) {
     return;
   }
-  Dump(FLAGS_cinn_dump_group_ptx, gidx, "source_ptx.ptx", source_ptx);
+  Dump(
+      FLAGS_cinn_dump_group_ptx, gidx, device_id, "source_ptx.ptx", source_ptx);
 }
 
 void CompilationInfoDumper::DumpInstructionByGroupIndex(
     const std::unique_ptr<cinn::hlir::framework::Instruction>& instr,
-    const int gidx) {
+    const int gidx,
+    const int device_id) {
   if (FLAGS_cinn_dump_group_instruction.empty() || instr.get() == nullptr) {
     return;
   }
   Dump(FLAGS_cinn_dump_group_instruction,
        gidx,
+       device_id,
        "instruction.txt",
        instr->DumpInstruction());
 }
@@ -99,6 +107,7 @@ void CompilationInfoDumper::DumpLoweredFunc() {
     }
     Dump(FLAGS_cinn_dump_group_lowered_func,
          idx,
+         device_id_,
          "lowered_function.txt",
          content.str());
   }
@@ -115,7 +124,11 @@ void CompilationInfoDumper::DumpSourceCode() {
     } else {
       dump_str = "[No source code generated]\n\n" + info_.Message(idx);
     }
-    Dump(FLAGS_cinn_dump_group_source_code, idx, "source_code.cu", dump_str);
+    Dump(FLAGS_cinn_dump_group_source_code,
+         idx,
+         device_id_,
+         "source_code.cu",
+         dump_str);
   }
 }
 
@@ -130,7 +143,8 @@ void CompilationInfoDumper::DumpPtxCode() {
     } else {
       dump_str = "[No source ptxs generated]\n\n" + info_.Message(idx);
     }
-    Dump(FLAGS_cinn_dump_group_ptx, idx, "source_ptx.ptx", dump_str);
+    Dump(
+        FLAGS_cinn_dump_group_ptx, idx, device_id_, "source_ptx.ptx", dump_str);
   }
 }
 
@@ -145,16 +159,21 @@ void CompilationInfoDumper::DumpInstruction() {
     } else {
       dump_str = "[No instruction generated]\n\n" + info_.Message(idx);
     }
-    Dump(FLAGS_cinn_dump_group_instruction, idx, "instruction.txt", dump_str);
+    Dump(FLAGS_cinn_dump_group_instruction,
+         idx,
+         device_id_,
+         "instruction.txt",
+         dump_str);
   }
 }
 
 void CompilationInfoDumper::Dump(const std::string& base_path,
                                  const int idx,
+                                 const int device_id,
                                  const std::string& file_name,
                                  const std::string& content) {
-  auto dump_path =
-      utils::StringFormat("%s/fusion_group_%d", base_path.c_str(), idx);
+  auto dump_path = utils::StringFormat(
+      "%s/device_%d/fusion_group_%d", base_path.c_str(), device_id, idx);
   if (!hlir::framework::MakeDirectory(
           dump_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
     LOG(WARNING) << "Failed to make directory: \"" << dump_path
diff --git a/paddle/cinn/backends/compiler.h b/paddle/cinn/backends/compiler.h
index 8b09573b522e4..a468193d4d85a 100644
--- a/paddle/cinn/backends/compiler.h
+++ b/paddle/cinn/backends/compiler.h
@@ -43,8 +43,9 @@ namespace backends {
  */
 class CompilationInfoDumper {
  public:
-  explicit CompilationInfoDumper(const hlir::framework::CompilationResult& info)
-      : info_(info) {
+  explicit CompilationInfoDumper(const hlir::framework::CompilationResult& info,
+                                 const int device_id)
+      : info_(info), device_id_(device_id) {
     DumpLoweredFunc();
     DumpSourceCode();
     DumpPtxCode();
@@ -52,14 +53,18 @@ class CompilationInfoDumper {
   }
 
   static void DumpLoweredFuncByGroupIndex(const ir::LoweredFunc& lowered_func,
-                                          const int gidx);
+                                          const int gidx,
+                                          const int device_id);
   static void DumpSourceCodeByGroupIndex(const std::string& source_code,
-                                         const int gidx);
+                                         const int gidx,
+                                         const int device_id);
   static void DumpPtxCodeByGroupIndex(const std::string& source_ptx,
-                                      const int gidx);
+                                      const int gidx,
+                                      const int device_id);
   static void DumpInstructionByGroupIndex(
       const std::unique_ptr<cinn::hlir::framework::Instruction>& instr,
-      const int gidx);
+      const int gidx,
+      const int device_id);
 
  private:
   void DumpLoweredFunc();
@@ -68,10 +73,12 @@ class CompilationInfoDumper {
   void DumpInstruction();
   static void Dump(const std::string& base_path,
                    const int idx,
+                   const int device_id,
                    const std::string& file_name,
                    const std::string& content);
 
   const hlir::framework::CompilationResult& info_;
+  const int device_id_;
 };
 
 class SourceCodePrint {
diff --git a/paddle/cinn/hlir/framework/graph.cc b/paddle/cinn/hlir/framework/graph.cc
index 3f81b8b91906d..4c8d166e4cc4a 100644
--- a/paddle/cinn/hlir/framework/graph.cc
+++ b/paddle/cinn/hlir/framework/graph.cc
@@ -18,6 +18,9 @@
 #include <sstream>
 
 #include "paddle/cinn/hlir/framework/visualize_helper.h"
+#ifdef CINN_WITH_CUDA
+#include "paddle/cinn/runtime/cuda/cuda_util.h"
+#endif
 #include "paddle/cinn/runtime/flags.h"
 #include "paddle/cinn/utils/string.h"
 
@@ -315,9 +318,14 @@ void Graph::VisualizeGroupedGraph(
   const auto& group_dots = VisualizeGroups(groups, fetch_var_ids);
   for (int idx = 0; idx < groups.size(); ++idx) {
     // Create fusion_group_x folder
+    int device_id = 0;
+#ifdef CINN_WITH_CUDA
+    cudaGetDevice(&device_id);
+#endif
     auto group_path =
-        utils::StringFormat("%s/fusion_group_%d",
+        utils::StringFormat("%s/device_%d/fusion_group_%d",
                             FLAGS_cinn_fusion_groups_graphviz_dir.c_str(),
+                            device_id,
                             idx);
     if (!MakeDirectory(group_path,
                        S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
diff --git a/paddle/cinn/hlir/framework/parallel_compiler.cc b/paddle/cinn/hlir/framework/parallel_compiler.cc
index bae6048477623..3a15f7c42bef0 100644
--- a/paddle/cinn/hlir/framework/parallel_compiler.cc
+++ b/paddle/cinn/hlir/framework/parallel_compiler.cc
@@ -80,8 +80,13 @@ void ParallelCompiler::SplitTask() {
   CHECK(context_->lowered_funcs.empty() ||
         context_->graph->fusion_groups.size() ==
             context_->lowered_funcs.size());
-  for (int i = 0; i < context_->graph->fusion_groups.size(); ++i) {
-    tasks_.emplace_back(i, this, context_);
+  int device_id = 0;
+#ifdef CINN_WITH_CUDA
+  CUDA_CALL(cudaGetDevice(&device_id));
+#endif
+  for (int group_id = 0; group_id < context_->graph->fusion_groups.size();
+       ++group_id) {
+    tasks_.emplace_back(device_id, group_id, this, context_);
   }
 }
 
@@ -126,11 +131,20 @@ void ParallelCompiler::RunTask() {
 }
 
 void ParallelCompiler::LaunchTask() {
+  int device_id = 0;
+#ifdef CINN_WITH_CUDA
+  CUDA_CALL(cudaGetDevice(&device_id));
+#endif
+  int num_threads = FLAGS_cinn_parallel_compile_thread;
+#if defined(PADDLE_WITH_DISTRIBUTE)
+  if (device_id > 0) {
+    num_threads = 1;
+  }
+#endif
   // multi thread compilation
   std::vector<std::thread> threads;
-  VLOG(4) << "Compile with " << FLAGS_cinn_parallel_compile_thread
-          << " threads";
-  for (int idx = 1; idx < FLAGS_cinn_parallel_compile_thread; ++idx) {
+  VLOG(4) << "Compile with " << num_threads << " threads";
+  for (int idx = 1; idx < num_threads; ++idx) {
     threads.emplace_back(&ParallelCompiler::RunTask, this);
   }
 
@@ -208,7 +222,7 @@ void ParallelCompiler::Task::Lowering() {
     pcompiler->result_.SetLoweredFuncs(group_id, lowered_funcs);
   }
   backends::CompilationInfoDumper::DumpLoweredFuncByGroupIndex(
-      pcompiler->result_.LoweredFuncs(group_id).front(), group_id);
+      pcompiler->result_.LoweredFuncs(group_id).front(), group_id, device_id);
 }
 
 void ParallelCompiler::Task::CodegenAndJit() {
@@ -239,8 +253,8 @@ void ParallelCompiler::Task::CodegenAndJit() {
     }
     CHECK(!cuda_c.empty()) << "Compile CUDA C code failed from device module:\n"
                            << dmodule;
-    backends::CompilationInfoDumper::DumpSourceCodeByGroupIndex(cuda_c,
-                                                                group_id);
+    backends::CompilationInfoDumper::DumpSourceCodeByGroupIndex(
+        cuda_c, group_id, device_id);
     pcompiler->result_.SetSourceCode(group_id, cuda_c);
 
     cinn::backends::SourceCodePrint::GetInstance()->write(cuda_c);
@@ -249,7 +263,8 @@ void ParallelCompiler::Task::CodegenAndJit() {
     backends::nvrtc::Compiler compiler;
     auto ptx = compiler(cuda_c);
     CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n" << cuda_c;
-    backends::CompilationInfoDumper::DumpPtxCodeByGroupIndex(ptx, group_id);
+    backends::CompilationInfoDumper::DumpPtxCodeByGroupIndex(
+        ptx, group_id, device_id);
     pcompiler->result_.SetSourcePtx(group_id, ptx);
     // load cumodule
     cumodule = std::make_unique<CUDAModule>(ptx,
@@ -260,7 +275,7 @@ void ParallelCompiler::Task::CodegenAndJit() {
     // register kernel
     backends::RuntimeSymbols symbols;
     for (auto& fn : dmodule.functions()) {
-      auto cufunc = cumodule->GetFunction(0, fn->name);
+      auto cufunc = cumodule->GetFunction(device_id, fn->name);
       CHECK(cufunc);
       symbols.RegisterVar(fn->name + "_ptr_", reinterpret_cast<void*>(cufunc));
     }
@@ -291,7 +306,8 @@ void ParallelCompiler::Task::BuildInstruction() {
   instr->SetLoweredFunc(reinterpret_cast<void*>(fn_ptr), group->GetFuncName());
 
   instr->Finalize();
-  backends::CompilationInfoDumper::DumpInstructionByGroupIndex(instr, group_id);
+  backends::CompilationInfoDumper::DumpInstructionByGroupIndex(
+      instr, group_id, device_id);
   pcompiler->result_.SetInstruction(group_id, std::move(instr));
 }
 
diff --git a/paddle/cinn/hlir/framework/parallel_compiler.h b/paddle/cinn/hlir/framework/parallel_compiler.h
index e78ee99404867..df0d39ebe2afc 100644
--- a/paddle/cinn/hlir/framework/parallel_compiler.h
+++ b/paddle/cinn/hlir/framework/parallel_compiler.h
@@ -36,8 +36,14 @@ namespace framework {
 class ParallelCompiler {
  public:
   struct Task {
-    Task(int group_id, ParallelCompiler* compiler, CompilationContext* context)
-        : group_id(group_id), pcompiler(compiler), context(context) {}
+    Task(int device_id,
+         int group_id,
+         ParallelCompiler* compiler,
+         CompilationContext* context)
+        : device_id(device_id),
+          group_id(group_id),
+          pcompiler(compiler),
+          context(context) {}
     void Lowering();
     void CodegenAndJit();
     void BuildInstruction();
@@ -48,6 +54,7 @@ class ParallelCompiler {
     CompilationStatus status = CompilationStatus::SUCCESS;
     std::string message;
 
+    const int device_id;
     int group_id;
 
     std::unique_ptr<backends::ExecutionEngine> engine;

From be463d319530ec7ae1b5d4d5ecb7f1d3d0dbb445 Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Thu, 21 Sep 2023 14:17:21 +0800
Subject: [PATCH 031/115] [PIR]add all add ,  mul newir optest (#57533)

* add all add mul newir optest

* add sub optest

* delete sub
---
 test/legacy_test/test_elementwise_add_op.py | 21 +++--------
 test/legacy_test/test_elementwise_mul_op.py | 42 ++++++---------------
 2 files changed, 17 insertions(+), 46 deletions(-)

diff --git a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py
index 279d1997f160e..8bacfc9a45cfd 100644
--- a/test/legacy_test/test_elementwise_add_op.py
+++ b/test/legacy_test/test_elementwise_add_op.py
@@ -212,7 +212,7 @@ def setUp(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_new_ir=True)
 
     def test_check_grad_normal(self):
         place = core.CUDAPlace(0)
@@ -738,27 +738,16 @@ def init_input_output(self):
         self.out = self.x + self.y
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=False)
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-        )
+        self.check_grad(['X', 'Y'], 'Out', check_new_ir=False)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-        )
+        self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=False)
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-        )
+        self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=False)
 
 
 class TestRealComplexElementwiseAddOp(TestComplexElementwiseAddOp):
diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py
index 86f4e764916e0..8013eb0baaf15 100644
--- a/test/legacy_test/test_elementwise_mul_op.py
+++ b/test/legacy_test/test_elementwise_mul_op.py
@@ -128,24 +128,13 @@ def if_enable_cinn(self):
         self.enable_cinn = False
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-        )
+        self.check_grad(['X', 'Y'], 'Out', check_new_ir=False)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-        )
+        self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=False)
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-        )
+        self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=False)
 
 
 class TestElementwiseMulOp_ZeroDim1(ElementwiseMulOp):
@@ -196,7 +185,7 @@ def setUp(self):
         self.if_enable_cinn()
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
         self.check_grad(['X', 'Y'], 'Out', check_prim=True, check_new_ir=True)
@@ -274,6 +263,7 @@ def test_check_output(self):
         self.check_output(
             check_dygraph=self.check_dygraph,
             check_prim=self.check_prim,
+            check_new_ir=self.check_dygraph,
         )
 
     def test_check_grad_normal(self):
@@ -282,6 +272,7 @@ def test_check_grad_normal(self):
             'Out',
             check_dygraph=self.check_dygraph,
             check_prim=self.check_prim,
+            check_new_ir=self.check_dygraph,
         )
 
     def test_check_grad_ingore_x(self):
@@ -291,6 +282,7 @@ def test_check_grad_ingore_x(self):
             no_grad_set=set("X"),
             check_dygraph=self.check_dygraph,
             check_prim=self.check_prim,
+            check_new_ir=self.check_dygraph,
         )
 
     def test_check_grad_ingore_y(self):
@@ -300,6 +292,7 @@ def test_check_grad_ingore_y(self):
             no_grad_set=set('Y'),
             check_dygraph=self.check_dygraph,
             check_prim=self.check_prim,
+            check_new_ir=self.check_dygraph,
         )
 
     def init_input_attr_output(self):
@@ -527,27 +520,16 @@ def init_input_output(self):
         self.out = self.x * self.y
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=False)
 
     def test_check_grad_normal(self):
-        self.check_grad(
-            ['X', 'Y'],
-            'Out',
-        )
+        self.check_grad(['X', 'Y'], 'Out', check_new_ir=False)
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set("X"),
-        )
+        self.check_grad(['Y'], 'Out', no_grad_set=set("X"), check_new_ir=False)
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            no_grad_set=set('Y'),
-        )
+        self.check_grad(['X'], 'Out', no_grad_set=set('Y'), check_new_ir=False)
 
 
 class TestRealComplexElementwiseMulOp(TestComplexElementwiseMulOp):

From 6d9d73a230d65c871da4487c30a5c82558056833 Mon Sep 17 00:00:00 2001
From: Ruibin Cheung <beinggod@foxmail.com>
Date: Thu, 21 Sep 2023 14:35:35 +0800
Subject: [PATCH 032/115] [Custom Device] change the dlopen flag of custom
 device dylibs (#57544)

---
 paddle/fluid/platform/init.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index eae360c146df5..a3fff528f7903 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -142,7 +142,7 @@ void LoadCustomDevice(const std::string &library_dir) {
   LOG(INFO) << "Try loading custom device libs from: [" << library_dir << "]";
   std::vector<std::string> libs = phi::ListAllLibraries(library_dir);
   for (const auto &lib_path : libs) {
-    auto dso_handle = dlopen(lib_path.c_str(), RTLD_NOW);
+    auto dso_handle = dlopen(lib_path.c_str(), RTLD_LAZY);
     PADDLE_ENFORCE_NOT_NULL(
         dso_handle,
         platform::errors::InvalidArgument(

From c5a70065ac0baa817903595749e5b5e425bccc1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= <Xs1580802568@gmail.com>
Date: Thu, 21 Sep 2023 14:46:08 +0800
Subject: [PATCH 033/115] move ir_nodes_collector from namespace optim to
 ir_utils (#57535)

---
 paddle/cinn/ast_gen_ius/tensor_group.cc       |   2 +-
 .../cinn/auto_schedule/analysis/analyze_ir.cc |  37 ++--
 .../search_space/auto_gen_rule/auto_bind.cc   |   4 +-
 .../search_space/auto_gen_rule/auto_inline.cc |  16 +-
 .../search_space/auto_gen_rule/auto_unroll.cc |   2 +-
 paddle/cinn/backends/codegen_cuda_dev.cc      |   2 +-
 paddle/cinn/backends/llvm/codegen_x86.cc      |   2 +-
 paddle/cinn/common/arithmatic.cc              |   4 +-
 paddle/cinn/common/cas.cc                     |   2 +-
 paddle/cinn/common/ir_util.cc                 |  10 +-
 .../cinn/hlir/framework/op_lowering_util.cc   |   7 +-
 paddle/cinn/hlir/pe/ir_schedule_pe.cc         |   6 +-
 paddle/cinn/ir/ir.cc                          |   2 +-
 paddle/cinn/ir/lowered_func.cc                |  14 +-
 paddle/cinn/ir/schedule/ir_schedule.cc        |  61 ++++---
 paddle/cinn/ir/schedule/ir_schedule_util.cc   | 172 +++++++++---------
 paddle/cinn/ir/tensor.cc                      |   6 +-
 paddle/cinn/ir/test/collect_ir_nodes_test.cc  |   3 +-
 paddle/cinn/ir/utils/ir_nodes_collector.cc    |   4 +-
 paddle/cinn/ir/utils/ir_nodes_collector.h     |   4 +-
 paddle/cinn/lang/lower.cc                     |  56 +++---
 paddle/cinn/lang/lower_impl.cc                |  19 +-
 paddle/cinn/lang/lower_tensor_group.cc        |   4 +-
 paddle/cinn/optim/buffer_assign.cc            |   2 +-
 paddle/cinn/optim/compute_inline_expand.cc    |   9 +-
 .../optim/eliminate_broadcast_in_forloop.cc   |   8 +-
 paddle/cinn/optim/transform_gpu_forloop.cc    |   2 +-
 paddle/cinn/optim/vectorize_loops.cc          |   8 +-
 paddle/cinn/poly/domain.cc                    |   4 +-
 paddle/cinn/poly/stage.cc                     |  15 +-
 30 files changed, 253 insertions(+), 234 deletions(-)

diff --git a/paddle/cinn/ast_gen_ius/tensor_group.cc b/paddle/cinn/ast_gen_ius/tensor_group.cc
index 2b604f2c383cb..e8b9c6a345e72 100644
--- a/paddle/cinn/ast_gen_ius/tensor_group.cc
+++ b/paddle/cinn/ast_gen_ius/tensor_group.cc
@@ -30,7 +30,7 @@ TensorGroup::TensorGroup(const std::vector<ir::Tensor>& tensors) {
 
   for (auto& tensor : tensors) {
     output_tensor_names_.insert(tensor->name);
-    std::set<ir::Expr> used_tensors = ir::CollectIRNodes(
+    std::set<ir::Expr> used_tensors = ir::ir_utils::CollectIRNodes(
         tensor->body(), [](const Expr* x) { return x->as_tensor(); });
     for (const Expr& x : used_tensors) {
       const ir::Tensor to_dep = x.as_tensor_ref();
diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
index 17aad495b246a..da2c063d9c00d 100644
--- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
+++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
@@ -54,29 +54,30 @@ void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) {
     return;
   }
 
-  ir::CollectIRNodesWithoutTensor(sche_block->body, [&](const Expr* x) {
-    const ir::Load* load_expr = x->As<ir::Load>();
-    if (load_expr != nullptr) {
-      const ir::Tensor t = load_expr->tensor.as_tensor_ref();
-      sche_block->read_buffers.emplace_back(
-          ir::BufferRange(t->buffer, IndicesToVars(load_expr->indices)));
-      return false;
-    }
-    const ir::Store* store_expr = x->As<ir::Store>();
-    if (store_expr != nullptr) {
-      const ir::Tensor t = store_expr->tensor.as_tensor_ref();
-      sche_block->write_buffers.emplace_back(
-          ir::BufferRange(t->buffer, IndicesToVars(store_expr->indices)));
-      return false;
-    }
-    return false;
-  });
+  ir::ir_utils::CollectIRNodesWithoutTensor(
+      sche_block->body, [&](const Expr* x) {
+        const ir::Load* load_expr = x->As<ir::Load>();
+        if (load_expr != nullptr) {
+          const ir::Tensor t = load_expr->tensor.as_tensor_ref();
+          sche_block->read_buffers.emplace_back(
+              ir::BufferRange(t->buffer, IndicesToVars(load_expr->indices)));
+          return false;
+        }
+        const ir::Store* store_expr = x->As<ir::Store>();
+        if (store_expr != nullptr) {
+          const ir::Tensor t = store_expr->tensor.as_tensor_ref();
+          sche_block->write_buffers.emplace_back(
+              ir::BufferRange(t->buffer, IndicesToVars(store_expr->indices)));
+          return false;
+        }
+        return false;
+      });
 }
 
 bool ContainsNodeType(ir::Expr expr,
                       const std::unordered_set<ir::IrNodeTy>& node_types) {
   std::set<ir::Expr> collection =
-      ir::CollectIRNodesWithoutTensor(expr, [&](const Expr* x) {
+      ir::ir_utils::CollectIRNodesWithoutTensor(expr, [&](const Expr* x) {
         return node_types.find(x->node_type()) != node_types.end();
       });
   return !collection.empty();
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
index 06215d98d8b27..62c92c9e38fca 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
@@ -31,7 +31,7 @@ bool IsSpatialLoop(const ir::For* for_node) {
   const auto& loop_var = for_node->loop_var;
   // collect cases where the loop_var used in one of reduce axis in underneath
   // ScheduleBlock
-  auto used_for_reduce_axis = ir::CollectIRNodesWithoutTensor(
+  auto used_for_reduce_axis = ir::ir_utils::CollectIRNodesWithoutTensor(
       for_node->body, [&loop_var](const Expr* x) {
         const auto* block_realize = x->As<ir::ScheduleBlockRealize>();
         if (!block_realize) return false;
@@ -46,7 +46,7 @@ bool IsSpatialLoop(const ir::For* for_node) {
           const ir::Expr& binding = block_realize->iter_values[i];
           if (iter_var->is_reduce_axis ||
               iter_var->name.substr(0, 6) == "reduce") {
-            auto used_exprs = ir::CollectIRNodesWithoutTensor(
+            auto used_exprs = ir::ir_utils::CollectIRNodesWithoutTensor(
                 binding, [&loop_var](const Expr* x) {
                   const ir::_Var_* var = x->As<ir::_Var_>();
                   if (var &&
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc
index 946947611f35d..16eca6d677b89 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc
@@ -49,7 +49,7 @@ bool AutoInline::CanInlineIntoConsumer(const Expr& sche_block_realize_expr,
   ir::Expr root = ir_sch->GetRootBlock(sche_block_realize_expr);
 
   // Check the schedule block to be inlined is not a reduce tensor.
-  std::set<ir::Expr> find_store = ir::CollectIRNodesWithoutTensor(
+  std::set<ir::Expr> find_store = ir::ir_utils::CollectIRNodesWithoutTensor(
       compute_body, [&](const Expr* x) { return x->As<ir::Store>(); });
   if (find_store.size() != 1UL) {
     return false;
@@ -76,17 +76,19 @@ bool AutoInline::CanInlineIntoConsumer(const Expr& sche_block_realize_expr,
   }
 
   // Check this schedule block is the only writer of the tensor.
-  find_store = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
-    return x->As<ir::Store>() &&
-           (x->As<ir::Store>()->tensor).as_tensor_ref()->name == tensor->name;
-  });
+  find_store =
+      ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
+        return x->As<ir::Store>() &&
+               (x->As<ir::Store>()->tensor).as_tensor_ref()->name ==
+                   tensor->name;
+      });
   if (find_store.size() != 1UL) {
     return false;
   }
   // Check there is no overlap between the buffers the schedule block reads and
   // writes.
-  std::set<ir::Expr> find_load =
-      ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) {
+  std::set<ir::Expr> find_load = ir::ir_utils::CollectIRNodesWithoutTensor(
+      compute_body, [&](const Expr* x) {
         return x->As<ir::Load>() && x->As<ir::Load>()->tensor == tensor_expr;
       });
   if (!find_load.empty()) {
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc
index 946bd9e9d7730..000203306c1a1 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc
@@ -56,7 +56,7 @@ bool AutoUnroll::MeetCondition(const ir::ScheduleBlock* schedule_block) const {
     return false;
   };
 
-  auto find_target_exprs = ir::CollectIRNodesWithoutTensor(
+  auto find_target_exprs = ir::ir_utils::CollectIRNodesWithoutTensor(
       schedule_block->body,
       [&has_reduce_iter, &has_nonserial_loop](const Expr* x) {
         return has_reduce_iter(x) || has_nonserial_loop(x);
diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc
index e33154f0c0129..1f6f5bba154aa 100644
--- a/paddle/cinn/backends/codegen_cuda_dev.cc
+++ b/paddle/cinn/backends/codegen_cuda_dev.cc
@@ -90,7 +90,7 @@ std::vector<Expr> CodeGenCUDA_Dev::GenerateBufferAliasExprs(
                                        temp_buffers.end());
   // prepare temp buffer alias
   std::vector<Expr> buffer_alias;
-  auto tensors = ir::CollectIRNodes(op->body, [&](const Expr *x) {
+  auto tensors = ir::ir_utils::CollectIRNodes(op->body, [&](const Expr *x) {
     return x->as_tensor() && x->as_tensor()->buffer.defined() &&
            temp_buffer_set.count(x->as_tensor()->buffer);
   });
diff --git a/paddle/cinn/backends/llvm/codegen_x86.cc b/paddle/cinn/backends/llvm/codegen_x86.cc
index ccae02ac5746b..9de0603e2c9e2 100644
--- a/paddle/cinn/backends/llvm/codegen_x86.cc
+++ b/paddle/cinn/backends/llvm/codegen_x86.cc
@@ -98,7 +98,7 @@ void CodeGenX86::CreateParallelLaunch(Expr body, int num_task) {
                                              llvm::Function::PrivateLinkage,
                                              "__parallel_lambda",
                                              m_);
-  std::vector<std::string> vars = ir::CollectUndefinedVars(&body);
+  std::vector<std::string> vars = ir::ir_utils::CollectUndefinedVars(&body);
   uint64_t nbytes;
   auto* data = PackVars(vars, &nbytes);
 
diff --git a/paddle/cinn/common/arithmatic.cc b/paddle/cinn/common/arithmatic.cc
index 16b1d9cb8e8a5..af6656317aa11 100644
--- a/paddle/cinn/common/arithmatic.cc
+++ b/paddle/cinn/common/arithmatic.cc
@@ -126,7 +126,7 @@ GiNaC::ex ExprToGinacConverter::BuildHelper(ir::Expr expr) {
 
 GiNaC::ex ExprToGinacConverter::operator()(Expr expr) {
   // TODO(Superjomn) Replace this with common::IsPureMath(
-  auto complex_nodes = CollectIRNodes(expr, [](const Expr* n) {
+  auto complex_nodes = ir::ir_utils::CollectIRNodes(expr, [](const Expr* n) {
     return n->As<Block>() ||    //
            n->As<PolyFor>() ||  //
            n->As<EQ>() ||       //
@@ -262,7 +262,7 @@ bool IsPureMath(Expr expr) {
       IrNodeTy ::Minus,
   });
 
-  auto complex_nodes = ir::CollectIRNodes(expr, [&](const Expr* n) {
+  auto complex_nodes = ir::ir_utils::CollectIRNodes(expr, [&](const Expr* n) {
     return !valid_node_tys.count(n->node_type());
   });
 #ifdef CINN_DEBUG
diff --git a/paddle/cinn/common/cas.cc b/paddle/cinn/common/cas.cc
index 6264c5b12d453..bf1c9092ed5eb 100644
--- a/paddle/cinn/common/cas.cc
+++ b/paddle/cinn/common/cas.cc
@@ -1868,7 +1868,7 @@ bool IsExprCasCompatible(Expr expr) {
     return expr->As<Add>() || expr->As<Sub>() || expr->As<Mul>() ||
            expr->As<Div>();
   };
-  return ir::CollectIRNodes(expr, teller).empty();
+  return ir::ir_utils::CollectIRNodes(expr, teller).empty();
 }
 
 // Partially divide a by b. e.g. (2x+y)/2 => x + y/2
diff --git a/paddle/cinn/common/ir_util.cc b/paddle/cinn/common/ir_util.cc
index f0f219ee105f7..4f000af1e8f0d 100644
--- a/paddle/cinn/common/ir_util.cc
+++ b/paddle/cinn/common/ir_util.cc
@@ -249,8 +249,8 @@ Expr or_all(const std::vector<Expr> &conds) {
 }
 
 void CheckTensorUniqueInExpr(Expr expr) {
-  auto tensor_uniq =
-      ir::CollectIRNodes(expr, [](const Expr *x) { return x->as_tensor(); });
+  auto tensor_uniq = ir::ir_utils::CollectIRNodes(
+      expr, [](const Expr *x) { return x->as_tensor(); });
   absl::flat_hash_map<std::string, const ir::_Tensor_ *> tensor_names;
   for (auto &t : tensor_uniq) {
     auto *tp = t.as_tensor();
@@ -269,9 +269,9 @@ void CheckBufferUniqueInExpr(Expr expr) {
   // the buffers exists in tensor and lowered functions.
   CheckTensorUniqueInExpr(expr);
 
-  auto tensors =
-      ir::CollectIRNodes(expr, [](const Expr *x) { return x->as_tensor(); });
-  auto funcs = ir::CollectIRNodes(
+  auto tensors = ir::ir_utils::CollectIRNodes(
+      expr, [](const Expr *x) { return x->as_tensor(); });
+  auto funcs = ir::ir_utils::CollectIRNodes(
       expr, [](const Expr *x) { return x->as_lowered_func(); });
 
   absl::flat_hash_map<std::string, const ir::_Buffer_ *> buffer_name;
diff --git a/paddle/cinn/hlir/framework/op_lowering_util.cc b/paddle/cinn/hlir/framework/op_lowering_util.cc
index e7a4412202d87..1af9ef0576351 100644
--- a/paddle/cinn/hlir/framework/op_lowering_util.cc
+++ b/paddle/cinn/hlir/framework/op_lowering_util.cc
@@ -1046,7 +1046,7 @@ void LoopAssignReduce(
     auto first_reduce_loop = rloops.front();
     // collect if
     auto if_checker = [](const Expr* x) { return x->As<ir::IfThenElse>(); };
-    auto if_set = ir::CollectIRNodesWithoutTensor(
+    auto if_set = ir::ir_utils::CollectIRNodesWithoutTensor(
         first_reduce_loop.As<ir::For>()->body, if_checker);
     std::string reduce_block_name = reducer_data->id();
     for (auto if_expr : if_set) {
@@ -1056,10 +1056,11 @@ void LoopAssignReduce(
                        ->schedule_block.As<ir::ScheduleBlock>()
                        ->name == reduce_block_name;
       };
-      auto blocks_in_if = ir::CollectIRNodesWithoutTensor(if_expr, checker);
+      auto blocks_in_if =
+          ir::ir_utils::CollectIRNodesWithoutTensor(if_expr, checker);
       if (!blocks_in_if.empty()) {
         ir::Expr condition = if_expr.As<ir::IfThenElse>()->condition;
-        auto indices_in_if = ir::CollectIRNodesWithoutTensor(
+        auto indices_in_if = ir::ir_utils::CollectIRNodesWithoutTensor(
             condition, [](const Expr* x) { return x->As<ir::_Var_>(); });
         for (int i = 0; i < rloops.size(); ++i) {
           std::string var_name = rloops[i].As<ir::For>()->loop_var->name;
diff --git a/paddle/cinn/hlir/pe/ir_schedule_pe.cc b/paddle/cinn/hlir/pe/ir_schedule_pe.cc
index 3677025aaedaa..6600905b083c1 100644
--- a/paddle/cinn/hlir/pe/ir_schedule_pe.cc
+++ b/paddle/cinn/hlir/pe/ir_schedule_pe.cc
@@ -633,7 +633,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
     // simplify reshape index
     auto hand_write_simplify = [](std::vector<ir::Expr> loops, ir::Expr block) {
       // check exist select.
-      auto find_select = ir::CollectIRNodesInOrder(
+      auto find_select = ir::ir_utils::CollectIRNodesInOrder(
           block, [&](const Expr *x) { return x->As<ir::Select>(); });
       if (find_select.size() > 0) {
         return;
@@ -667,7 +667,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
         index = index + ir::Expr(schedule_block->iter_vars[idx]) * stride;
       }
 
-      auto exprs = ir::CollectIRNodesInOrder(
+      auto exprs = ir::ir_utils::CollectIRNodesInOrder(
           block, [&](const Expr *x) { return x->As<ir::Load>(); });
       CHECK_EQ(exprs.size(), 1);
       auto load = exprs.front().As<ir::Load>();
@@ -709,7 +709,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
       break;
     }
 
-    auto exprs = ir::CollectIRNodesInOrder(
+    auto exprs = ir::ir_utils::CollectIRNodesInOrder(
         block, [&](const Expr *x) { return x->As<ir::Load>(); });
     for (auto expr : exprs) {
       auto load = expr.As<ir::Load>();
diff --git a/paddle/cinn/ir/ir.cc b/paddle/cinn/ir/ir.cc
index 5427a14afa5ba..7911f3ea14bba 100644
--- a/paddle/cinn/ir/ir.cc
+++ b/paddle/cinn/ir/ir.cc
@@ -535,7 +535,7 @@ std::vector<const Expr *> PolyFor::expr_fields() const {
 }
 
 Expr PolyFor::ExtractExtent() const {
-  auto nodes = CollectIRNodes(condition, [&](const Expr *e) {
+  auto nodes = ir::ir_utils::CollectIRNodes(condition, [&](const Expr *e) {
     return e->As<NE>() ||   //
            e->As<EQ>() ||   //
            e->As<Min>() ||  //
diff --git a/paddle/cinn/ir/lowered_func.cc b/paddle/cinn/ir/lowered_func.cc
index 5a897e7c334a5..ec5f4b2e64ce6 100644
--- a/paddle/cinn/ir/lowered_func.cc
+++ b/paddle/cinn/ir/lowered_func.cc
@@ -82,7 +82,7 @@ std::vector<const Expr*> _LoweredFunc_::expr_fields() const { return {&body}; }
 
 void _LoweredFunc_::PrepareCudaAxisInfoFromBody() {
   std::set<Expr> bound_for_exprs =
-      ir::CollectIRNodes(body, [](const Expr* expr) {
+      ir::ir_utils::CollectIRNodes(body, [](const Expr* expr) {
         const ir::For* for_expr = expr->As<ir::For>();
         return for_expr != nullptr && for_expr->is_binded();
       });
@@ -208,7 +208,7 @@ void _LoweredFunc_::AllocTempBuffer() {}
 void _LoweredFunc_::PrepareBufferCastExprs(bool with_expr_gen_tensor) {
   buffer_data_cast_exprs.clear();
   // collect write.
-  auto write_teller = ir::CollectTensorNeedsWrite(&body);
+  auto write_teller = ir::ir_utils::CollectTensorNeedsWrite(&body);
 
   auto tensors = CollectAllTensorReference(with_expr_gen_tensor);
   std::sort(tensors.begin(),
@@ -248,7 +248,7 @@ std::vector<Expr> _LoweredFunc_::CudaAliasVarExprs() const {
   }
   // collect write.
   std::vector<Expr> res;
-  auto write_teller = ir::CollectTensorNeedsWrite(&body);
+  auto write_teller = ir::ir_utils::CollectTensorNeedsWrite(&body);
 
   auto tensors = CollectAllTensorReference();
   std::sort(tensors.begin(),
@@ -403,11 +403,11 @@ std::vector<Tensor> _LoweredFunc_::CollectAllTensorReference(
     bool with_expr_gen_tensor) const {
   std::set<Expr> tensor_exprs =
       with_expr_gen_tensor
-          ? ir::CollectIRNodes(
+          ? ir::ir_utils::CollectIRNodes(
                 body, [](const Expr* expr) { return expr->As<ir::_Tensor_>(); })
-          : ir::CollectIRNodesWithoutTensor(body, [](const Expr* expr) {
-              return expr->As<ir::_Tensor_>();
-            });
+          : ir::ir_utils::CollectIRNodesWithoutTensor(
+                body,
+                [](const Expr* expr) { return expr->As<ir::_Tensor_>(); });
 
   std::vector<Tensor> tensors;
   // remove the duplicate tensor by their name.
diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc
index 78ce98564dbdc..fab8a53deb121 100644
--- a/paddle/cinn/ir/schedule/ir_schedule.cc
+++ b/paddle/cinn/ir/schedule/ir_schedule.cc
@@ -767,7 +767,7 @@ struct CacheWriteRewriter : public ir::IRMutator<> {
     rewriter(&info->cache_block);
     rewriter.mutate_cache_block = false;
     rewriter(&new_root);
-    auto find_tensor = ir::CollectIRNodesWithoutTensor(
+    auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor(
         new_root,
         [&](const Expr* x) {
           return x->As<Store>() &&
@@ -775,7 +775,7 @@ struct CacheWriteRewriter : public ir::IRMutator<> {
         },
         true);
     if (!find_tensor.empty()) {
-      auto find_store = ir::CollectIRNodesWithoutTensor(
+      auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor(
           (*find_tensor.begin()), [&](const Expr* x) {
             return x->As<Load>() &&
                    (x->As<Load>()->tensor == Expr(info->write_tensor));
@@ -864,7 +864,7 @@ struct ChangeBodyToBlock : public ir::IRMutator<> {
 
 DeviceAPI ScheduleImpl::GetDeviceAPI() const {
   auto exprs = this->GetModule().GetExprs();
-  auto find_for_nodes = ir::CollectIRNodesWithoutTensor(
+  auto find_for_nodes = ir::ir_utils::CollectIRNodesWithoutTensor(
       exprs.front(), [&](const Expr* x) { return x->As<ir::For>(); }, true);
   CHECK(!find_for_nodes.empty());
   return (*find_for_nodes.begin()).As<ir::For>()->device_api;
@@ -925,7 +925,7 @@ Expr ScheduleImpl::CacheWrite(const Expr& block,
           ->schedule_block.As<ScheduleBlock>()
           ->body);
 
-  auto find_cache_block = ir::CollectIRNodesWithoutTensor(
+  auto find_cache_block = ir::ir_utils::CollectIRNodesWithoutTensor(
       root,
       [&](const Expr* x) {
         return x->As<ir::ScheduleBlockRealize>() &&
@@ -937,9 +937,10 @@ Expr ScheduleImpl::CacheWrite(const Expr& block,
   CHECK(info.write_tensor->buffer.defined());
 
   // Replace buffer
-  auto all_tensors = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
-    return x->as_tensor() && x->as_tensor()->buffer.defined();
-  });
+  auto all_tensors =
+      ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
+        return x->as_tensor() && x->as_tensor()->buffer.defined();
+      });
 
   for (auto i : all_tensors) {
     if (i.as_tensor()->name != info.write_tensor->name &&
@@ -1119,7 +1120,7 @@ Expr ScheduleImpl::Reorder(const Expr& block,
 Expr ScheduleImpl::GetRootBlock(const Expr& expr) const {
   auto exprs = this->GetModule().GetExprs();
   for (auto& it_expr : exprs) {
-    auto find_expr = ir::CollectIRNodesWithoutTensor(
+    auto find_expr = ir::ir_utils::CollectIRNodesWithoutTensor(
         it_expr,
         [&](const Expr* x) {
           return x->node_type() == expr.node_type() && *x == expr;
@@ -1198,20 +1199,21 @@ struct LoopReconstructor : public ir::IRMutator<> {
     // Replace the copied Tensor object with the original Tensor object,
     // to ensure that the same Tensor in a AST is the same object.
     std::unordered_map<std::string, ir::Expr> tensors_map;
-    ir::CollectIRNodesWithoutTensor(loop_, [&tensors_map](const Expr* x) {
-      if (x->as_tensor()) {
-        tensors_map.insert({x->as_tensor()->name, *x});
-        return true;
-      }
-      return false;
-    });
-    auto find_store = ir::CollectIRNodesWithoutTensor(
+    ir::ir_utils::CollectIRNodesWithoutTensor(
+        loop_, [&tensors_map](const Expr* x) {
+          if (x->as_tensor()) {
+            tensors_map.insert({x->as_tensor()->name, *x});
+            return true;
+          }
+          return false;
+        });
+    auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor(
         new_loop_, [](const Expr* x) { return x->As<ir::Store>(); });
     for (auto store : find_store) {
       store.As<ir::Store>()->tensor =
           tensors_map.at(store.As<ir::Store>()->tensor.as_tensor()->name);
     }
-    auto find_load = ir::CollectIRNodesWithoutTensor(
+    auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor(
         new_loop_, [](const Expr* x) { return x->As<ir::Load>(); });
     for (auto load : find_load) {
       load.As<ir::Load>()->tensor =
@@ -1275,7 +1277,7 @@ void ScheduleImpl::SetBuffer(Expr& block,
                              const std::string& memory_type,
                              bool fixed) {
   CHECK(block.As<ir::ScheduleBlockRealize>());
-  auto find_tensor = ir::CollectIRNodesWithoutTensor(
+  auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor(
       block, [&](const Expr* x) { return x->As<ir::Store>(); }, true);
   CHECK_EQ(find_tensor.size(), 1U)
       << "One block should only have one Store node!(except for root block)";
@@ -1286,7 +1288,7 @@ void ScheduleImpl::SetBuffer(Expr& block,
   auto exprs = this->GetModule().GetExprs();
   for (auto& it_expr : exprs) {
     auto find_tensor =
-        ir::CollectIRNodesWithoutTensor(it_expr, [&](const Expr* x) {
+        ir::ir_utils::CollectIRNodesWithoutTensor(it_expr, [&](const Expr* x) {
           return x->as_tensor() &&
                  (x->as_tensor()->name == tensor.as_tensor_ref()->name ||
                   x->as_tensor()->name ==
@@ -1328,7 +1330,7 @@ void ScheduleImpl::MergeExprs() {
                              ->body);
   VLOG(3) << "Before merging, exprs[0] is : " << exprs[0];
   for (int i = 1; i < exprs.size(); ++i) {
-    auto root_block = ir::CollectIRNodesWithoutTensor(
+    auto root_block = ir::ir_utils::CollectIRNodesWithoutTensor(
         exprs[i],
         [&](const Expr* x) {
           return x->As<ir::ScheduleBlockRealize>() &&
@@ -1437,7 +1439,7 @@ void ScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) {
   auto body = block_loops.at(loops.size() - 1).As<ir::For>()->body;
   // collect if
   auto if_checker = [](const Expr* x) { return x->As<ir::IfThenElse>(); };
-  auto if_set = ir::CollectIRNodesWithoutTensor(body, if_checker);
+  auto if_set = ir::ir_utils::CollectIRNodesWithoutTensor(body, if_checker);
   for (auto if_expr : if_set) {
     auto checker = [block_name](const Expr* x) {
       return x->As<ir::ScheduleBlockRealize>() &&
@@ -1445,7 +1447,8 @@ void ScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) {
                      ->schedule_block.As<ScheduleBlock>()
                      ->name == block_name;
     };
-    if (ir::CollectIRNodesWithoutTensor(if_expr, checker, true).size() > 0) {
+    if (ir::ir_utils::CollectIRNodesWithoutTensor(if_expr, checker, true)
+            .size() > 0) {
       result =
           IfThenElse::Make(if_expr.As<ir::IfThenElse>()->condition, result);
       break;
@@ -1582,7 +1585,7 @@ bool ComputeInliner::BodyPatternAllowInline() {
     return false;
   }
   CHECK(inlined_store_.As<Store>());
-  auto find_vars = ir::CollectIRNodesWithoutTensor(
+  auto find_vars = ir::ir_utils::CollectIRNodesWithoutTensor(
       inlined_store_, [&](const Expr* x) { return x->as_var(); });
   std::set<Var, CompVar> vars_set;
   for (auto& i : find_vars) vars_set.insert(i.as_var_ref());
@@ -1650,7 +1653,7 @@ bool ReverseComputeInliner::BodyPatternAllowInline() {
   CHECK(inlined_store_.As<Store>());
   CHECK(inlined_load_.As<Load>());
   CHECK(target_store_.As<Store>());
-  auto find_vars = ir::CollectIRNodesWithoutTensor(
+  auto find_vars = ir::ir_utils::CollectIRNodesWithoutTensor(
       inlined_store_, [&](const Expr* x) { return x->as_var(); });
   std::set<Var, CompVar> vars_set;
   for (auto& i : find_vars) vars_set.insert(i.as_var_ref());
@@ -2036,7 +2039,7 @@ void ScheduleImpl::FlattenLoops(const std::vector<Expr>& loops,
       }
     }
 
-    auto exprs = ir::CollectIRNodesInOrder(
+    auto exprs = ir::ir_utils::CollectIRNodesInOrder(
         schedule_block->body,
         [&](const Expr* x) { return x->As<ir::Store>() || x->As<ir::Load>(); });
     // reverse exprs from last to first.
@@ -2185,7 +2188,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block,
   std::set<std::string> used_target_loop_vars;
   for (auto& iter_val : new_iter_values) {
     auto find_partial_loop =
-        ir::CollectIRNodesWithoutTensor(iter_val, [&](const Expr* x) {
+        ir::ir_utils::CollectIRNodesWithoutTensor(iter_val, [&](const Expr* x) {
           if (x->as_var()) used_target_loop_vars.insert(x->as_var_ref()->name);
           return x->as_var();
         });
@@ -2194,7 +2197,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block,
   std::vector<Expr> used_target_loops;
   auto expr_copy = optim::IRCopy(expr);
   for (auto& var : used_target_loop_vars) {
-    auto find_loop_var = ir::CollectIRNodesWithoutTensor(
+    auto find_loop_var = ir::ir_utils::CollectIRNodesWithoutTensor(
         expr_copy,
         [&](const Expr* x) {
           return x->As<ir::For>() && x->As<ir::For>()->loop_var->name == var &&
@@ -2222,7 +2225,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block,
   } else {
     CHECK(old_iter_values[changed_loop_num].as_var());
     auto old_var = old_iter_values[changed_loop_num].as_var_ref();
-    auto find_partial_loop = ir::CollectIRNodesWithoutTensor(
+    auto find_partial_loop = ir::ir_utils::CollectIRNodesWithoutTensor(
         expr,
         [&](const Expr* x) {
           return x->As<ir::For>() &&
@@ -2232,7 +2235,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block,
         true);
     CHECK_EQ(find_partial_loop.size(), 1U);
     new_loop = optim::IRCopy(*find_partial_loop.begin());
-    auto find_schedule_block = ir::CollectIRNodesWithoutTensor(
+    auto find_schedule_block = ir::ir_utils::CollectIRNodesWithoutTensor(
         new_loop,
         [&](const Expr* x) { return x->As<ir::ScheduleBlockRealize>(); },
         true);
diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc
index b4000ff212cad..45779788e9c54 100644
--- a/paddle/cinn/ir/schedule/ir_schedule_util.cc
+++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc
@@ -40,7 +40,7 @@ namespace ir {
 
 Tensor GetTensor(const Expr& block) {
   CHECK(block.As<ir::ScheduleBlockRealize>());
-  auto find_tensor = ir::CollectIRNodesWithoutTensor(
+  auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor(
       block, [&](const Expr* x) { return x->As<ir::Store>(); }, true);
   CHECK_EQ(find_tensor.size(), 1U)
       << "One block should only have one Store node!(except for root block)";
@@ -52,13 +52,13 @@ Tensor GetTensor(const Expr& block) {
 
 Tensor GetReadTensor(const Expr& block, int index) {
   CHECK(block.As<ir::ScheduleBlockRealize>());
-  auto find_tensor = ir::CollectIRNodesWithoutTensor(
+  auto find_tensor = ir::ir_utils::CollectIRNodesWithoutTensor(
       block, [&](const Expr* x) { return x->As<ir::Store>(); }, true);
   CHECK_EQ(find_tensor.size(), 1U)
       << "One block should only have one Store node!(except for root block)";
   std::vector<Tensor> res;
   auto find_read_tensor =
-      ir::CollectIRNodesWithoutTensor(block, [&](const Expr* x) {
+      ir::ir_utils::CollectIRNodesWithoutTensor(block, [&](const Expr* x) {
         if (x->As<ir::Load>())
           res.push_back(x->As<ir::Load>()->tensor.as_tensor_ref());
         return x->As<ir::Load>();
@@ -86,41 +86,43 @@ void SetCudaAxisInfo(Expr* lowered_func) {
   auto func_body = lowered_func->as_lowered_func_ref()->body;
   CudaAxisInfo info;
 
-  auto block_nodes = ir::CollectIRNodes(func_body, [&](const Expr* x) {
-    if (x->As<ir::For>() && x->As<ir::For>()->bind_info().valid()) {
-      auto bind_info = x->As<ir::For>()->bind_info();
-      info.set_valid(true);
-      if (bind_info.for_type == ForType::GPUThread) {
-        CHECK(common::is_zero(x->As<ir::For>()->min));
-        CHECK(x->As<ir::For>()->extent.is_constant());
-        int range = x->As<ir::For>()->extent.get_constant();
-        range = range > info.block_dim(bind_info.offset)
-                    ? range
-                    : info.block_dim(bind_info.offset);
-        VLOG(3) << "Set block dim[" << bind_info.offset << "] with range "
-                << range;
-        info.set_block_dim(bind_info.offset, range);
-      } else if (bind_info.for_type == ForType::GPUBlock) {
-        CHECK(common::is_zero(x->As<ir::For>()->min));
-        CHECK(x->As<ir::For>()->extent.is_constant());
-        int range = x->As<ir::For>()->extent.get_constant();
-        range = range > info.grid_dim(bind_info.offset)
-                    ? range
-                    : info.grid_dim(bind_info.offset);
-        info.set_grid_dim(bind_info.offset, range);
-        VLOG(3) << "Set grid dim[" << bind_info.offset << "] with range "
-                << range;
-      } else {
-        LOG(FATAL) << "The for loop's bind info should be gpu block or thread!";
-      }
-    }
-    return (x->As<ir::For>() && x->As<ir::For>()->bind_info().valid());
-  });
+  auto block_nodes =
+      ir::ir_utils::CollectIRNodes(func_body, [&](const Expr* x) {
+        if (x->As<ir::For>() && x->As<ir::For>()->bind_info().valid()) {
+          auto bind_info = x->As<ir::For>()->bind_info();
+          info.set_valid(true);
+          if (bind_info.for_type == ForType::GPUThread) {
+            CHECK(common::is_zero(x->As<ir::For>()->min));
+            CHECK(x->As<ir::For>()->extent.is_constant());
+            int range = x->As<ir::For>()->extent.get_constant();
+            range = range > info.block_dim(bind_info.offset)
+                        ? range
+                        : info.block_dim(bind_info.offset);
+            VLOG(3) << "Set block dim[" << bind_info.offset << "] with range "
+                    << range;
+            info.set_block_dim(bind_info.offset, range);
+          } else if (bind_info.for_type == ForType::GPUBlock) {
+            CHECK(common::is_zero(x->As<ir::For>()->min));
+            CHECK(x->As<ir::For>()->extent.is_constant());
+            int range = x->As<ir::For>()->extent.get_constant();
+            range = range > info.grid_dim(bind_info.offset)
+                        ? range
+                        : info.grid_dim(bind_info.offset);
+            info.set_grid_dim(bind_info.offset, range);
+            VLOG(3) << "Set grid dim[" << bind_info.offset << "] with range "
+                    << range;
+          } else {
+            LOG(FATAL)
+                << "The for loop's bind info should be gpu block or thread!";
+          }
+        }
+        return (x->As<ir::For>() && x->As<ir::For>()->bind_info().valid());
+      });
   lowered_func->as_lowered_func_ref()->cuda_axis_info = info;
 }
 
 bool Contains(const Expr& container, const Expr& expr) {
-  auto find_expr = ir::CollectIRNodesWithoutTensor(
+  auto find_expr = ir::ir_utils::CollectIRNodesWithoutTensor(
       container,
       [&](const Expr* x) {
         return (x->node_type() == expr.node_type() && *x == expr);
@@ -283,13 +285,13 @@ void CHECKRfactorValidation(const Expr& rf_loop, int rf_axis) {
   auto* rf_for = rf_loop.As<ir::For>();
   CHECK(rf_for) << "Expr param of Rfactor must be For node! Please check.";
   // check the rf_loop only has one schedule block
-  auto block_nodes = ir::CollectIRNodesWithoutTensor(
+  auto block_nodes = ir::ir_utils::CollectIRNodesWithoutTensor(
       rf_loop,
       [&](const Expr* x) { return x->As<ScheduleBlockRealize>(); },
       true);
   CHECK_EQ(block_nodes.size(), 1U)
       << "Rfactor Loop should only have one schedule block";
-  auto find_store = ir::CollectIRNodesWithoutTensor(
+  auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor(
       rf_loop, [&](const Expr* x) { return x->As<Store>(); }, true);
   CHECK_EQ(find_store.size(), 1U);
   auto indice = find_store.begin()->As<Store>()->indices;
@@ -322,9 +324,9 @@ void CHECKRfactorValidation(const Expr& rf_loop, int rf_axis) {
 }
 
 std::vector<Expr> GetLoopsOfExpr(const Expr& expr, const Expr& root) {
-  auto loop_nodes = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
-    return x->As<ir::For>() && Contains(*x, expr);
-  });
+  auto loop_nodes = ir::ir_utils::CollectIRNodesWithoutTensor(
+      root,
+      [&](const Expr* x) { return x->As<ir::For>() && Contains(*x, expr); });
   std::vector<Expr> result(loop_nodes.begin(), loop_nodes.end());
   if (result.empty())
     LOG(FATAL) << "Didn't find expr's : \n"
@@ -439,8 +441,8 @@ Expr GetNthAccessExpr(const Expr& block, int index, bool is_write) {
                           ->body;
   if (is_write) {
     std::vector<Expr> find_store_vec;
-    auto find_store =
-        ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) {
+    auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor(
+        compute_body, [&](const Expr* x) {
           if (x->As<ir::Store>()) find_store_vec.push_back(*x);
           return x->As<ir::Store>();
         });
@@ -450,8 +452,8 @@ Expr GetNthAccessExpr(const Expr& block, int index, bool is_write) {
     return store_index;
   } else {
     std::vector<Expr> find_load_vec;
-    auto find_load =
-        ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) {
+    auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor(
+        compute_body, [&](const Expr* x) {
           if (x->As<ir::Load>()) find_load_vec.push_back(*x);
           return x->As<ir::Load>();
         });
@@ -526,7 +528,7 @@ void FindInsertionPoint(const Expr& root, CacheBlockInfo* info, bool is_write) {
   Expr find_tensor =
       is_write ? Expr(info->write_tensor) : Expr(info->read_tensor);
   auto find_produce_read =
-      ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
+      ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
         return x->As<ir::Store>() && x->As<ir::Store>()->tensor == find_tensor;
       });
 
@@ -675,9 +677,9 @@ Expr ConstructNewLoopChain(const std::vector<Expr>& chain,
   // In each IfThenElse node, find the vars its condition depends on.
   for (auto& if_expr : if_nodes) {
     CHECK(if_expr.As<IfThenElse>());
-    auto var_set =
-        ir::CollectIRNodes(if_expr.As<IfThenElse>()->condition,
-                           [&](const Expr* x) { return x->as_var(); });
+    auto var_set = ir::ir_utils::CollectIRNodes(
+        if_expr.As<IfThenElse>()->condition,
+        [&](const Expr* x) { return x->as_var(); });
     std::set<std::string> var_name_set;
     for (auto& i : var_set) var_name_set.insert(i.as_var()->name);
     condition_vars.push_back(var_name_set);
@@ -863,7 +865,7 @@ std::vector<Expr> GetProducers(const Expr& block, const Expr& root) {
   std::string block_name = block.As<ir::ScheduleBlockRealize>()
                                ->schedule_block.As<ir::ScheduleBlock>()
                                ->name;
-  ir::CollectIRNodesWithoutTensor(
+  ir::ir_utils::CollectIRNodesWithoutTensor(
       compute_body, [&producer_tensor_names, &block_name](const Expr* x) {
         auto* load = x->As<ir::Load>();
         if (load) {
@@ -879,15 +881,15 @@ std::vector<Expr> GetProducers(const Expr& block, const Expr& root) {
 
   // traverse each of other blocks and filter those ones which contain at least
   // one producer tensor;
-  auto find_blocks =
-      ir::CollectIRNodesWithoutTensor(root, [&block, &root](const Expr* x) {
+  auto find_blocks = ir::ir_utils::CollectIRNodesWithoutTensor(
+      root, [&block, &root](const Expr* x) {
         return x->As<ir::ScheduleBlockRealize>() && *x != block && *x != root;
       });
   for (auto&& cur : find_blocks) {
     auto* cur_block = cur.As<ir::ScheduleBlockRealize>()
                           ->schedule_block.As<ir::ScheduleBlock>();
     CHECK(cur_block) << "block result should be a ScheduleBlockRealize";
-    auto find_stores = ir::CollectIRNodesWithoutTensor(
+    auto find_stores = ir::ir_utils::CollectIRNodesWithoutTensor(
         cur_block->body, [&producer_tensor_names](const Expr* x) {
           return x->As<ir::Store>() &&
                  producer_tensor_names.count(
@@ -905,27 +907,29 @@ std::vector<Expr> GetConsumers(const Expr& block, const Expr& root) {
   std::string block_tensor = GetTensor(block)->name;
   if (IsReduceInitTensorName(block_tensor)) {
     std::string consumer_name = GetOriginalReduceTensorName(block_tensor);
-    auto consumer = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
-      return x->As<ir::ScheduleBlockRealize>() &&
-             x->As<ir::ScheduleBlockRealize>()
-                     ->schedule_block.As<ir::ScheduleBlock>()
-                     ->name == consumer_name;
-    });
+    auto consumer =
+        ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
+          return x->As<ir::ScheduleBlockRealize>() &&
+                 x->As<ir::ScheduleBlockRealize>()
+                         ->schedule_block.As<ir::ScheduleBlock>()
+                         ->name == consumer_name;
+        });
     CHECK_EQ(consumer.size(), 1);
     return {*consumer.begin()};
   }
 
-  auto find_block = ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
-    return x->As<ir::ScheduleBlockRealize>() && *x != block && *x != root;
-  });
+  auto find_block =
+      ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
+        return x->As<ir::ScheduleBlockRealize>() && *x != block && *x != root;
+      });
   for (auto& i : find_block) {
     CHECK(i.As<ir::ScheduleBlockRealize>()
               ->schedule_block.As<ir::ScheduleBlock>());
     auto block_body = i.As<ir::ScheduleBlockRealize>()
                           ->schedule_block.As<ir::ScheduleBlock>()
                           ->body;
-    auto find_load =
-        ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) {
+    auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor(
+        block_body, [&](const Expr* x) {
           return x->As<ir::Load>() &&
                  x->As<ir::Load>()->tensor.as_tensor_ref()->name ==
                      block_tensor;
@@ -938,7 +942,7 @@ std::vector<Expr> GetConsumers(const Expr& block, const Expr& root) {
 void CheckComputeAtValidation(const Expr& block,
                               const Expr& loop,
                               const Expr& root) {
-  auto find_block = ir::CollectIRNodesWithoutTensor(
+  auto find_block = ir::ir_utils::CollectIRNodesWithoutTensor(
       root,
       [&](const Expr* x) {
         return x->As<ir::ScheduleBlockRealize>() && *x == block;
@@ -946,13 +950,13 @@ void CheckComputeAtValidation(const Expr& block,
       true);
   CHECK(!find_block.empty()) << "Didn't find block in root!";
 
-  auto find_loop = ir::CollectIRNodesWithoutTensor(
+  auto find_loop = ir::ir_utils::CollectIRNodesWithoutTensor(
       root,
       [&](const Expr* x) { return x->As<ir::For>() && *x == loop; },
       true);
   CHECK(!find_loop.empty()) << "Didn't find loop in root!";
 
-  auto find_block_in_loop = ir::CollectIRNodesWithoutTensor(
+  auto find_block_in_loop = ir::ir_utils::CollectIRNodesWithoutTensor(
       loop,
       [&](const Expr* x) {
         return x->As<ir::ScheduleBlockRealize>() && *x == block;
@@ -1005,10 +1009,10 @@ std::vector<IterRange> CalculateRequiredRegions(
 
   std::set<Expr> provided_nodes;
   if (is_store_provided) {
-    provided_nodes = ir::CollectIRNodesWithoutTensor(
+    provided_nodes = ir::ir_utils::CollectIRNodesWithoutTensor(
         block, [&](const Expr* x) { return x->As<ir::Store>(); });
   } else {
-    provided_nodes = ir::CollectIRNodesWithoutTensor(
+    provided_nodes = ir::ir_utils::CollectIRNodesWithoutTensor(
         block, [&](const Expr* x) { return x->As<ir::Load>(); });
   }
 
@@ -1036,7 +1040,7 @@ std::vector<IterRange> CalculateRequiredRegions(
 
       // Notice that we look for For nodes in loop's body instead of loop
       // itself.
-      auto find_loops = ir::CollectIRNodesWithoutTensor(
+      auto find_loops = ir::ir_utils::CollectIRNodesWithoutTensor(
           loop.As<ir::For>()->body, [&](const Expr* x) {
             return x->As<ir::For>() && Contains(*x, req_block);
           });
@@ -1052,15 +1056,15 @@ std::vector<IterRange> CalculateRequiredRegions(
 
       std::set<Expr> required_nodes;
       if (is_store_provided) {
-        required_nodes =
-            ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) {
+        required_nodes = ir::ir_utils::CollectIRNodesWithoutTensor(
+            block_body, [&](const Expr* x) {
               return x->As<ir::Load>() &&
                      x->As<ir::Load>()->tensor.as_tensor_ref()->name ==
                          provided_tensor_name;
             });
       } else {
-        required_nodes =
-            ir::CollectIRNodesWithoutTensor(block_body, [&](const Expr* x) {
+        required_nodes = ir::ir_utils::CollectIRNodesWithoutTensor(
+            block_body, [&](const Expr* x) {
               return x->As<ir::Store>() &&
                      x->As<ir::Store>()->tensor.as_tensor_ref()->name ==
                          provided_tensor_name;
@@ -1105,7 +1109,7 @@ std::vector<IterRange> CalculateRequiredRegions(
             block.As<ir::ScheduleBlockRealize>()->iter_values[i].is_constant());
       if (block.As<ir::ScheduleBlockRealize>()->iter_values[i].as_var()) {
         auto find_for_loops =
-            ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
+            ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
               return x->As<ir::For>() &&
                      x->As<ir::For>()->loop_var->name ==
                          block.As<ir::ScheduleBlockRealize>()
@@ -1134,13 +1138,13 @@ Expr CheckComputeInlineValidationAndGetStore(const Expr& schedule_block,
                           ->schedule_block.As<ir::ScheduleBlock>()
                           ->body;
   // 1. Check the schedule block to be inlined is not a reduce tensor.
-  auto find_store = ir::CollectIRNodesWithoutTensor(
+  auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor(
       compute_body, [&](const Expr* x) { return x->As<ir::Store>(); }, true);
   CHECK_EQ(find_store.size(), 1U);
   Expr tensor = (*find_store.begin()).As<ir::Store>()->tensor;
   CHECK(!tensor.as_tensor_ref()->is_reduce_tensor());
   // 2. Check this schedule block is the only writer of the tensor.
-  find_store = ir::CollectIRNodesWithoutTensor(
+  find_store = ir::ir_utils::CollectIRNodesWithoutTensor(
       root,
       [&](const Expr* x) {
         return x->As<ir::Store>() &&
@@ -1151,8 +1155,8 @@ Expr CheckComputeInlineValidationAndGetStore(const Expr& schedule_block,
   CHECK_EQ(find_store.size(), 1U);
   // 3. Check there is no overlap between the buffers the schedule block reads
   // and writes.
-  auto find_load =
-      ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) {
+  auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor(
+      compute_body, [&](const Expr* x) {
         return x->As<ir::Load>() && x->As<ir::Load>()->tensor == tensor;
       });
   CHECK(find_load.empty());
@@ -1166,14 +1170,14 @@ std::tuple<Expr, Expr, Expr> CheckReverseComputeInlineValidationAndGetExprs(
                           ->schedule_block.As<ir::ScheduleBlock>()
                           ->body;
   // 1. Check the schedule block to be reverse inlined is not a reduce tensor.
-  auto find_inlined_load = ir::CollectIRNodesWithoutTensor(
+  auto find_inlined_load = ir::ir_utils::CollectIRNodesWithoutTensor(
       compute_body, [&](const Expr* x) { return x->As<ir::Load>(); }, true);
   CHECK_EQ(find_inlined_load.size(), 1U);
   Expr tensor = (*find_inlined_load.begin()).As<ir::Load>()->tensor;
   CHECK(!tensor.as_tensor_ref()->is_reduce_tensor());
   auto inlined_load = *find_inlined_load.begin();
   // 2. Check this schedule block is the only reader of the tensor.
-  auto find_load = ir::CollectIRNodesWithoutTensor(
+  auto find_load = ir::ir_utils::CollectIRNodesWithoutTensor(
       root,
       [&](const Expr* x) {
         return x->As<ir::Load>() &&
@@ -1184,20 +1188,20 @@ std::tuple<Expr, Expr, Expr> CheckReverseComputeInlineValidationAndGetExprs(
   CHECK_EQ(find_load.size(), 1U);
   // 3. Check there is no overlap between the buffers the schedule block reads
   // and writes.
-  auto find_store =
-      ir::CollectIRNodesWithoutTensor(compute_body, [&](const Expr* x) {
+  auto find_store = ir::ir_utils::CollectIRNodesWithoutTensor(
+      compute_body, [&](const Expr* x) {
         return x->As<ir::Store>() && x->As<ir::Store>()->tensor == tensor;
       });
   CHECK(find_store.empty());
   // 4. Get store that will be inlined.
   auto find_inlined_store =
-      ir::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
+      ir::ir_utils::CollectIRNodesWithoutTensor(root, [&](const Expr* x) {
         return x->As<ir::Store>() && x->As<ir::Store>()->tensor == tensor;
       });
   CHECK_EQ(find_inlined_store.size(), 1U);
   auto inlined_store = *find_inlined_store.begin();
   // 5. Get target store.
-  auto find_target_store = ir::CollectIRNodesWithoutTensor(
+  auto find_target_store = ir::ir_utils::CollectIRNodesWithoutTensor(
       compute_body, [&](const Expr* x) { return x->As<ir::Store>(); }, true);
   CHECK_EQ(find_target_store.size(), 1U);
   auto target_store = *find_target_store.begin();
@@ -1206,7 +1210,7 @@ std::tuple<Expr, Expr, Expr> CheckReverseComputeInlineValidationAndGetExprs(
 
 bool ContainVar(const std::vector<Expr>& exprs, const std::string& var_name) {
   for (auto& expr : exprs) {
-    auto find_expr = ir::CollectIRNodesWithoutTensor(
+    auto find_expr = ir::ir_utils::CollectIRNodesWithoutTensor(
         expr,
         [&](const Expr* x) {
           return x->As<_Var_>() && x->As<_Var_>()->name == var_name;
diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc
index 3297b714630e1..8ad8b9878d4bc 100644
--- a/paddle/cinn/ir/tensor.cc
+++ b/paddle/cinn/ir/tensor.cc
@@ -60,7 +60,7 @@ std::set<std::string> _Tensor_::GetDependTensorNames() const {
   std::set<std::string> names;
 
   auto add_depend_tensors_from_expr = [&](Expr expr) {
-    auto tensors = CollectIRNodes(expr, [&](const Expr *x) {
+    auto tensors = ir::ir_utils::CollectIRNodes(expr, [&](const Expr *x) {
       return x->as_tensor() && x->as_tensor()->name != this->name;
     });
     for (auto &e : tensors) {
@@ -515,7 +515,7 @@ bool _Tensor_::IsDependOnStatement(absl::string_view statement) {
 std::set<std::string> _Tensor_::DependingTensorNames() {
   std::set<std::string> res;
   if (body().defined()) {
-    auto depend_tensors = ir::CollectIRNodes(
+    auto depend_tensors = ir::ir_utils::CollectIRNodes(
         body(), [](const Expr *x) -> bool { return x->as_tensor(); });
     for (const auto &x : depend_tensors) {
       if (x.get() != this) {
@@ -538,7 +538,7 @@ std::vector<Var> _Tensor_::axis_with_reduce() const {
 }
 
 bool _Tensor_::Uses(const Tensor &other) const {
-  auto loads = ir::CollectIRNodes(body(), [&](const Expr *x) {
+  auto loads = ir::ir_utils::CollectIRNodes(body(), [&](const Expr *x) {
     auto *loadn = x->As<ir::Load>();
     if (!loadn) return false;
     return loadn->tensor.as_tensor()->name == other->name;
diff --git a/paddle/cinn/ir/test/collect_ir_nodes_test.cc b/paddle/cinn/ir/test/collect_ir_nodes_test.cc
index 82441b4a005c7..d380b4475e37d 100644
--- a/paddle/cinn/ir/test/collect_ir_nodes_test.cc
+++ b/paddle/cinn/ir/test/collect_ir_nodes_test.cc
@@ -19,6 +19,7 @@
 
 namespace cinn {
 namespace ir {
+namespace ir_utils {
 
 TEST(CollectIRNodes, basic0) {
   Expr C = Expr(1) + 2;
@@ -57,6 +58,6 @@ TEST(CollectIRNodes, basic) {
       CollectIRNodes(fn_body, [](const Expr* x) { return x->as_tensor(); });
   auto exprs = CollectIRNodes(fn_body, [](const Expr* x) { return x; });
 }
-
+}  // namespace ir_utils
 }  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.cc b/paddle/cinn/ir/utils/ir_nodes_collector.cc
index d44c3701b5ac2..7d7373a6b9ee8 100644
--- a/paddle/cinn/ir/utils/ir_nodes_collector.cc
+++ b/paddle/cinn/ir/utils/ir_nodes_collector.cc
@@ -21,8 +21,8 @@
 namespace cinn {
 namespace ir {
 
+namespace ir_utils {
 namespace {
-
 struct IrNodesCollector : public IRVisitorRequireReImpl<void> {
   using teller_t = std::function<bool(const Expr*)>;
   using handler_t = std::function<void(const Expr*)>;
@@ -317,6 +317,6 @@ std::set<std::string> CollectTensorNeedsWrite(const Expr* e) {
   collector.Visit(e);
   return tensor_written;
 }
-
+}  // namespace ir_utils
 }  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.h b/paddle/cinn/ir/utils/ir_nodes_collector.h
index 0f8a390e1ade7..7bfb1b3b4e6b3 100644
--- a/paddle/cinn/ir/utils/ir_nodes_collector.h
+++ b/paddle/cinn/ir/utils/ir_nodes_collector.h
@@ -18,7 +18,7 @@
 
 namespace cinn {
 namespace ir {
-
+namespace ir_utils {
 /**
  * Collect the IR Nodes(without duplication) in the expression.
  */
@@ -83,6 +83,6 @@ std::vector<std::string> CollectUndefinedVars(const Expr* e);
  * Collect the Tensor Nodes which will be Writed by Store or Call Nodes
  */
 std::set<std::string> CollectTensorNeedsWrite(const Expr* e);
-
+}  // namespace ir_utils
 }  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/lang/lower.cc b/paddle/cinn/lang/lower.cc
index 58ae00fe8771e..0b91b6d598ac7 100644
--- a/paddle/cinn/lang/lower.cc
+++ b/paddle/cinn/lang/lower.cc
@@ -40,7 +40,7 @@ std::vector<ir::Argument> GetArgs(
   std::vector<ir::Argument> res;
   std::map<std::string, std::set<const ir::Load*>> name2loads;
   std::map<std::string, std::set<const ir::Store*>> name2stores;
-  auto load_or_store_nodes = ir::CollectIRNodesWithoutTensor(
+  auto load_or_store_nodes = ir::ir_utils::CollectIRNodesWithoutTensor(
       func_body,
       [&](const Expr* x) { return x->As<ir::Store>() || x->As<ir::Load>(); });
 
@@ -102,7 +102,7 @@ std::vector<ir::Buffer> GetTempBuffers(const std::vector<Tensor>& tensor_args,
       name_to_buffer;  // used to avoid duplication.
 
   auto all_temp_tensors =
-      ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) {
+      ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) {
         return x->as_tensor() && x->as_tensor()->buffer.defined() &&
                (!tensor_group.Contain(x->as_tensor()->name) &&
                 ((!buffer_arg_names.count(x->as_tensor()->buffer->name) &&
@@ -145,7 +145,7 @@ std::vector<ir::Buffer> GetTempBuffers(const std::vector<Tensor>& tensor_args,
       name_to_buffer;  // used to avoid duplication.
 
   auto all_temp_tensors =
-      ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) {
+      ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) {
         return x->as_tensor() && x->as_tensor()->buffer.defined() &&
                (!stage_map->Lookup(x->as_tensor()->name) ||
                 !stage_map[x->as_tensor()]->inlined()) &&
@@ -165,17 +165,18 @@ std::vector<ir::Buffer> GetTempBuffers(const std::vector<Tensor>& tensor_args,
     }
   }
   // visit the ir body and update the map of name_to_buffer
-  auto update_map = ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) {
-    if (x->as_tensor() && x->as_tensor()->buffer.defined()) {
-      auto buffer_name = x->as_tensor()->buffer->name;
-      if (name_to_buffer.count(buffer_name) &&
-          x->as_tensor()->buffer->numel() <
-              name_to_buffer[buffer_name]->numel()) {
-        name_to_buffer[buffer_name] = x->as_tensor()->buffer;
-      }
-    }
-    return x->as_tensor() && x->as_tensor()->buffer.defined();
-  });
+  auto update_map =
+      ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) {
+        if (x->as_tensor() && x->as_tensor()->buffer.defined()) {
+          auto buffer_name = x->as_tensor()->buffer->name;
+          if (name_to_buffer.count(buffer_name) &&
+              x->as_tensor()->buffer->numel() <
+                  name_to_buffer[buffer_name]->numel()) {
+            name_to_buffer[buffer_name] = x->as_tensor()->buffer;
+          }
+        }
+        return x->as_tensor() && x->as_tensor()->buffer.defined();
+      });
 
   std::vector<ir::Buffer> temp_buffers;
   for (auto& i : name_to_buffer) temp_buffers.push_back(i.second);
@@ -195,7 +196,7 @@ std::vector<ir::Buffer> GetTempBuffers(const std::vector<ir::Argument>& args,
       name_to_buffer;  // used to avoid duplication.
 
   auto all_temp_tensors =
-      ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) {
+      ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) {
         return x->as_tensor() && x->as_tensor()->buffer.defined() &&
                (!buffer_arg_names.count(x->as_tensor()->buffer->name) ||
                 utils::Endswith(x->as_tensor()->buffer->name, "temp_buffer"));
@@ -212,17 +213,18 @@ std::vector<ir::Buffer> GetTempBuffers(const std::vector<ir::Argument>& args,
     }
   }
   // visit the ir body and update the map of name_to_buffer
-  auto update_map = ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) {
-    if (x->as_tensor() && x->as_tensor()->buffer.defined()) {
-      auto buffer_name = x->as_tensor()->buffer->name;
-      if (name_to_buffer.count(buffer_name) &&
-          x->as_tensor()->buffer->numel() <
-              name_to_buffer[buffer_name]->numel()) {
-        name_to_buffer[buffer_name] = x->as_tensor()->buffer;
-      }
-    }
-    return x->as_tensor() && x->as_tensor()->buffer.defined();
-  });
+  auto update_map =
+      ir::ir_utils::CollectIRNodesWithoutTensor(body, [&](const Expr* x) {
+        if (x->as_tensor() && x->as_tensor()->buffer.defined()) {
+          auto buffer_name = x->as_tensor()->buffer->name;
+          if (name_to_buffer.count(buffer_name) &&
+              x->as_tensor()->buffer->numel() <
+                  name_to_buffer[buffer_name]->numel()) {
+            name_to_buffer[buffer_name] = x->as_tensor()->buffer;
+          }
+        }
+        return x->as_tensor() && x->as_tensor()->buffer.defined();
+      });
 
   std::vector<ir::Buffer> temp_buffers;
   for (auto& i : name_to_buffer) temp_buffers.push_back(i.second);
@@ -250,7 +252,7 @@ void InitReduceTensor(StageMap stages,
     tensor->InitReduction(stages, target);
   }
   auto uninited_reduce_tensors =
-      ir::CollectIRNodes(tensor->body(), [&](const Expr* x) {
+      ir::ir_utils::CollectIRNodes(tensor->body(), [&](const Expr* x) {
         return x && x->defined() && x->as_tensor() &&
                x->as_tensor()->is_reduce_tensor() &&
                !x->as_tensor()->IsReduceInited(stages);
diff --git a/paddle/cinn/lang/lower_impl.cc b/paddle/cinn/lang/lower_impl.cc
index 629b405dcd2f0..24d5325bc1be9 100644
--- a/paddle/cinn/lang/lower_impl.cc
+++ b/paddle/cinn/lang/lower_impl.cc
@@ -35,7 +35,7 @@ namespace lang {
 namespace detail {
 
 void CheckNoIslCallRemains(Expr* expr) {
-  auto isl_calls = ir::CollectIRNodes(*expr, [](const Expr* expr) {
+  auto isl_calls = ir::ir_utils::CollectIRNodes(*expr, [](const Expr* expr) {
     return expr->As<ir::Call>() && expr->As<ir::Call>()->is_isl_call();
   });
 #ifdef CINN_DEBUG
@@ -223,7 +223,7 @@ void CreateCompGraphWithInlineTensors(common::Graph* graph,
   // collect dependency tensors of t
   // here we just collect the tensors in Load nodes
   // NOTE there may be some other cases.
-  auto deps = ir::CollectLoadTensors(
+  auto deps = ir::ir_utils::CollectLoadTensors(
       t->body(), [](const Expr* x) { return x->as_tensor(); });
   for (const auto& dep : deps) {
     auto e_tensor = dep.as_tensor_ref();
@@ -342,7 +342,7 @@ std::vector<ir::Argument> LowerImpl::GenerateFunctionArgumentList(
   CheckArgsUnique();
 
   std::vector<ir::Argument> args;
-  auto teller = ir::CollectTensorNeedsWrite(&fn_body);
+  auto teller = ir::ir_utils::CollectTensorNeedsWrite(&fn_body);
 
   std::set<std::string> arg_names;
 
@@ -395,7 +395,7 @@ std::vector<ir::Argument> LowerImpl::GenFuncArgForSplitKernel(
 
   std::vector<ir::Argument> in_args;
   std::vector<ir::Argument> out_args;
-  auto teller = ir::CollectTensorNeedsWrite(&func_iterator);
+  auto teller = ir::ir_utils::CollectTensorNeedsWrite(&func_iterator);
   std::set<std::string> arg_names;
   std::set<std::string> all_tensor_names;
 
@@ -408,11 +408,12 @@ std::vector<ir::Argument> LowerImpl::GenFuncArgForSplitKernel(
     in_args.emplace_back(scalar, ir::Argument::IO::kInput);
   }
 
-  auto all_tensors = ir::CollectIRNodes(func_iterator, [&](const Expr* x) {
-    return x->as_tensor() && !stages_[x->as_tensor()]->inlined();
-  });
+  auto all_tensors =
+      ir::ir_utils::CollectIRNodes(func_iterator, [&](const Expr* x) {
+        return x->as_tensor() && !stages_[x->as_tensor()]->inlined();
+      });
 
-  auto all_vars = ir::CollectIRNodes(
+  auto all_vars = ir::ir_utils::CollectIRNodes(
       func_iterator, [&](const Expr* x) { return x->as_var(); });
 
   for (auto& i : all_tensors) {
@@ -588,7 +589,7 @@ std::vector<ir::LoweredFunc> LowerImpl::operator()() {
         Reference(&arg)->buffer = tensor_map.at(arg->name)->buffer;
       }
     }
-    auto store_exprs = ir::CollectIRNodes(
+    auto store_exprs = ir::ir_utils::CollectIRNodes(
         func_iterator, [](const Expr* x) { return x->As<ir::Store>(); });
     std::vector<ir::Tensor> new_temp_tensors;
     for (auto& expr : store_exprs) {
diff --git a/paddle/cinn/lang/lower_tensor_group.cc b/paddle/cinn/lang/lower_tensor_group.cc
index 200b608387560..0a802c0f0566d 100644
--- a/paddle/cinn/lang/lower_tensor_group.cc
+++ b/paddle/cinn/lang/lower_tensor_group.cc
@@ -88,7 +88,7 @@ std::vector<ir::LoweredFunc> LowerTensorGroup::operator()() {
     }
 
     // Some store tensors are also temp tensors;
-    auto store_exprs = ir::CollectIRNodes(
+    auto store_exprs = ir::ir_utils::CollectIRNodes(
         func_body, [](const Expr* x) { return x->As<ir::Store>(); });
     for (auto& expr : store_exprs) {
       auto* store_node = expr.As<ir::Store>();
@@ -146,7 +146,7 @@ std::vector<ir::LoweredFunc> LowerTensorGroup::operator()() {
 std::vector<ir::Argument> LowerTensorGroup::GenerateFunctionArgumentList(
     Expr fn_body) {
   std::vector<ir::Argument> args;
-  auto teller = ir::CollectTensorNeedsWrite(&fn_body);
+  auto teller = ir::ir_utils::CollectTensorNeedsWrite(&fn_body);
 
   std::set<std::string> arg_names;
 
diff --git a/paddle/cinn/optim/buffer_assign.cc b/paddle/cinn/optim/buffer_assign.cc
index eb059a30ea26d..175689defbe36 100644
--- a/paddle/cinn/optim/buffer_assign.cc
+++ b/paddle/cinn/optim/buffer_assign.cc
@@ -73,7 +73,7 @@ std::map<std::string, ir::Tensor> InitialAssignBuffer(
 
   // unify all the tensor occurance with a global one, e.g. there are multiple
   // tensor B exists in the expression, replace them with a shared one.
-  ir::CollectIRNodes(*expr, [&](const Expr* x) -> bool {
+  ir::ir_utils::CollectIRNodes(*expr, [&](const Expr* x) -> bool {
     auto* t = x->as_tensor();
     if (t && !stages[t]->inlined()) {
       Reference(x) = Expr(all_tensor_map.at(t->name));
diff --git a/paddle/cinn/optim/compute_inline_expand.cc b/paddle/cinn/optim/compute_inline_expand.cc
index 8dad52ab4d9bc..d4123729bc53f 100644
--- a/paddle/cinn/optim/compute_inline_expand.cc
+++ b/paddle/cinn/optim/compute_inline_expand.cc
@@ -225,7 +225,7 @@ void ComputeInlineExpand(Expr *expr,
                          poly::StageMap stages,
                          std::map<std::string, ir::Tensor> *all_tensor_map) {
   // the inline tensors contained in the expression.
-  auto inline_tensors = ir::CollectIRNodes(*expr, [&](const Expr *x) {
+  auto inline_tensors = ir::ir_utils::CollectIRNodes(*expr, [&](const Expr *x) {
     return x->as_tensor() && stages[x->as_tensor()]->inlined();
   });
 
@@ -240,9 +240,10 @@ void ComputeInlineExpand(Expr *expr,
       TensorInlineExpandMutator(tensor->name, all_tensor_map, stages)(expr);
     }
 
-    inline_tensors = ir::CollectLoadTensors(*expr, [&](const Expr *x) {
-      return x->as_tensor() && stages[x->as_tensor()]->inlined();
-    });
+    inline_tensors =
+        ir::ir_utils::CollectLoadTensors(*expr, [&](const Expr *x) {
+          return x->as_tensor() && stages[x->as_tensor()]->inlined();
+        });
   }
 }
 
diff --git a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc
index a4feec97626cb..bb546f694be9d 100644
--- a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc
+++ b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc
@@ -36,9 +36,9 @@ struct EliminateBroadcastInForloop : public ir::IRMutator<Expr*> {
 
     auto* node = expr->As<ir::Store>();
 
-    auto broadcasts = ir::CollectIRNodes(node->value, [&](const Expr* expr) {
-      return expr->As<ir::Broadcast>();
-    });
+    auto broadcasts = ir::ir_utils::CollectIRNodes(
+        node->value,
+        [&](const Expr* expr) { return expr->As<ir::Broadcast>(); });
     std::vector<Expr> let_exprs;
 
     Var tmp;
@@ -79,7 +79,7 @@ struct EliminateBroadcastInForloop : public ir::IRMutator<Expr*> {
   }
 
   bool ContainsLoopVar(Expr expr, Var loop_var) {
-    return !ir::CollectIRNodes(expr, [&](const Expr* e) -> bool {
+    return !ir::ir_utils::CollectIRNodes(expr, [&](const Expr* e) -> bool {
               return e->As<ir::_Var_>() &&
                      e->As<ir::_Var_>()->name == loop_var->name;
             }).empty();
diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc
index d12a5c9f2dab8..7b30f75bf9652 100644
--- a/paddle/cinn/optim/transform_gpu_forloop.cc
+++ b/paddle/cinn/optim/transform_gpu_forloop.cc
@@ -586,7 +586,7 @@ class ResizeBufferSizeVisitor : public ir::IRMutator<> {
 
   int BufferSize(ir::Expr indice) {
     auto copy = IRCopy(indice);
-    auto vars = ir::CollectIRNodesInOrder(
+    auto vars = ir::ir_utils::CollectIRNodesInOrder(
         copy, [](const ir::Expr *expr) { return expr->As<ir::_Var_>(); });
 
     int max_range = 1;
diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc
index 2f3a9b29a3567..357bafe79730a 100644
--- a/paddle/cinn/optim/vectorize_loops.cc
+++ b/paddle/cinn/optim/vectorize_loops.cc
@@ -129,7 +129,8 @@ class TensorVectorizeTeller : public ir::IRMutator<const Expr *> {
 
     // the iter val must appear in the last index
     if (indices.empty() ||
-        ir::CollectIRNodes(indices.back(), find_matched_var_fn).empty()) {
+        ir::ir_utils::CollectIRNodes(indices.back(), find_matched_var_fn)
+            .empty()) {
       VLOG(5) << "Loop var:" << iter_var_->name
               << " is not used in the last index";
       return false;
@@ -137,7 +138,8 @@ class TensorVectorizeTeller : public ir::IRMutator<const Expr *> {
 
     // the iter val can't appear in mulitple indices
     for (int i = 0; i < indices.size() - 1; ++i) {
-      auto repeat_found = ir::CollectIRNodes(indices[i], find_matched_var_fn);
+      auto repeat_found =
+          ir::ir_utils::CollectIRNodes(indices[i], find_matched_var_fn);
       if (!repeat_found.empty()) {
         VLOG(5) << "Loop var:" << iter_var_->name
                 << " is used at more than last index, current:" << i;
@@ -214,7 +216,7 @@ class CudaVectorizer : public IRMutator<Expr *> {
   }
 
   void Visit(Expr *expr) {
-    write_teller_ = ir::CollectTensorNeedsWrite(expr);
+    write_teller_ = ir::ir_utils::CollectTensorNeedsWrite(expr);
     vectorized_teller_.Collect(expr);
     IRMutator<Expr *>::Visit(expr, expr);
   }
diff --git a/paddle/cinn/poly/domain.cc b/paddle/cinn/poly/domain.cc
index 309fa5aaa3db4..257de52fe7a5b 100644
--- a/paddle/cinn/poly/domain.cc
+++ b/paddle/cinn/poly/domain.cc
@@ -70,8 +70,8 @@ void Domain::ExtractParams() {
   std::unordered_set<std::string> var_names;
   auto collect_param_fn = [&](Expr& e) {
     if (!e.is_constant()) {
-      auto vars =
-          ir::CollectIRNodes(e, [](const Expr* e) { return e->is_var(); });
+      auto vars = ir::ir_utils::CollectIRNodes(
+          e, [](const Expr* e) { return e->is_var(); });
       for (auto& var : vars) var_names.insert(var.As<ir::_Var_>()->name);
     }
   };
diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc
index faa7a99c0cfde..e2e5dc531c0f7 100644
--- a/paddle/cinn/poly/stage.cc
+++ b/paddle/cinn/poly/stage.cc
@@ -805,7 +805,7 @@ void Stage::SimpleComputeAt(Stage *other, int level) {
   compute_ats_[other->id()] = relation;
   auto other_expr = other->expr();
   auto find_tensors =
-      ir::CollectIRNodesWithoutTensor(other_expr, [&](const Expr *x) {
+      ir::ir_utils::CollectIRNodesWithoutTensor(other_expr, [&](const Expr *x) {
         return x->as_tensor() && x->as_tensor_ref()->name == tensor()->name;
       });
   if (!find_tensors.empty()) {
@@ -1025,7 +1025,7 @@ Iterator Stage::Fuse(const Iterator &level0, const Iterator &level1) {
 std::vector<std::string> Stage::input_statements() const {
   if (!expr_.defined()) return {};
   VLOG(3) << "stage " << id() << " expr: " << expr_;
-  auto load_exprs = ir::CollectIRNodes(
+  auto load_exprs = ir::ir_utils::CollectIRNodes(
       expr_, [](const Expr *x) { return x->As<ir::Load>(); });
   std::set<std::string> statements;
   for (auto &expr : load_exprs) {
@@ -1563,10 +1563,11 @@ void Stage::ShareBufferWith(Stage *other) {
 isl_map *__isl_give GatherAccesses(Stage *stage,
                                    const std::string &tensor_name) {
   CHECK(stage->tensor_);
-  auto loads = ir::CollectIRNodes(stage->tensor_->body(), [&](const Expr *x) {
-    return x->As<ir::Load>() &&
-           x->As<ir::Load>()->tensor.as_tensor()->name == tensor_name;
-  });
+  auto loads =
+      ir::ir_utils::CollectIRNodes(stage->tensor_->body(), [&](const Expr *x) {
+        return x->As<ir::Load>() &&
+               x->As<ir::Load>()->tensor.as_tensor()->name == tensor_name;
+      });
 
   auto vars = stage->tensor_->axis_with_reduce();
 
@@ -1888,7 +1889,7 @@ StageMap CreateStages(const std::vector<ir::Tensor> &tensors) {
   std::set<ir::Tensor> all_tensors(tensors.begin(), tensors.end());
 
   for (auto &tensor : tensors) {
-    auto used_tensors = ir::CollectIRNodes(
+    auto used_tensors = ir::ir_utils::CollectIRNodes(
         tensor->body(), [](const Expr *x) { return x->as_tensor(); });
     for (const Expr &x : used_tensors) {
       all_tensors.insert(x.as_tensor_ref());

From 27d0fed793ac229645371d4b34c1a6c3970a02c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= <Xs1580802568@gmail.com>
Date: Thu, 21 Sep 2023 14:51:47 +0800
Subject: [PATCH 034/115] move ir_verify from namespace optim to ir_utils
 (#57532)

---
 paddle/cinn/backends/codegen_c.cc         |  2 +-
 paddle/cinn/backends/codegen_cuda_dev.cc  |  2 +-
 paddle/cinn/backends/llvm/codegen_llvm.cc |  2 +-
 paddle/cinn/ir/test/ir_verify_test.cc     | 10 ++++++----
 paddle/cinn/ir/utils/ir_verify.cc         | 12 ++++++++----
 paddle/cinn/ir/utils/ir_verify.h          |  9 ++++++---
 6 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/paddle/cinn/backends/codegen_c.cc b/paddle/cinn/backends/codegen_c.cc
index 2345bf53d36cd..6440339947682 100644
--- a/paddle/cinn/backends/codegen_c.cc
+++ b/paddle/cinn/backends/codegen_c.cc
@@ -38,7 +38,7 @@ using cinn::common::float16;
 const char *kCKeywordRestrict = "__restrict__";
 
 void CodeGenC::Compile(const ir::Module &module, const Outputs &outputs) {
-  ir::IrVerify(Expr(module));
+  ir::ir_utils::IrVerify(Expr(module));
 
   if (!outputs.c_header_name.empty()) {
     auto source = Compile(module, OutputKind::CHeader);
diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc
index 1f6f5bba154aa..5a1ddbc450a09 100644
--- a/paddle/cinn/backends/codegen_cuda_dev.cc
+++ b/paddle/cinn/backends/codegen_cuda_dev.cc
@@ -56,7 +56,7 @@ std::string CodeGenCUDA_Dev::Compile(const ir::Module &module, bool for_nvrtc) {
 
 void CodeGenCUDA_Dev::Compile(const ir::Module &module,
                               const Outputs &outputs) {
-  ir::IrVerify(Expr(module));
+  ir::ir_utils::IrVerify(Expr(module));
 
   CodeGenC::inline_builtin_codes_ = false;
   if (!outputs.c_header_name.empty()) {
diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc
index 5ff8ce03c77b0..b91772bd688b8 100644
--- a/paddle/cinn/backends/llvm/codegen_llvm.cc
+++ b/paddle/cinn/backends/llvm/codegen_llvm.cc
@@ -790,7 +790,7 @@ llvm::Value *CodeGenLLVM::Visit(const ir::Call *op) {
 llvm::Value *CodeGenLLVM::Visit(const ir::_Module_ *op) {
   {
     Expr body_to_verify(&Reference(op));
-    ir::IrVerify(body_to_verify);
+    ir::ir_utils::IrVerify(body_to_verify);
   }
 
   for (auto &fn : op->functions) {
diff --git a/paddle/cinn/ir/test/ir_verify_test.cc b/paddle/cinn/ir/test/ir_verify_test.cc
index 06a842ef5ba81..183f20e491fbc 100644
--- a/paddle/cinn/ir/test/ir_verify_test.cc
+++ b/paddle/cinn/ir/test/ir_verify_test.cc
@@ -18,12 +18,14 @@
 
 #include "paddle/cinn/ir/op/ir_operators.h"
 
-namespace cinn::ir {
-
+namespace cinn {
+namespace ir {
+namespace ir_utils {
 TEST(IrVerify, basic) {
   Expr a(1);
   Expr b(1);
   IrVerify(a + b);
 }
-
-}  // namespace cinn::ir
+}  // namespace ir_utils
+}  // namespace ir
+}  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_verify.cc b/paddle/cinn/ir/utils/ir_verify.cc
index d0f69802438bb..b961e25114249 100644
--- a/paddle/cinn/ir/utils/ir_verify.cc
+++ b/paddle/cinn/ir/utils/ir_verify.cc
@@ -17,7 +17,10 @@
 #include "paddle/cinn/ir/utils/ir_mutator.h"
 #include "paddle/cinn/ir/utils/ir_printer.h"
 
-namespace cinn::ir {
+namespace cinn {
+namespace ir {
+namespace ir_utils {
+namespace {
 
 struct IrVerifyVisitor : public ir::IRMutator<> {
   using ir::IRMutator<>::Visit;
@@ -30,10 +33,11 @@ struct IrVerifyVisitor : public ir::IRMutator<> {
   NODETY_FORALL(__)
 #undef __
 };
-
+}  // namespace
 void IrVerify(Expr e) {
   IrVerifyVisitor visitor;
   visitor.Visit(&e, &e);
 }
-
-}  // namespace cinn::ir
+}  // namespace ir_utils
+}  // namespace ir
+}  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_verify.h b/paddle/cinn/ir/utils/ir_verify.h
index deddb3178282d..d47c97e0197d4 100644
--- a/paddle/cinn/ir/utils/ir_verify.h
+++ b/paddle/cinn/ir/utils/ir_verify.h
@@ -15,8 +15,11 @@
 #pragma once
 #include "paddle/cinn/ir/ir.h"
 
-namespace cinn::ir {
+namespace cinn {
+namespace ir {
+namespace ir_utils {
 
 void IrVerify(Expr e);
-
-}  // namespace cinn::ir
+}  // namespace ir_utils
+}  // namespace ir
+}  // namespace cinn

From 98be3d95e2041938fa7e783a07ec5cee56251f38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= <Xs1580802568@gmail.com>
Date: Thu, 21 Sep 2023 14:52:09 +0800
Subject: [PATCH 035/115] =?UTF-8?q?=E3=80=90CINN=E3=80=91move=20ir=5Frepla?=
 =?UTF-8?q?ce=20from=20cinn/optim=20to=20cinn/ir/utils=20(#57524)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* move ir_replace from cinn/optim to cinn/ir/utils

* delete extra modification
---
 paddle/cinn/ir/utils/CMakeLists.txt              |  3 ++-
 paddle/cinn/{optim => ir/utils}/ir_replace.cc    |  8 +++++---
 paddle/cinn/{optim => ir/utils}/ir_replace.h     |  7 ++++---
 paddle/cinn/optim/CMakeLists.txt                 |  1 -
 paddle/cinn/optim/buffer_assign.cc               |  2 +-
 .../cinn/optim/eliminate_broadcast_in_forloop.cc |  4 ++--
 paddle/cinn/optim/unroll_loops.cc                |  4 ++--
 paddle/cinn/optim/vectorize_loops.cc             | 16 +++++++++-------
 paddle/cinn/poly/stage.cc                        |  2 +-
 9 files changed, 26 insertions(+), 21 deletions(-)
 rename paddle/cinn/{optim => ir/utils}/ir_replace.cc (93%)
 rename paddle/cinn/{optim => ir/utils}/ir_replace.h (91%)

diff --git a/paddle/cinn/ir/utils/CMakeLists.txt b/paddle/cinn/ir/utils/CMakeLists.txt
index 5613bf7260155..032bf537d2fce 100644
--- a/paddle/cinn/ir/utils/CMakeLists.txt
+++ b/paddle/cinn/ir/utils/CMakeLists.txt
@@ -9,4 +9,5 @@ gather_srcs(
   ir_verify.cc
   ir_compare.cc
   ir_nodes_collector.cc
-  ir_copy.cc)
+  ir_copy.cc
+  ir_replace.cc)
diff --git a/paddle/cinn/optim/ir_replace.cc b/paddle/cinn/ir/utils/ir_replace.cc
similarity index 93%
rename from paddle/cinn/optim/ir_replace.cc
rename to paddle/cinn/ir/utils/ir_replace.cc
index 3dc39a08a3817..da2305359c5e9 100644
--- a/paddle/cinn/optim/ir_replace.cc
+++ b/paddle/cinn/ir/utils/ir_replace.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/cinn/optim/ir_replace.h"
+#include "paddle/cinn/ir/utils/ir_replace.h"
 
 #include <set>
 
@@ -22,7 +22,8 @@
 #include "paddle/cinn/utils/string.h"
 
 namespace cinn {
-namespace optim {
+namespace ir {
+namespace ir_utils {
 using utils::GetStreamCnt;
 
 namespace {
@@ -65,5 +66,6 @@ void IrReplace(ir::Expr* expr, ir::Expr from, ir::Expr to) {
   IrReplaceMutator(from, to)(expr);
 }
 
-}  // namespace optim
+}  // namespace ir_utils
+}  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/optim/ir_replace.h b/paddle/cinn/ir/utils/ir_replace.h
similarity index 91%
rename from paddle/cinn/optim/ir_replace.h
rename to paddle/cinn/ir/utils/ir_replace.h
index 7c95d1e6f6c38..312e4c61eff0a 100644
--- a/paddle/cinn/optim/ir_replace.h
+++ b/paddle/cinn/ir/utils/ir_replace.h
@@ -18,10 +18,11 @@
 #include "paddle/cinn/ir/ir.h"
 
 namespace cinn {
-namespace optim {
+namespace ir {
+namespace ir_utils {
 
 //! Replace the variable \p v to expression \p e in expression \p expr.
 void IrReplace(ir::Expr* expr, ir::Expr from, ir::Expr to);
-
-}  // namespace optim
+}  // namespace ir_utils
+}  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt
index 1b4a55479ef0b..03b8c95b74173 100755
--- a/paddle/cinn/optim/CMakeLists.txt
+++ b/paddle/cinn/optim/CMakeLists.txt
@@ -4,7 +4,6 @@ gather_srcs(
   cinnapi_src
   SRCS
   replace_call_with_expr.cc
-  ir_replace.cc
   replace_var_with_expr.cc
   ir_simplify.cc
   optimize.cc
diff --git a/paddle/cinn/optim/buffer_assign.cc b/paddle/cinn/optim/buffer_assign.cc
index 175689defbe36..f749cac9ba502 100644
--- a/paddle/cinn/optim/buffer_assign.cc
+++ b/paddle/cinn/optim/buffer_assign.cc
@@ -17,8 +17,8 @@
 #include "paddle/cinn/common/union_find.h"
 #include "paddle/cinn/ir/utils/ir_mutator.h"
 #include "paddle/cinn/ir/utils/ir_printer.h"
+#include "paddle/cinn/ir/utils/ir_replace.h"
 #include "paddle/cinn/lang/lower_impl.h"
-#include "paddle/cinn/optim/ir_replace.h"
 
 namespace cinn {
 namespace optim {
diff --git a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc
index bb546f694be9d..e836563a9feb0 100644
--- a/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc
+++ b/paddle/cinn/optim/eliminate_broadcast_in_forloop.cc
@@ -19,8 +19,8 @@
 
 #include "paddle/cinn/ir/utils/ir_mutator.h"
 #include "paddle/cinn/ir/utils/ir_printer.h"
+#include "paddle/cinn/ir/utils/ir_replace.h"
 #include "paddle/cinn/ir/utils/ir_visitor.h"
-#include "paddle/cinn/optim/ir_replace.h"
 
 namespace cinn {
 namespace optim {
@@ -54,7 +54,7 @@ struct EliminateBroadcastInForloop : public ir::IRMutator<Expr*> {
       std::tie(let_expr, tmp) = CreateTmpLet(broadcast);
       let_exprs.push_back(let_expr);
 
-      optim::IrReplace(expr, broadcast, tmp);
+      cinn::ir::ir_utils::IrReplace(expr, broadcast, tmp);
     }
 
     // insert the let expressions to the outer forloop.
diff --git a/paddle/cinn/optim/unroll_loops.cc b/paddle/cinn/optim/unroll_loops.cc
index fc5fab85eca5f..32d4037b83e3e 100644
--- a/paddle/cinn/optim/unroll_loops.cc
+++ b/paddle/cinn/optim/unroll_loops.cc
@@ -21,7 +21,7 @@
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/ir/utils/ir_mutator.h"
 #include "paddle/cinn/ir/utils/ir_printer.h"
-#include "paddle/cinn/optim/ir_replace.h"
+#include "paddle/cinn/ir/utils/ir_replace.h"
 
 namespace cinn {
 namespace optim {
@@ -95,7 +95,7 @@ struct UnrollMutator : public ir::IRMutator<Expr*> {
     for (int i = min->value; i < extent->value; i++) {
       Expr start = op->min + i;
       body.push_back(optim::IRCopy(op->body));
-      optim::IrReplace(&body.back(), op->loop_var, start);
+      cinn::ir::ir_utils::IrReplace(&body.back(), op->loop_var, start);
     }
 
     *expr = ir::Block::Make(body);
diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc
index 357bafe79730a..8ed13e9d5971b 100644
--- a/paddle/cinn/optim/vectorize_loops.cc
+++ b/paddle/cinn/optim/vectorize_loops.cc
@@ -29,7 +29,7 @@
 #include "paddle/cinn/ir/utils/ir_copy.h"
 #include "paddle/cinn/ir/utils/ir_nodes_collector.h"
 #include "paddle/cinn/ir/utils/ir_printer.h"
-#include "paddle/cinn/optim/ir_replace.h"
+#include "paddle/cinn/ir/utils/ir_replace.h"
 #include "paddle/cinn/optim/ir_simplify.h"
 #include "paddle/cinn/optim/unroll_loops.h"
 #include "paddle/cinn/utils/functional.h"
@@ -149,11 +149,11 @@ class TensorVectorizeTeller : public ir::IRMutator<const Expr *> {
 
     // check tensor accessed sequentially by comparing index one by one
     Expr first_idx = optim::IRCopy(indices.back());
-    optim::IrReplace(&first_idx, Expr(iter_var_), Expr(0));
+    cinn::ir::ir_utils::IrReplace(&first_idx, Expr(iter_var_), Expr(0));
     const auto &interval = var_intervals_->at(iter_var_->name);
     for (int i = 1; i < interval.r; ++i) {
       Expr next_idx = optim::IRCopy(indices.back());
-      optim::IrReplace(&next_idx, Expr(iter_var_), Expr(i));
+      cinn::ir::ir_utils::IrReplace(&next_idx, Expr(iter_var_), Expr(i));
       auto gap = common::AutoSimplify(Expr(next_idx - first_idx));
       if (!gap.As<IntImm>() || gap.as_int32() != i) {
         VLOG(5) << "Tensor:" << tensor->name
@@ -310,7 +310,8 @@ class CudaVectorizer : public IRMutator<Expr *> {
 
     // generate a get_addr expr to get the address of the tensor
     Expr converted_tensor = Load::Make(tensor, indices);
-    optim::IrReplace(&converted_tensor, iter_var_, Expr(int32_t(0)));
+    cinn::ir::ir_utils::IrReplace(
+        &converted_tensor, iter_var_, Expr(int32_t(0)));
     auto get_addr = ir::intrinsics::GetAddr::Make(converted_tensor);
 
     // generate a let expression to cast the tensor into the local vector
@@ -888,7 +889,7 @@ struct VectorizeLoops_ : public IRMutator<Expr *> {
                                                     ForType::Serial,
                                                     DeviceAPI::UNK,
                                                     IRCopy(inner_for->body))});
-          optim::IrReplace(
+          cinn::ir::ir_utils::IrReplace(
               &inner_for_b, inner_for->loop_var, Expr(new_iterator_inner));
 
           Expr out_for_b = For::Make(new_iterator_outer,
@@ -898,7 +899,7 @@ struct VectorizeLoops_ : public IRMutator<Expr *> {
                                      outer_for->device_api,
                                      inner_for_b,
                                      outer_for->vectorize_info());
-          optim::IrReplace(
+          cinn::ir::ir_utils::IrReplace(
               &out_for_b, outer_for->loop_var, Expr(new_iterator_outer));
           *expr = Block::Make({out_for_a, out_for_b});
           VLOG(2) << *expr;
@@ -960,7 +961,8 @@ struct VectorizeLoops_ : public IRMutator<Expr *> {
       } else {
         new_index = Expr(forloop->loop_var) * factor + Expr(new_iterator);
       }
-      optim::IrReplace(&forloop->body, forloop->loop_var, new_index);
+      cinn::ir::ir_utils::IrReplace(
+          &forloop->body, forloop->loop_var, new_index);
       auto new_forloop = For::Make(new_iterator,
                                    forloop->min,
                                    make_const(factor),
diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc
index e2e5dc531c0f7..d74bce1404e5b 100644
--- a/paddle/cinn/poly/stage.cc
+++ b/paddle/cinn/poly/stage.cc
@@ -28,9 +28,9 @@
 #include "paddle/cinn/ir/utils/ir_mutator.h"
 #include "paddle/cinn/ir/utils/ir_nodes_collector.h"
 #include "paddle/cinn/ir/utils/ir_printer.h"
+#include "paddle/cinn/ir/utils/ir_replace.h"
 #include "paddle/cinn/ir/utils/ir_visitor.h"
 #include "paddle/cinn/lang/compute.h"
-#include "paddle/cinn/optim/ir_replace.h"
 #include "paddle/cinn/optim/ir_simplify.h"
 #include "paddle/cinn/optim/replace_var_with_expr.h"
 #include "paddle/cinn/poly/compute_at_transform.h"

From 55b7523779bbbed757c4e5b8294e12df64f79af5 Mon Sep 17 00:00:00 2001
From: cyberslack_lee <luhputu0815@gmail.com>
Date: Thu, 21 Sep 2023 15:36:50 +0800
Subject: [PATCH 036/115] [clang-tidy] NO.23 bugprone-branch-clone (#57522)

* clangtidyNo23

* fix

* fix
---
 .clang-tidy                                       |  2 +-
 .../collective/processgroup_comm_utils.cc         |  2 +-
 paddle/fluid/framework/details/fetch_op_handle.cc |  2 +-
 paddle/fluid/framework/downpour_worker.cc         |  5 ++---
 paddle/fluid/framework/executor_cache.cc          |  2 +-
 paddle/fluid/framework/io/fs.cc                   |  9 ++++-----
 .../fluid/framework/ir/constant_folding_pass.cc   |  4 +---
 .../ir/mkldnn/quant_dequant_mkldnn_pass.cc        |  5 ++---
 .../garbage_collector/event_garbage_collector.cc  |  7 ++++---
 .../garbage_collector/fast_garbage_collector.cc   |  7 ++++---
 .../new_executor/interpreter/static_build.cc      |  6 ++----
 .../framework/new_executor/new_ir_interpreter.cc  |  7 ++++---
 .../framework/new_executor/program_interpreter.cc |  7 ++++---
 paddle/fluid/framework/operator.cc                |  9 ++-------
 paddle/fluid/framework/parallel_executor.cc       | 10 ++++------
 paddle/fluid/framework/tensor_util.cc             |  6 ++++--
 paddle/fluid/framework/var_desc.cc                |  7 ++-----
 paddle/fluid/inference/api/analysis_predictor.cc  |  4 ++--
 paddle/fluid/memory/memcpy.cc                     |  2 +-
 paddle/fluid/operators/batch_norm_op.cc           |  4 ----
 paddle/fluid/operators/data_norm_op.cc            |  2 --
 .../operators/detection/multiclass_nms_op.cc      | 15 ++-------------
 .../operators/fused/fused_bn_activation_op.cc     |  2 --
 .../operators/fused/fused_bn_add_activation_op.cc |  2 --
 .../fused/mkldnn/fusion_gru_mkldnn_op.cc          |  2 +-
 .../fused/mkldnn/fusion_lstm_mkldnn_op.cc         |  2 +-
 .../operators/fused/mkldnn/multi_gru_mkldnn_op.cc |  4 ++--
 paddle/fluid/operators/inplace_abn_op.cc          |  2 --
 paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc     |  2 +-
 .../fluid/operators/mkldnn/reshape_mkldnn_op.cc   |  2 --
 paddle/fluid/operators/reader/buffered_reader.cc  |  6 ++----
 paddle/fluid/operators/sum_op.cc                  |  2 +-
 .../pir/phi_kernel_adaptor/phi_kernel_util.cc     |  5 ++---
 paddle/fluid/platform/place.cc                    |  6 +-----
 .../fluid/prim/api/manual_prim/static_prim_api.cc |  2 --
 paddle/fluid/pybind/eager_method.cc               |  6 ++----
 paddle/fluid/pybind/eager_properties.cc           |  6 ++----
 paddle/fluid/pybind/eager_utils.cc                |  9 +++------
 paddle/fluid/pybind/inference_api.cc              |  2 +-
 paddle/fluid/pybind/op_function_common.cc         |  4 +---
 paddle/phi/core/compat/convert_utils.cc           |  2 +-
 paddle/phi/core/kernel_factory.cc                 |  5 ++---
 paddle/phi/infermeta/unary.cc                     |  2 +-
 paddle/phi/kernels/cpu/diagonal_grad_kernel.cc    |  6 ++----
 .../phi/kernels/cpu/generate_proposals_kernel.cc  |  8 +-------
 .../phi/kernels/cpu/send_ue_recv_grad_kernel.cc   |  4 ++--
 paddle/phi/kernels/funcs/vol2col.cc               |  4 ++--
 47 files changed, 79 insertions(+), 142 deletions(-)

diff --git a/.clang-tidy b/.clang-tidy
index 6a6700c192027..924095b4def28 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -4,7 +4,7 @@ bugprone-argument-comment,
 -bugprone-assert-side-effect,
 -bugprone-bad-signal-to-kill-thread,
 -bugprone-bool-pointer-implicit-conversion,
--bugprone-branch-clone,
+bugprone-branch-clone,
 bugprone-copy-constructor-init,
 -bugprone-dangling-handle,
 -bugprone-dynamic-static-initializers,
diff --git a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc
index 94723906fccb1..eec697f523945 100644
--- a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc
+++ b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc
@@ -51,7 +51,7 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid) {
 #else
     return nullptr;
 #endif
-  } else if (place.GetType() == phi::AllocationType::CUSTOM) {
+  } else if (place.GetType() == phi::AllocationType::CUSTOM) {  // NOLINT
 #if defined(PADDLE_WITH_CUSTOM_DEVICE)
     return static_cast<paddle::distributed::ProcessGroupCustom*>(pg)->XCCLComm(
         place);
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 2a504b2a0fc2b..b71c476a2c95e 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -120,7 +120,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const {
 static void TransData(const phi::DenseTensor &src_item,
                       phi::DenseTensor *dst_item) {
   if (src_item.IsInitialized() && src_item.numel() > 0) {
-    if (platform::is_gpu_place(src_item.place())) {
+    if (platform::is_gpu_place(src_item.place())) {  // NOLINT
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       TensorCopy(src_item, platform::CPUPlace(), dst_item);
 #endif
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index 8a0406864cde7..e69a25bb32781 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -362,9 +362,8 @@ void DownpourWorker::CopySparseTable() {
     if (src_table == dest_table) {
       continue;
     } else if (!copy_table_config_.sparse_copy_by_feasign()) {
-      if (feasign_set_.find(src_table) == feasign_set_.end()) {
-        continue;
-      } else if (feasign_set_[src_table].empty()) {
+      if (feasign_set_.find(src_table) == feasign_set_.end() ||
+          feasign_set_[src_table].empty()) {
         continue;
       }
       feanum = fleet_ptr_->CopyTable(src_table, dest_table);
diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc
index 64d5ce24d20fe..5613a8dbf155e 100644
--- a/paddle/fluid/framework/executor_cache.cc
+++ b/paddle/fluid/framework/executor_cache.cc
@@ -47,7 +47,7 @@ static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) {
       execution_strategy.num_threads_ = 2;
       break;
     }
-    case platform::DeviceType::CUDA: {
+    case platform::DeviceType::CUDA: {  // NOLINT
       // NOTE: According experiments, one thread is faster in
       // most model training.
       execution_strategy.num_threads_ = 1;
diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc
index a39147a97cf7e..4a689409d412b 100644
--- a/paddle/fluid/framework/io/fs.cc
+++ b/paddle/fluid/framework/io/fs.cc
@@ -399,13 +399,12 @@ void hdfs_mv(const std::string& src, const std::string& dest) {
 }
 
 int fs_select_internal(const std::string& path) {
-  if (fs_begin_with_internal(path, "hdfs:")) {
-    return 1;
-  } else if (fs_begin_with_internal(path, "afs:")) {
+  if (fs_begin_with_internal(path, "hdfs:") ||
+      fs_begin_with_internal(path, "afs:")) {
     return 1;
+  } else {
+    return 0;
   }
-
-  return 0;
 }
 
 std::shared_ptr<FILE> fs_open_read(const std::string& path,
diff --git a/paddle/fluid/framework/ir/constant_folding_pass.cc b/paddle/fluid/framework/ir/constant_folding_pass.cc
index 3b3f23933fb6d..f8e0ac9475b5d 100644
--- a/paddle/fluid/framework/ir/constant_folding_pass.cc
+++ b/paddle/fluid/framework/ir/constant_folding_pass.cc
@@ -81,9 +81,7 @@ void ConstantFoldingPass::ApplyImpl(ir::Graph *graph) const {
     std::unordered_map<std::string, int> map;
     for (auto in_node : op_node->inputs) {
       map[in_node->Name()] = 0;
-      if (!in_node->Var()->Persistable()) {
-        input_persis = false;
-      } else if (!in_node->inputs.empty()) {
+      if (!in_node->Var()->Persistable() || !in_node->inputs.empty()) {
         input_persis = false;
       }
     }
diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc
index 8f19225dc53b4..655183dc712c0 100644
--- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc
@@ -400,9 +400,8 @@ void QuantDequantMkldnnPass::RemoveFakeOps(
 
     if (fake_quantize_types.count(op_node->Name())) {
       CollectFakeQuantizeOps(graph, op_node, &nodes2rm);
-    } else if (fake_dequantize_types.count(op_node->Name())) {
-      CollectFakeDequantizeOps(graph, op_node, &nodes2rm);
-    } else if (fake_quantize_dequantize_types.count(op_node->Name())) {
+    } else if (fake_dequantize_types.count(op_node->Name()) ||
+               fake_quantize_dequantize_types.count(op_node->Name())) {
       CollectFakeDequantizeOps(graph, op_node, &nodes2rm);
     } else if (onnx_format_quantize_dequantize_types.count(op_node->Name())) {
       CollectQuantizeDequantizeOpsFromONNXFormat(graph, op_node, &nodes2rm);
diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc
index e826c94712568..e63164c020c36 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc
+++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc
@@ -88,9 +88,10 @@ void InterpreterCoreEventGarbageCollector::Add(
 
   if (var->IsType<phi::DenseTensor>()) {
     Add(var->GetMutable<phi::DenseTensor>()->MoveMemoryHolder(), event, ctx);
-  } else if (var->IsType<
-                 operators::reader::
-                     OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
+  } else if (
+      var->IsType<
+          operators::reader::
+              OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {  // NOLINT
     // TODO(xiongkun03) in old executor, this type of variable is not support
     // eager deletion. so we just leave it here ?
   } else if (var->IsType<LoDRankTable>()) {
diff --git a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc
index 4bc8b298012ab..e7efc1f10c324 100644
--- a/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc
+++ b/paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.cc
@@ -34,9 +34,10 @@ void InterpreterCoreFastGarbageCollector::Add(Variable* var) {
 
   if (var->IsType<phi::DenseTensor>()) {
     Add(var->GetMutable<phi::DenseTensor>()->MoveMemoryHolder());
-  } else if (var->IsType<
-                 operators::reader::
-                     OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
+  } else if (
+      var->IsType<
+          operators::reader::
+              OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {  // NOLINT
     // TODO(xiongkun03) in old executor, this type of variable is not support
     // eager deletion. so we just leave it here ?
   } else if (var->IsType<LoDRankTable>()) {
diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc b/paddle/fluid/framework/new_executor/interpreter/static_build.cc
index 0f9bd3f387a92..67b75bb523711 100644
--- a/paddle/fluid/framework/new_executor/interpreter/static_build.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc
@@ -267,10 +267,8 @@ phi::TensorBase* GetTensorFormVar(framework::Variable* var) {
       return var->template GetMutable<phi::TensorArray>();
     } else if (var->template IsType<framework::Strings>()) {
       return var->template GetMutable<framework::Strings>();
-    } else if (var->template IsType<paddle::framework::RawTensor>()) {
-      return var->template GetMutable<paddle::framework::RawTensor>();
-    } else if (!var->IsInitialized()) {
-      // The following is for RAW type of var
+    } else if (var->template IsType<paddle::framework::RawTensor>() ||
+               !var->IsInitialized()) {
       return var->template GetMutable<paddle::framework::RawTensor>();
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index 47823eb82b428..2dc6181180c9d 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -758,9 +758,10 @@ void NewIRInterpreter::RecordStreamForGC(InstructionBase* instr) {
 
     if (var->IsType<phi::DenseTensor>()) {
       TensorRecordStream(*(var->GetMutable<phi::DenseTensor>()));
-    } else if (var->IsType<
-                   operators::reader::
-                       OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
+    } else if (
+        var->IsType<
+            operators::reader::
+                OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {  // NOLINT
       // do nothing
     } else if (var->IsType<phi::SelectedRows>()) {
       TensorRecordStream(
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc
index 1384a9fb487de..2e466962c4d31 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -1292,9 +1292,10 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) {
 
     if (var->IsType<phi::DenseTensor>()) {
       TensorRecordStream(*(var->GetMutable<phi::DenseTensor>()));
-    } else if (var->IsType<
-                   operators::reader::
-                       OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
+    } else if (
+        var->IsType<
+            operators::reader::
+                OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {  // NOLINT
       // do nothing
     } else if (var->IsType<phi::SelectedRows>()) {
       TensorRecordStream(
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 9b9979bc70f4c..7a3271a48debc 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -2777,8 +2777,6 @@ void OperatorWithKernel::ParseInputDataType(
     const phi::DenseTensor* t = nullptr;
     if (var->IsType<phi::DenseTensor>()) {
       t = &var->Get<phi::DenseTensor>();
-    } else if (var->IsType<phi::DenseTensor>()) {
-      t = &var->Get<phi::DenseTensor>();
     } else if (var->IsType<phi::SelectedRows>()) {
       t = &(var->Get<phi::SelectedRows>().value());
     } else if (var->IsType<phi::SparseCooTensor>()) {
@@ -3221,11 +3219,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
         } else if (var->template IsType<framework::Strings>()) {
           tensor_out = var->template GetMutable<framework::Strings>();
           phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
-        } else if (var->template IsType<paddle::framework::RawTensor>()) {
-          tensor_out = var->template GetMutable<paddle::framework::RawTensor>();
-          phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
-        } else if (!var->IsInitialized()) {
-          // The following is for RAW type of var
+        } else if (var->template IsType<paddle::framework::RawTensor>() ||
+                   !var->IsInitialized()) {
           tensor_out = var->template GetMutable<paddle::framework::RawTensor>();
           phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out);
         } else {
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 8b6363d93d134..e6c11df275b56 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -693,7 +693,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
 
   // broadcast parameters from the 0th device to others:
   auto need_broadcast = [&]() -> bool {
-    if (member_->build_strategy_.num_trainers_ > 1) {
+    if (member_->build_strategy_.num_trainers_ > 1) {  // NOLINT
       // 1. num_tariners would be grater than 1 for nccl distributed training.
       return true;
     } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
@@ -936,11 +936,9 @@ void ParallelExecutor::BCastParamsToDevices(
         auto share_memory = [&] { t->ShareDataWith(main_tensor); };
 
         // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
-        if (member_->build_strategy_.async_mode_) {
-          share_memory();
-        } else if (member_->use_all_reduce_ ||
-                   member_->IsUseCUDA(member_->use_device_) ||
-                   var == "@LR_DECAY_COUNTER@") {
+        if (member_->use_all_reduce_ ||
+            member_->IsUseCUDA(member_->use_device_) ||
+            var == "@LR_DECAY_COUNTER@") {
           copy_memory();
         } else {
           share_memory();
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 6fe75d1a90dab..90612e5692595 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -78,7 +78,8 @@ void TensorCopyImpl(const TENSOR& src,
   auto size = src.numel() * phi::SizeOf(src.dtype());
 #endif
 
-  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+  if (platform::is_cpu_place(src_place) &&
+      platform::is_cpu_place(dst_place)) {  // NOLINT
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
@@ -327,7 +328,8 @@ void TensorCopySync(const phi::DenseTensor& src,
     return;
   }
   auto size = src.numel() * phi::SizeOf(src.dtype());
-  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+  if (platform::is_cpu_place(src_place) &&
+      platform::is_cpu_place(dst_place)) {  // NOLINT
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc
index b0130e055c075..836ba0fb762b3 100644
--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
@@ -386,11 +386,8 @@ struct SetVarAttrDescVisitor {
   template <typename T>
   void operator()(T &&v) {
     using U = std::decay_t<decltype(v)>;
-    if (std::is_same<U, int>::value) {
-      set_attr_value(v);
-    } else if (std::is_same<U, std::string>::value) {
-      set_attr_value(v);
-    } else if (std::is_same<U, std::vector<int>>::value) {
+    if (std::is_same<U, int>::value || std::is_same<U, std::string>::value ||
+        std::is_same<U, std::vector<int>>::value) {
       set_attr_value(v);
     } else {
       PADDLE_THROW(platform::errors::Unavailable(
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 6b57f1fabf4bd..70da22a3240e9 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2006,7 +2006,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
       static_cast<void *>(scope), this->GetDeviceContexts()));
   res->input_or_output_ = true;
   res->SetName(name);
-  if (platform::is_cpu_place(place_)) {
+  if (platform::is_cpu_place(place_)) {  // NOLINT
     res->SetPlace(PaddlePlace::kCPU);
   } else if (platform::is_ipu_place(place_)) {
     // Currently, IPUPlace's tensor copy between cpu and ipu has been set in
@@ -2057,7 +2057,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
       static_cast<void *>(scope), this->GetDeviceContexts()));
   res->input_or_output_ = false;
   res->SetName(name);
-  if (platform::is_cpu_place(place_)) {
+  if (platform::is_cpu_place(place_)) {  // NOLINT
     res->SetPlace(PaddlePlace::kCPU);
   } else if (platform::is_ipu_place(place_)) {
     // Currently, IPUPlace's tensor copy between cpu and ipu has been set in
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 656d6273afb3f..cf253d6c4ebdc 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -743,7 +743,7 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place,
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place;
   if (src_place.GetType() == phi::AllocationType::CPU &&
-      dst_place.GetType() == phi::AllocationType::CPU) {
+      dst_place.GetType() == phi::AllocationType::CPU) {  // NOLINT
     std::memcpy(dst, src, num);
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 4f1c7ab3857d7..1d45cee715409 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -386,8 +386,6 @@ phi::KernelKey BatchNormGradOp::GetExpectedKernelType(
   const phi::DenseTensor *t = nullptr;
   if (var->IsType<phi::DenseTensor>()) {
     t = &var->Get<phi::DenseTensor>();
-  } else if (var->IsType<phi::DenseTensor>()) {
-    t = &var->Get<phi::DenseTensor>();
   }
   if (t == nullptr) {
     PADDLE_THROW(
@@ -530,8 +528,6 @@ phi::KernelKey BatchNormDoubleGradOp::GetExpectedKernelType(
   const phi::DenseTensor *t = nullptr;
   if (var->IsType<phi::DenseTensor>()) {
     t = &var->Get<phi::DenseTensor>();
-  } else if (var->IsType<phi::DenseTensor>()) {
-    t = &var->Get<phi::DenseTensor>();
   }
   if (t == nullptr) {
     PADDLE_THROW(
diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc
index 493351654d5eb..2e70168876162 100644
--- a/paddle/fluid/operators/data_norm_op.cc
+++ b/paddle/fluid/operators/data_norm_op.cc
@@ -495,8 +495,6 @@ class DataNormGradOp : public framework::OperatorWithKernel {
     const phi::DenseTensor *t = nullptr;
     if (var->IsType<phi::DenseTensor>()) {
       t = &var->Get<phi::DenseTensor>();
-    } else if (var->IsType<phi::DenseTensor>()) {
-      t = &var->Get<phi::DenseTensor>();
     }
     if (t == nullptr) {
       PADDLE_THROW(platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
index 432713c60d969..8519752bc1049 100644
--- a/paddle/fluid/operators/detection/multiclass_nms_op.cc
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -101,11 +101,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel {
     }
     // Here the box_dims[0] is not the real dimension of output.
     // It will be rewritten in the computing kernel.
-    if (score_size == 3) {
-      ctx->SetOutputDim("Out", {-1, box_dims[2] + 2});
-    } else {
-      ctx->SetOutputDim("Out", {-1, box_dims[2] + 2});
-    }
+    ctx->SetOutputDim("Out", {-1, box_dims[2] + 2});
     if (!ctx->IsRuntime()) {
       ctx->SetLoDLevel("Out", std::max(ctx->GetLoDLevel("BBoxes"), 1));
     }
@@ -584,14 +580,7 @@ class MultiClassNMS2Op : public MultiClassNMSOp {
 
   void InferShape(framework::InferShapeContext* ctx) const override {
     MultiClassNMSOp::InferShape(ctx);
-
-    auto score_dims = ctx->GetInputDim("Scores");
-    auto score_size = score_dims.size();
-    if (score_size == 3) {
-      ctx->SetOutputDim("Index", {-1, 1});
-    } else {
-      ctx->SetOutputDim("Index", {-1, 1});
-    }
+    ctx->SetOutputDim("Index", {-1, 1});
     if (!ctx->IsRuntime()) {
       ctx->SetLoDLevel("Index", std::max(ctx->GetLoDLevel("BBoxes"), 1));
     }
diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc
index 88b11f1ef39c5..ca59a466a5c2b 100644
--- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc
+++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc
@@ -303,8 +303,6 @@ phi::KernelKey FusedBatchNormActGradOp::GetExpectedKernelType(
   const phi::DenseTensor *t = nullptr;
   if (var->IsType<phi::DenseTensor>()) {
     t = &var->Get<phi::DenseTensor>();
-  } else if (var->IsType<phi::DenseTensor>()) {
-    t = &var->Get<phi::DenseTensor>();
   }
   if (t == nullptr) {
     PADDLE_THROW(
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
index a33a91b082e5c..ed416d4ad13d1 100644
--- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc
@@ -267,8 +267,6 @@ phi::KernelKey FusedBatchNormAddActGradOp::GetExpectedKernelType(
   const phi::DenseTensor *t = nullptr;
   if (var->IsType<phi::DenseTensor>()) {
     t = &var->Get<phi::DenseTensor>();
-  } else if (var->IsType<phi::DenseTensor>()) {
-    t = &var->Get<phi::DenseTensor>();
   }
   if (t == nullptr) {
     PADDLE_THROW(
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
index 05d1e64f92ae7..5ec5e8081bb6f 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
@@ -248,7 +248,7 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
     const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
 
     // BF16 does not support force output
-    if (!is_bf16 && force_fp32_output) {
+    if (!is_bf16 && force_fp32_output) {  // NOLINT
       RunKernel<float>(ctx);
     } else {
       RunKernel<T>(ctx);
diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
index d973c5e89a626..4972db5804322 100644
--- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc
@@ -329,7 +329,7 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
     const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
 
     // BF16 does not support force output
-    if (!is_bf16 && force_fp32_output) {
+    if (!is_bf16 && force_fp32_output) {  // NOLINT
       RunKernel<float>(ctx);
     } else {
       RunKernel<T>(ctx);
diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
index 90ecbe4506d98..1c8e0a1b56a97 100644
--- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
+++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
@@ -688,7 +688,7 @@ class MultiGRUMKLDNNKernel : public framework::OpKernel<T> {
     const bool force_fp32_output =
         ctx.HasAttr("force_fp32_output") && ctx.Attr<bool>("force_fp32_output");
 
-    if (force_fp32_output) {
+    if (force_fp32_output) {  // NOLINT
       RunKernel<float>(ctx);
     } else {
       RunKernel<T>(ctx);
@@ -706,7 +706,7 @@ class MultiGRUMKLDNNKernel : public framework::OpKernel<T> {
       auto gru_out_L2R = handler.executeSingleGru(input_mem, layer, L2R);
       handler.reorderInputL2RtoR2L(input_mem, layer);
       auto gru_out_R2L = handler.executeSingleGru(input_mem, layer, R2L);
-      if (layer < layers - 1)
+      if (layer < layers - 1)  // NOLINT
         handler.template reorderOutputR2LtoL2R<T>(gru_out_R2L, layer);
       else
         handler.template reorderOutputR2LtoL2R<Tout>(gru_out_R2L, layer);
diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc
index eee0f1f304bc3..a53a9867b9903 100644
--- a/paddle/fluid/operators/inplace_abn_op.cc
+++ b/paddle/fluid/operators/inplace_abn_op.cc
@@ -284,8 +284,6 @@ class InplaceABNGradOp : public framework::OperatorWithKernel {
     const phi::DenseTensor* t = nullptr;
     if (var->IsType<phi::DenseTensor>()) {
       t = &var->Get<phi::DenseTensor>();
-    } else if (var->IsType<phi::DenseTensor>()) {
-      t = &var->Get<phi::DenseTensor>();
     }
     if (t == nullptr) {
       PADDLE_THROW(
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index a7f6bc512ffce..692b7f0721ceb 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -359,7 +359,7 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
     bool fuse_relu = ctx.Attr<std::string>("activation_type") == "relu";
 
     IF_CHANGE_FC_TW_TYPENAME((std::is_same<T_in, uint8_t>::value), ([&] {
-                               if (force_fp32_output) {
+                               if (force_fp32_output) {  // NOLINT
                                  this->RunKernel<float, T_w>(ctx);
                                } else if (phi::funcs::is_int8<T_in>()) {
                                  if (fuse_relu) {
diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc
index b7a33edb82a00..3c53b05152b7e 100644
--- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc
@@ -105,8 +105,6 @@ class ReshapeMKLDNNKernel : public framework::OpKernel<T> {
         InferShapeSqueezeOp(ctx, x_dims, out_dims);
         break;
       case ReshapeKernelOpName::flatten:
-        InferShapeFlattenOp(ctx, x_dims, out_dims);
-        break;
       case ReshapeKernelOpName::flatten2:
         InferShapeFlattenOp(ctx, x_dims, out_dims);
         break;
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index 2e24caa91c6bb..b73ffe4319be7 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -213,10 +213,8 @@ void BufferedReader::ReadAsync(size_t i) {
           auto cpu_ptr = cpu[i].data();
           auto gpu_ptr = gpu_ptrs[i];
           auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype());
-          if (platform::is_cuda_pinned_place(cpu_place)) {
-            memory::Copy(
-                place_, gpu_ptr, cpu_place, cpu_ptr, size, stream_.get());
-          } else if ((platform::is_gpu_place(cpu_place))) {
+          if (platform::is_cuda_pinned_place(cpu_place) ||
+              platform::is_gpu_place(cpu_place)) {
             memory::Copy(
                 place_, gpu_ptr, cpu_place, cpu_ptr, size, stream_.get());
           } else {
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 5cf9fba9f2681..ebb4cd7cf132d 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -76,7 +76,7 @@ class SumOp : public framework::OperatorWithKernel {
       // NOTE(jiahongyu): Below codes originally enclosed by PADDLE_WITH_DNNL
       if (!((data_type == framework::proto::VarType::FP32 ||
              data_type == framework::proto::VarType::BF16) &&
-            ctx.OutputVar("Out")->IsType<phi::DenseTensor>())) {
+            ctx.OutputVar("Out")->IsType<phi::DenseTensor>())) {  // NOLINT
         this->SetDnnFallback(true);
       } else if (!std::all_of(x_vars.begin(),
                               x_vars.end(),
diff --git a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc
index a3997ee97db6a..437523e41bf3e 100644
--- a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc
+++ b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc
@@ -196,9 +196,8 @@ void BuildValue(pir::Value value,
                     variable_list);
   }
   // Only support DenseTensor or Vector<DenseTensor>
-  if (!value.type()) {
-    var->GetMutable<phi::DenseTensor>();
-  } else if (value.type().isa<paddle::dialect::AllocatedDenseTensorType>()) {
+  if (!value.type() ||
+      value.type().isa<paddle::dialect::AllocatedDenseTensorType>()) {
     var->GetMutable<phi::DenseTensor>();
   } else if (value.type().isa<paddle::dialect::AllocatedSelectedRowsType>()) {
     var->GetMutable<phi::SelectedRows>();
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index b8452a594e358..d38d0418e4639 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -62,11 +62,7 @@ bool is_same_place(const Place &p1, const Place &p2) {
   if (places_are_same_class(p1, p2)) {
     if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
       return true;
-    } else if (is_xpu_place(p1)) {
-      return p1 == p2;
-    } else if (is_ipu_place(p1)) {
-      return p1 == p2;
-    } else if (is_custom_place(p1)) {
+    } else if (is_xpu_place(p1) || is_ipu_place(p1) || is_custom_place(p1)) {
       return p1 == p2;
     } else {
       return p1 == p2;
diff --git a/paddle/fluid/prim/api/manual_prim/static_prim_api.cc b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc
index c907be2d10256..c45a473b4a8d3 100644
--- a/paddle/fluid/prim/api/manual_prim/static_prim_api.cc
+++ b/paddle/fluid/prim/api/manual_prim/static_prim_api.cc
@@ -50,8 +50,6 @@ Tensor full<DescTensor>(const IntArray& shape,
   op->SetAttr("shape", shape.GetData());
   switch (dtype) {
     case phi::DataType::FLOAT16:
-      op->SetAttr("str_value", std::to_string(value.to<float>()));
-      break;
     case phi::DataType::BFLOAT16:
       op->SetAttr("str_value", std::to_string(value.to<float>()));
       break;
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 59ef86423788a..e72f5dc77f99c 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -1617,7 +1617,8 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self,
           py::isinstance<py::int_>(value_obj_tmp) ||
           py::isinstance<py::bool_>(value_obj_tmp) ||
           PyComplex_Check(value_obj)) {
-        if (self->tensor.dtype() == phi::DataType::FLOAT32) {
+        if (self->tensor.dtype() == phi::DataType::FLOAT32 ||
+            self->tensor.dtype() == phi::DataType::FLOAT16) {
           attrs["values"] = std::vector<paddle::experimental::Scalar>{
               value_obj_tmp.cast<float>()};
         } else if (self->tensor.dtype() == phi::DataType::FLOAT64) {
@@ -1632,9 +1633,6 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self,
         } else if (self->tensor.dtype() == phi::DataType::BOOL) {
           attrs["values"] = std::vector<paddle::experimental::Scalar>{
               value_obj_tmp.cast<bool>()};
-        } else if (self->tensor.dtype() == phi::DataType::FLOAT16) {
-          attrs["values"] = std::vector<paddle::experimental::Scalar>{
-              value_obj_tmp.cast<float>()};
         } else if (self->tensor.dtype() == phi::DataType::COMPLEX64) {
           attrs["values"] = std::vector<paddle::experimental::Scalar>{
               value_obj_tmp.cast<std::complex<float>>()};
diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc
index 59ecee2c5d668..517c210830022 100644
--- a/paddle/fluid/pybind/eager_properties.cc
+++ b/paddle/fluid/pybind/eager_properties.cc
@@ -92,13 +92,11 @@ Tensor's type.
 
 PyObject* tensor_properties_get_type(TensorObject* self, void* closure) {
   EAGER_TRY
-  if (!self->tensor.defined()) {
+  if (!self->tensor.defined() || self->tensor.is_dense_tensor()) {
     // be same to old dygraph
     return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR);
   }
-  if (self->tensor.is_dense_tensor()) {
-    return ToPyObject(paddle::framework::proto::VarType::LOD_TENSOR);
-  } else if (self->tensor.is_selected_rows()) {
+  if (self->tensor.is_selected_rows()) {
     return ToPyObject(paddle::framework::proto::VarType::SELECTED_ROWS);
   } else if (egr::IsVariableCompatTensor(self->tensor)) {
     return ToPyObject(static_cast<paddle::framework::proto::VarType::Type>(
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 0432ca88d6ada..87660d9fd88ca 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -173,13 +173,11 @@ bool PyObject_CheckIRVectorOfOpResult(PyObject* obj) {
   }
 }
 bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos) {
-  if (obj == Py_None) {
+  if (obj == Py_None || obj == Py_False) {
     return false;  // To be compatible with QA integration testing. Some
                    // test cases pass in None.
   } else if (obj == Py_True) {
     return true;
-  } else if (obj == Py_False) {
-    return false;
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "argument (position %d) must be "
@@ -1125,9 +1123,8 @@ static paddle::Tensor& GetTensorFromPyObject(const std::string& op_type,
     return emptytensor;
   }
 
-  if (PyObject_TypeCheck(obj, p_tensor_type)) {
-    return reinterpret_cast<TensorObject*>(obj)->tensor;
-  } else if (PyObject_TypeCheck(obj, p_string_tensor_type)) {
+  if (PyObject_TypeCheck(obj, p_tensor_type) ||
+      PyObject_TypeCheck(obj, p_string_tensor_type)) {
     return reinterpret_cast<TensorObject*>(obj)->tensor;
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index b1fbf43aac8b6..bd569f328b115 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -239,7 +239,7 @@ void PaddleInferTensorCreate(paddle_infer::Tensor &tensor,  // NOLINT
 
 paddle_infer::PlaceType ToPaddleInferPlace(
     phi::AllocationType allocation_type) {
-  if (allocation_type == phi::AllocationType::CPU) {
+  if (allocation_type == phi::AllocationType::CPU) {  // NOLINT
     return paddle_infer::PlaceType::kCPU;
   } else if (allocation_type == phi::AllocationType::GPU) {
     return paddle_infer::PlaceType::kGPU;
diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc
index 366465e6b2984..9d8074628fb13 100644
--- a/paddle/fluid/pybind/op_function_common.cc
+++ b/paddle/fluid/pybind/op_function_common.cc
@@ -121,13 +121,11 @@ bool PyObject_CheckString(PyObject* obj) { return PyUnicode_Check(obj); }
 bool CastPyArg2Boolean(PyObject* obj,
                        const std::string& op_type,
                        ssize_t arg_pos) {
-  if (obj == Py_None) {
+  if (obj == Py_None || obj == Py_False) {
     return false;  // To be compatible with QA integration testing. Some
                    // test case pass in None.
   } else if (obj == Py_True) {
     return true;
-  } else if (obj == Py_False) {
-    return false;
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "%s(): argument (position %d) must be "
diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc
index d82b37328850f..d4c5de0dbe6dc 100644
--- a/paddle/phi/core/compat/convert_utils.cc
+++ b/paddle/phi/core/compat/convert_utils.cc
@@ -67,7 +67,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
           set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0);
 #endif
 #ifdef PADDLE_WITH_DNNL
-    case phi::Backend::ONEDNN:
+    case phi::Backend::ONEDNN:  // NOLINT
       return phi::CPUPlace();
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc
index d58decadfadca..f9c1dca46b2fb 100644
--- a/paddle/phi/core/kernel_factory.cc
+++ b/paddle/phi/core/kernel_factory.cc
@@ -63,9 +63,8 @@ KernelFactory& KernelFactory::Instance() {
 
 bool KernelFactory::HasCompatiblePhiKernel(const std::string& op_type) const {
   if (deprecated_op_names.find(op_type) == deprecated_op_names.end()) {
-    if (phi::OpUtilsMap::Instance().Contains(op_type)) {
-      return true;
-    } else if (kernels_.find(op_type) != kernels_.end()) {
+    if (phi::OpUtilsMap::Instance().Contains(op_type) ||
+        (kernels_.find(op_type) != kernels_.end())) {
       return true;
     }
   }
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index aa1b6526cd5f8..e0df80157013e 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -1130,7 +1130,7 @@ void ExpandInferMeta(const MetaTensor& x,
       std::max(static_cast<size_t>(x_dims.size()), expand_shape.size());
   std::vector<int64_t> out_shape(out_rank);
   for (int i = 0; i < static_cast<int>(expand_shape.size()); ++i) {
-    if (x_dims[i] == -1) {
+    if (x_dims[i] == -1) {  // NOLINT
       out_shape[i] = -1;
     } else if (expand_shape[i] == -1) {
       if (static_cast<int>(x_dims.size()) > i) {
diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc
index 5ccb5ad8c43b4..d8383b45beb79 100644
--- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc
@@ -63,10 +63,8 @@ void DiagonalGradKernel(const Context& dev_ctx,
     idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_));
 
     bool flag = false;
-    if (offset_ == 0 && axis1_dim == axis2_dim) {
-      idx_dim.push_back(axis1_dim);
-      flag = true;
-    } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) {
+    if ((offset_ == 0 && axis1_dim == axis2_dim) ||
+        (offset_ > 0 && (axis1_dim + offset_) == axis2_dim)) {
       idx_dim.push_back(axis1_dim);
       flag = true;
     } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) {
diff --git a/paddle/phi/kernels/cpu/generate_proposals_kernel.cc b/paddle/phi/kernels/cpu/generate_proposals_kernel.cc
index 2e468ef2d07ff..e9764035613ed 100644
--- a/paddle/phi/kernels/cpu/generate_proposals_kernel.cc
+++ b/paddle/phi/kernels/cpu/generate_proposals_kernel.cc
@@ -52,13 +52,7 @@ void ClipTiledBoxes(const phi::CPUContext& ctx,
   T im_h =
       is_scale ? round(im_info_data[0] / im_info_data[2]) : im_info_data[0];
   for (int64_t i = 0; i < input_boxes.numel(); ++i) {
-    if (i % 4 == 0) {
-      out_data[i] =
-          std::max(std::min(input_boxes_data[i], im_w - offset), zero);
-    } else if (i % 4 == 1) {
-      out_data[i] =
-          std::max(std::min(input_boxes_data[i], im_h - offset), zero);
-    } else if (i % 4 == 2) {
+    if ((i % 4 == 0) || (i % 4 == 2)) {
       out_data[i] =
           std::max(std::min(input_boxes_data[i], im_w - offset), zero);
     } else {
diff --git a/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc
index 0ca3be62a3971..fac19f142dffc 100644
--- a/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/send_ue_recv_grad_kernel.cc
@@ -256,7 +256,7 @@ void CalculateEGrad(const T* out_grad_data,
       for (int64_t j = 0; j < bcast.out_len; j++) {
         int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j;
         int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j;
-        if (message_op == "ADD") {
+        if (message_op == "ADD") {  // NOLINT
 #ifdef PADDLE_WITH_MKLML
 #pragma omp atomic
 #endif
@@ -283,7 +283,7 @@ void CalculateEGrad(const T* out_grad_data,
       for (int64_t j = 0; j < bcast.out_len; j++) {
         int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j;
         int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j;
-        if (message_op == "ADD") {
+        if (message_op == "ADD") {  // NOLINT
 #ifdef PADDLE_WITH_MKLML
 #pragma omp atomic
 #endif
diff --git a/paddle/phi/kernels/funcs/vol2col.cc b/paddle/phi/kernels/funcs/vol2col.cc
index 0f411b8894ce9..e505fcb3de337 100644
--- a/paddle/phi/kernels/funcs/vol2col.cc
+++ b/paddle/phi/kernels/funcs/vol2col.cc
@@ -66,7 +66,7 @@ class Vol2ColFunctor<phi::CPUContext, T> {
 
     // changed
     bool paddings_size_is_6 = (paddings.size() == 6);
-    int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
+    int pad_d_forth = paddings[0];
     int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
     int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
     int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
@@ -191,7 +191,7 @@ class Col2VolFunctor<phi::CPUContext, T> {
         input_channels * filter_depth * filter_height * filter_width;
 
     bool paddings_size_is_6 = (paddings.size() == 6);
-    int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
+    int pad_d_forth = paddings[0];
     int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
     int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
     int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];

From 4c856f9d714999ade2cea66728c8e498067c5c1d Mon Sep 17 00:00:00 2001
From: Xianduo Li <30922914+lxd-cumt@users.noreply.github.com>
Date: Thu, 21 Sep 2023 16:33:11 +0800
Subject: [PATCH 037/115] [PRIM][PIR]Migrate prim rules (#57554)

* fix bugs of generating Op::Build when Op has optional tensor

* add default constructor for IrMetaTensor

* fix bugs

* polish guard

* pir support prim gelu and rsqrt

* support prim bwd ops

* migrate vjp rules of cast,add,multiply,elementwise_pow

* add cast as primitive op

* fix bugs in elementwise_pow_grad

* add test for cast_grad

* add test for elementwise_add_grad

* add test for elementwise_mul_grad

* add test for elementwise_pow_grad

* fix bugs

* fix bugs

* support pir prim backward ops

* refien

* fix bug

* migrate layer_norm custom vjp rules to pir

* fix bugs in ir_backward

* fix backward , scope, and concat_grad prim

* add layer_norm fwd decompose logic

* fix pow

* change _use_new_ir_api to in_pir_mode

* add _static_guard

* fix

* fix executor cuda700 error caused by full and full_like

* refine

* add vjp rules

* fix bugs

* add scope

* add test

* add add op prim rules

---------

Co-authored-by: YuanRisheng <yuanrisheng@baidu.com>
Co-authored-by: cyber-pioneer <chenzhuo@tju.edu.cn>
Co-authored-by: Charles-hit <wanghao107@baidu.com>
Co-authored-by: zhangbo9674 <zhangbo54@baidu.com>
---
 paddle/fluid/primitive/codegen/gen.py         |  18 +-
 .../rule/vjp/generated/generated_vjp.cc.j2    |   2 +-
 paddle/fluid/primitive/primitive.yaml         |   1 +
 paddle/fluid/primitive/rule/vjp/details.h     | 389 ++++++++++++++++--
 paddle/fluid/pybind/ir.cc                     |   4 +
 paddle/phi/api/yaml/legacy_backward.yaml      |   2 +-
 python/paddle/autograd/ir_backward.py         |   5 +-
 python/paddle/decomposition/rules.py          |  80 ++++
 python/paddle/tensor/creation.py              |   9 +-
 test/legacy_test/prim_op_test.py              |   9 +-
 test/legacy_test/test_activation_op.py        |  79 +++-
 test/legacy_test/test_cast_op.py              |  10 +-
 test/legacy_test/test_concat_op.py            | 162 +++++++-
 test/legacy_test/test_elementwise_add_op.py   |  10 +
 test/legacy_test/test_elementwise_mul_op.py   |  19 +-
 test/legacy_test/test_elementwise_pow_op.py   |  19 +-
 test/legacy_test/test_layer_norm_op.py        |  56 ++-
 test/legacy_test/test_reshape_op.py           |  19 +-
 test/legacy_test/test_split_op.py             |  25 +-
 test/legacy_test/test_sum_op.py               |  31 +-
 test/legacy_test/test_transpose_op.py         |  40 +-
 21 files changed, 882 insertions(+), 107 deletions(-)

diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index f9a920730967d..e0eeeb10a3a4d 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -72,8 +72,20 @@
 ]
 
 
-PRIM_VJP = ['divide_grad', 'sum_grad']  # vjp list of primitive op
-CUSTOM_VJP = ['gelu_grad']  # custom vjp list of composite op
+PRIM_VJP = [
+    'divide_grad',
+    'sum_grad',
+    'cast_grad',
+    'add_grad',
+    'multiply_grad',
+    'elementwise_pow_grad',
+    'reshape_grad',
+    'split_grad',
+    'tanh_grad',
+    'transpose_grad',
+    'concat_grad',
+]  # vjp list of primitive op
+CUSTOM_VJP = ['gelu_grad', 'layer_norm_grad']  # custom vjp list of composite op
 VJP_COMPS = PRIM_VJP + CUSTOM_VJP
 
 BACKENDS = [
@@ -149,6 +161,8 @@
     'embedding_grad',
     'sqrt',
     'uniform',
+    'split',
+    'transpose',
 ]
 
 
diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
index 1ab275ceaecbf..6737a73d69eb5 100644
--- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
+++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
@@ -106,7 +106,7 @@ paddle::Tensor* {{api.outputs[i].name}} = !stop_gradients[{{i}}][0] ? &vjp_res[{
     {% else %}
 std::vector<paddle::Tensor*> {{api.outputs[i].name}}(stop_gradients[{{i}}].size(), nullptr);
 for (size_t i=0; i< stop_gradients[{{i}}].size(); i++ ) {
-  {{api.outputs[i].name}} =  !stop_gradients[{{i}}][i] ?  &vjp_res[{{i}}][i] : nullptr;
+  {{api.outputs[i].name}}[i] =  !stop_gradients[{{i}}][i] ?  &vjp_res[{{i}}][i] : nullptr;
 }
     {% endif %}
   {% endfor %}
diff --git a/paddle/fluid/primitive/primitive.yaml b/paddle/fluid/primitive/primitive.yaml
index a42e2503e31ba..ccf9673bafba0 100644
--- a/paddle/fluid/primitive/primitive.yaml
+++ b/paddle/fluid/primitive/primitive.yaml
@@ -49,3 +49,4 @@
 - erf
 - tanh
 - full
+- cast
diff --git a/paddle/fluid/primitive/rule/vjp/details.h b/paddle/fluid/primitive/rule/vjp/details.h
index eb640a4643ed3..96b4d051b7cde 100644
--- a/paddle/fluid/primitive/rule/vjp/details.h
+++ b/paddle/fluid/primitive/rule/vjp/details.h
@@ -134,32 +134,371 @@ void gelu_grad(const Tensor& x,
   // Promote to fp32 when the input type is fp16 for keeping consistent with
   // phi kernel
 
-  // Scale only support fp32 attr in static graph mode, use elementwise_xx
-  // when precision is over fp32.
-  if (approximate) {
-    auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5;
-    auto kKappa = 0.044715;
-    auto x_sq = x * x;
-    auto x_cube = x_sq * x;
-    auto inner = kBeta * (x + kKappa * x_cube);
-    auto tanh_inner = tanh<T>(inner);
-
-    auto left = scale<T>(x, 0.5);
-    auto right = scale<T>(tanh_inner, 1., 1.);
-
-    auto left_derivative = scale<T>(right, 0.5);
-
-    auto tanh_derivative = scale<T>(tanh_inner * tanh_inner, -1., 1.);
-    auto inner_derivative = kBeta * (scale<T>(3 * kKappa * x_sq, 1., 1.));
-    auto right_derivative = left * tanh_derivative * inner_derivative;
-
-    set_output<T>(out_grad * (left_derivative + right_derivative), x_grad);
+  if (x.dtype() == phi::DataType::FLOAT16 ||
+      x.dtype() == phi::DataType::BFLOAT16) {
+    auto promoted_x = cast<T>(x, phi::DataType::FLOAT32);
+    auto promoted_out_grad = cast<T>(out_grad, phi::DataType::FLOAT32);
+    if (approximate) {
+      float kbeta = M_SQRT2 * M_2_SQRTPI * 0.5;
+      float kkappa = 0.044715;
+      auto x_sq = promoted_x * promoted_x;
+      auto x_cube = x_sq * promoted_x;
+      auto inner = kbeta * (promoted_x + kkappa * x_cube);
+      auto tanh_inner = tanh<T>(inner);
+
+      auto left = scale<T>(promoted_x, 0.5);
+      auto right = scale<T>(tanh_inner, 1., 1.);
+
+      auto left_derivative = scale<T>(right, 0.5);
+
+      auto tanh_derivative = scale<T>(tanh_inner * tanh_inner, -1., 1.);
+      auto inner_derivative = kbeta * (scale<T>(3 * kkappa * x_sq, 1., 1.));
+      auto right_derivative = left * tanh_derivative * inner_derivative;
+
+      set_output<T>(
+          cast<T>(promoted_out_grad * (left_derivative + right_derivative),
+                  x.type()),
+          x_grad);
+    } else {
+      float kalpha = M_SQRT1_2;
+      float kbeta = M_2_SQRTPI * M_SQRT1_2 * 0.5;
+      auto cdf = scale<T>(scale<T>(erf<T>(kalpha * promoted_x), 1., 1.), 0.5);
+      auto pdf = kbeta * exp<T>(scale<T>(promoted_x * promoted_x, -0.5));
+      set_output<T>(
+          cast<T>(promoted_out_grad * (cdf + promoted_x * pdf), x.type()),
+          x_grad);
+    }
   } else {
-    auto kAlpha = M_SQRT1_2;
-    auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5;
-    auto cdf = scale<T>(scale<T>(erf<T>(kAlpha * x), 1., 1.), 0.5);
-    auto pdf = kBeta * exp<T>(scale<T>(x * x, -0.5));
-    set_output<T>(out_grad * (cdf + x * pdf), x_grad);
+    // Scale only support fp32 attr in static graph mode, use elementwise_xx
+    // when precision is over fp32.
+    if (approximate) {
+      auto kBeta = M_SQRT2 * M_2_SQRTPI * 0.5;
+      auto kKappa = 0.044715;
+      auto x_sq = x * x;
+      auto x_cube = x_sq * x;
+      auto inner = kBeta * (x + kKappa * x_cube);
+      auto tanh_inner = tanh<T>(inner);
+
+      auto left = scale<T>(x, 0.5);
+      auto right = scale<T>(tanh_inner, 1., 1.);
+
+      auto left_derivative = scale<T>(right, 0.5);
+
+      auto tanh_derivative = scale<T>(tanh_inner * tanh_inner, -1., 1.);
+      auto inner_derivative = kBeta * (scale<T>(3 * kKappa * x_sq, 1., 1.));
+      auto right_derivative = left * tanh_derivative * inner_derivative;
+
+      set_output<T>(out_grad * (left_derivative + right_derivative), x_grad);
+    } else {
+      auto kAlpha = M_SQRT1_2;
+      auto kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5;
+      auto cdf = scale<T>(scale<T>(erf<T>(kAlpha * x), 1., 1.), 0.5);
+      auto pdf = kBeta * exp<T>(scale<T>(x * x, -0.5));
+      set_output<T>(out_grad * (cdf + x * pdf), x_grad);
+    }
+  }
+}
+
+template <typename T>
+void reshape_grad(const Tensor& x, const Tensor& grad_out, Tensor* grad_x) {
+  if (grad_x) {
+    auto grad_x_tmp = reshape<T>(grad_out, phi::vectorize(x.dims()));
+    set_output<T>(grad_x_tmp, grad_x);
+  }
+}
+
+template <typename T>
+void transpose_grad(const Tensor& grad_out,
+                    const std::vector<int>& perm,
+                    Tensor* grad_x) {
+  if (grad_x) {
+    std::vector<int> reverse_perm(perm);
+    // make origin ranks
+    for (int i = 0; i < static_cast<int>(perm.size()); ++i) {
+      if (perm[i] >= 0) {
+        reverse_perm[perm[i]] = i;
+      } else {
+        reverse_perm[perm[i] + perm.size()] = i;
+      }
+    }
+    auto grad_x_tmp = transpose<T>(grad_out, reverse_perm);
+    set_output<T>(grad_x_tmp, grad_x);
+  }
+}
+
+template <typename T>
+void tanh_grad(const Tensor& out, const Tensor& grad_out, Tensor* grad_x) {
+  if (!grad_x) return;
+  auto grad_x_tmp = grad_out * (1 - out * out);
+  set_output<T>(grad_x_tmp, grad_x);
+}
+
+template <typename T>
+void concat_grad(const std::vector<Tensor>& x,
+                 const Tensor& out_grad,
+                 const Scalar& axis,
+                 std::vector<Tensor*> x_grad) {
+  int axis_value = axis.to<int>();
+  int rank = x[0].dims().size();
+  if (axis_value < 0) {
+    axis_value = axis_value + rank;
+  }
+  axis_value = axis_value > 0 ? axis_value : 0;
+  std::vector<int> sections;
+  int x_num = x.size();
+  for (int i = 0; i < x_num; ++i) {
+    sections.push_back(x[i].dims()[axis_value]);
+  }
+  std::vector<Tensor> x_grad_tmp =
+      split<T>(out_grad, IntArray(sections), axis_value);
+  for (int i = 0; i < x_num; ++i) {
+    if (x_grad[i]) {
+      set_output<T>(x_grad_tmp.at(i), x_grad.at(i));
+    }
+  }
+}
+
+template <typename T>
+void split_grad(const std::vector<Tensor>& out_grad,
+                const Scalar& axis,
+                Tensor* x_grad) {
+  if (x_grad) {
+    auto grad = concat<T>(out_grad, axis);
+    set_output<T>(grad, x_grad);
+  }
+}
+
+template <typename T>
+void cast_grad(const Tensor& x, const Tensor& out_grad, Tensor* x_grad) {
+  if (x_grad) {
+    auto res = cast<T>(out_grad, x.dtype());
+    set_output<T>(res, x_grad);
+  }
+}
+
+template <typename T>
+void add_grad(const Tensor& x,
+              const Tensor& y,
+              const Tensor& out_grad,
+              int axis,
+              Tensor* dx,
+              Tensor* dy) {
+  if (dy) {
+    if (x.dims() != y.dims()) {
+      // Maybe need reduce here
+      phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims());
+      if (!reduce_dim.size()) {
+        set_output<T>(out_grad, dy);
+      } else {
+        auto dy_reduce_res =
+            out_grad.sum(phi::vectorize(reduce_dim), y.dtype(), false);
+        auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
+        set_output<T>(dy_tmp, dy);
+      }
+
+    } else {
+      set_output<T>(out_grad, dy);
+    }
+  }
+  if (dx) {
+    if (y.dims() != x.dims()) {
+      // Maybe need reduce here
+      auto reduce_dim = get_reduce_dims(x.dims(), y.dims());
+      if (!reduce_dim.size()) {
+        set_output<T>(out_grad, dx);
+      } else {
+        auto dx_reduce_res =
+            out_grad.sum(phi::vectorize(reduce_dim), x.dtype(), false);
+        auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
+        set_output<T>(dx_tmp, dx);
+      }
+    } else {
+      set_output<T>(out_grad, dx);
+    }
+  }
+}
+
+template <typename T>
+void multiply_grad(const Tensor& x,
+                   const Tensor& y,
+                   const Tensor& out_grad,
+                   int axis,
+                   Tensor* x_grad,
+                   Tensor* y_grad) {
+  if (x_grad) {
+    auto x_grad_unreduce = out_grad * y;
+    if (x_grad_unreduce.dims() != x.dims()) {
+      auto axes = get_reduce_dims_from_out(x_grad_unreduce.dims(), x.dims());
+      if (!axes.size()) {
+        set_output<T>(x_grad_unreduce, x_grad);
+      } else {
+        auto x_grad_reduced = x_grad_unreduce.sum(
+            phi::vectorize(axes), x_grad_unreduce.dtype(), false);
+        if (x_grad_reduced.dims().size() != x.dims().size()) {
+          x_grad_reduced = reshape<T>(x_grad_reduced, x.shape());
+        }
+        set_output<T>(x_grad_reduced, x_grad);
+      }
+    } else {
+      set_output<T>(x_grad_unreduce, x_grad);
+    }
+  }
+  if (y_grad) {
+    auto y_grad_unreduce = out_grad * x;
+    if (y_grad_unreduce.dims() != y.dims()) {
+      auto axes = get_reduce_dims_from_out(y_grad_unreduce.dims(), y.dims());
+      if (!axes.size()) {
+        set_output<T>(y_grad_unreduce, y_grad);
+      } else {
+        auto y_grad_reduced = y_grad_unreduce.sum(
+            phi::vectorize(axes), y_grad_unreduce.dtype(), false);
+        if (y_grad_reduced.dims().size() != y.dims().size()) {
+          y_grad_reduced = reshape<T>(y_grad_reduced, y.shape());
+        }
+        set_output<T>(y_grad_reduced, y_grad);
+      }
+    } else {
+      set_output<T>(y_grad_unreduce, y_grad);
+    }
+  }
+}
+
+template <typename T>
+void elementwise_pow_grad(const Tensor& x,
+                          const Tensor& y,
+                          const Tensor& out_grad,
+                          Tensor* dx,
+                          Tensor* dy) {
+  if (dy) {
+    // dy = lnx * x^y
+    auto lnx = log<T>(x);
+    auto x_pow_y = elementwise_pow<T>(x, y);
+    auto dy_res = lnx * x_pow_y * out_grad;
+    if (x.dims() != y.dims()) {
+      // Maybe need reduce here
+      phi::DDim reduce_dim = get_reduce_dims(y.dims(), x.dims());
+      if (!reduce_dim.size()) {
+        set_output<T>(dy_res, dy);
+      } else {
+        auto dy_reduce_res =
+            dy_res.sum(phi::vectorize(reduce_dim), y.dtype(), false);
+        auto dy_tmp = reshape<T>(dy_reduce_res, phi::vectorize(y.dims()));
+        set_output<T>(dy_tmp, dy);
+      }
+    } else {
+      set_output<T>(dy_res, dy);
+    }
+  }  // indicate we will compute dy
+  if (dx) {
+    // dx = y * x^(y-1)
+    auto tmp_z = y - 1.0;
+    auto x_pow_z = elementwise_pow<T>(x, tmp_z);
+    auto dx_res = y * x_pow_z * out_grad;
+    if (y.dims() != x.dims()) {
+      // Maybe need reduce here
+      auto reduce_dim = get_reduce_dims(x.dims(), y.dims());
+      if (!reduce_dim.size()) {
+        set_output<T>(dx_res, dx);
+      } else {
+        auto dx_reduce_res =
+            dx_res.sum(phi::vectorize(reduce_dim), x.dtype(), false);
+        auto dx_tmp = reshape<T>(dx_reduce_res, phi::vectorize(x.dims()));
+        set_output<T>(dx_tmp, dx);
+      }
+
+    } else {
+      set_output<T>(dx_res, dx);
+    }
+  }  // indicate we will compute dx
+}
+
+template <typename T>
+void layer_norm_grad(const Tensor& x,
+                     const paddle::optional<Tensor>& scale,
+                     const paddle::optional<Tensor>& bias,
+                     const Tensor& mean,
+                     const Tensor& variance,
+                     const Tensor& out_grad,
+                     float epsilon,
+                     int begin_norm_axis,
+                     Tensor* x_grad,
+                     Tensor* scale_grad,
+                     Tensor* bias_grad) {
+  auto x_dims = x.dims();
+  auto shape_1 = 1;  // front part
+  auto shape_2 = 1;  // back part
+  for (int i = 0; i < begin_norm_axis; ++i) {
+    shape_1 *= x_dims[i];
+  }
+  for (int i = begin_norm_axis; i < x.dims().size(); ++i) {
+    shape_2 *= x_dims[i];
+  }
+  auto scale_ptr = scale.get_ptr();
+  auto bias_ptr = bias.get_ptr();
+
+  auto x_cast = reshape<T>(x, std::vector<int64_t>({shape_1, shape_2}));
+  auto out_grad_cast =
+      reshape<T>(out_grad, std::vector<int64_t>({shape_1, shape_2}));
+  auto mean_ = reshape<T>(mean, std::vector<int64_t>({shape_1, 1}));
+  auto variance_ = reshape<T>(variance, std::vector<int64_t>({shape_1, 1}));
+
+  Tensor scale_cast;
+  if (scale_ptr) {
+    scale_cast = reshape<T>(*scale_ptr, std::vector<int64_t>({1, shape_2}));
+  }
+
+  // cast dtype to float32 if dtype =float16 or bfloat16
+
+  auto x_sub_mean = x_cast - mean_;          // M,N
+  auto tmp = (1.0 / (variance_ + epsilon));  // M,1
+  // auto sqrt_var_1 = sqrt<T>(tmp);            // M,1
+  auto sqrt_var_1 = elementwise_pow<T>(
+      tmp, full<T>(phi::vectorize(tmp.dims()), 0.5, tmp.dtype()));
+  auto x_sub_mean_mul_sqrt_var_1 = x_sub_mean * sqrt_var_1;
+
+  if (x_grad) {
+    auto out_grad_scale = out_grad_cast;  // M,N
+    if (scale_ptr) {
+      out_grad_scale = out_grad_cast * scale_cast;  // M,N * 1,N = M,N
+    }
+
+    auto dx_end = sqrt_var_1 * out_grad_scale;
+    auto d_mean =
+        dx_end.sum(std::vector<int64_t>({1}), x_cast.dtype(), true);  // M,1
+
+    auto d_std_1 =
+        (tmp * x_sub_mean * out_grad_scale)
+            .sum(std::vector<int64_t>({1}), x_cast.dtype(), true);  // M,1
+    auto d_std = d_std_1 * x_sub_mean_mul_sqrt_var_1;  // M,1 * M,N = M,N
+
+    auto d_mean_d_std = (1.0 / shape_2) * (d_mean + d_std);
+    auto x_grad_tmp = dx_end - d_mean_d_std;
+    x_grad_tmp = reshape<T>(x_grad_tmp, phi::vectorize(x.dims()));
+
+    set_output<T>(x_grad_tmp, x_grad);
+  }
+
+  if (scale_grad) {
+    if (scale_ptr) {
+      auto scale_grad_tmp =
+          (x_sub_mean_mul_sqrt_var_1 * out_grad_cast)
+              .sum(std::vector<int64_t>({0}), x_cast.dtype(), true);
+      scale_grad_tmp = reshape<T>(scale_grad_tmp, scale_ptr->shape());
+      set_output<T>(scale_grad_tmp, scale_grad);
+    } else {
+      scale_grad = nullptr;
+    }
+  }
+
+  if (bias_grad) {
+    if (bias_ptr) {
+      auto bias_grad_tmp =
+          out_grad_cast.sum(std::vector<int64_t>({0}), x_cast.dtype(), true);
+      bias_grad_tmp = reshape<T>(bias_grad_tmp, bias_ptr->shape());
+      set_output<T>(bias_grad_tmp, bias_grad);
+    } else {
+      bias_grad = nullptr;
+    }
   }
 }
 
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 22fd0f40a36b5..80ecad93997db 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -464,6 +464,10 @@ void BindOpResult(py::module *m) {
            [](OpResult &self, OpResult &other) {
              return paddle::dialect::add(self, other);
            })
+      .def("__add__",
+           [](OpResult &self, float &bias) {
+             return paddle::dialect::scale(self, 1.0, bias, false);
+           })
       .def("__sub__",
            [](OpResult &self, OpResult &other) {
              return paddle::dialect::subtract(self, other);
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index a8260bb816865..9b5db92c54700 100755
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -224,7 +224,7 @@
   infer_meta :
     func : GeneralBinaryGradInferMeta
     param: [x, y]
-  composite : elementwise_pow_grad(x, y, out_grad, axis, x_grad, y_grad)
+  composite : elementwise_pow_grad(x, y, out_grad, x_grad, y_grad)
   kernel :
     func : elementwise_pow_grad
 
diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py
index e33c3a38bff74..f8a2aae71b0cd 100644
--- a/python/paddle/autograd/ir_backward.py
+++ b/python/paddle/autograd/ir_backward.py
@@ -94,7 +94,6 @@ def prepare_grad_outputs(grad_outputs, outputs, state):
                 dtype=output.dtype,
             )
             fillop = output_grad.get_defining_op()
-
             update_bwdop_structure(
                 backward_ops,
                 state.op_to_opgrad[output.get_defining_op()],
@@ -138,14 +137,14 @@ def prepare_grad_outputs(grad_outputs, outputs, state):
                     0.0,
                     opresult.dtype,
                 )
-                fillop = grad.get_defining_op()
+                fillop = grad_value.get_defining_op()
 
                 update_bwdop_structure(
                     backward_ops,
                     state.op_to_opgrad[opresult.get_defining_op()],
                     fillop,
                 )
-                state.value_to_valuegrad[opresult] = [grad_value]
+                state.value_to_valuegrad[opresult] = [[grad_value]]
 
                 visited_output.add(opresult)
 
diff --git a/python/paddle/decomposition/rules.py b/python/paddle/decomposition/rules.py
index e9d04ede061ce..26a4ae73debd0 100644
--- a/python/paddle/decomposition/rules.py
+++ b/python/paddle/decomposition/rules.py
@@ -63,3 +63,83 @@ def gelu_composite(x, approximate):
         cdf = half * (one + _ir_ops.erf(x * full(x.shape, M_SQRT1_2, x.dtype)))
         out = x * cdf
         return out
+
+
+@register_decomp('pd_op.rsqrt')
+def rsqrt_composite(x):
+    """define composite rule of op rsqrt."""
+    # rsqrt(x) = x^(-0.5)
+    is_amp = False
+    from paddle.base.data_feeder import convert_dtype
+
+    dtype = convert_dtype(x.dtype)
+    if dtype in ["float16", "uint16"]:
+        is_amp = True
+        x = cast(x, "float32")
+    y = full(x.shape if len(x.shape) == 0 else [1], -0.5, x.dtype)
+    res = pow(x, y)
+    return res if not is_amp else cast(res, dtype)
+
+
+@register_decomp('pd_op.pow')
+def pow_composite(x, y):
+    """
+    define composite rule of op pow
+    res = x^y
+    """
+    is_amp = False
+    from paddle.base.data_feeder import convert_dtype
+
+    dtype = convert_dtype(x.dtype)
+    if dtype in ["float16", "uint16"]:
+        is_amp = True
+        x = cast(x, "float32")
+
+    if isinstance(y, (int, float)):
+        y = full(x.shape if len(x.shape) == 0 else [1], y, x.dtype)
+    res = pow(x, y)
+    if is_amp:
+        res = cast(res, dtype)
+    return res
+
+
+@register_decomp('pd_op.layer_norm')
+def layernorm_composite(x, scale, bias, epsilon, begin_norm_axis):
+    """
+    define composite rule of op layer_norm
+    out = (x - mean(x)) / sqrt(var + epsilon))
+    var = mean((x-mean(x))^2)
+    """
+    is_amp = False
+    from paddle.base.data_feeder import convert_dtype
+
+    dtype = convert_dtype(x.dtype)
+    if dtype in ["float16", "uint16"]:
+        is_amp = True
+        x = cast(x, "float32")
+        scale = cast(scale, "float32") if scale else scale
+        bias = cast(bias, "float32") if bias else bias
+
+    axis = tuple(range(begin_norm_axis, len(x.shape)))
+    mean_ = mean(x, axis=axis, keepdim=True)
+    difference = x - mean_
+    var_tmp1 = difference * difference
+    variance = mean(var_tmp1, axis=axis, keepdim=True)
+    var_tmp3 = variance + epsilon
+    rsqrt_var = rsqrt(var_tmp3)
+    out = difference * rsqrt_var
+
+    if scale is not None:
+        if x.shape[begin_norm_axis:] != scale.shape:
+            scale = reshape(scale, x.shape[begin_norm_axis:])
+        out = out * scale
+    if bias is not None:
+        if x.shape[begin_norm_axis:] != bias.shape:
+            bias = reshape(bias, x.shape[begin_norm_axis:])
+        out = out + bias
+
+    mean_ = reshape(mean_, [-1])
+    variance = reshape(variance, [-1])
+    if is_amp:
+        out = cast(out, dtype)
+    return out, mean_, variance
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index c3e814cc906d4..f764fbb45996d 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -833,8 +833,7 @@ def full_like(x, fill_value, dtype=None, name=None):
     if in_dynamic_mode():
         return _C_ops.full_like(x, fill_value, dtype, x.place)
     elif in_pir_mode():
-        place = _current_expected_place()
-        return _C_ops.full_like(x, fill_value, dtype, place)
+        return _C_ops.full_like(x, fill_value, dtype, core.Place())
     else:
         helper = LayerHelper("full_like", **locals())
         check_variable_and_dtype(
@@ -881,7 +880,11 @@ def full_like(x, fill_value, dtype=None, name=None):
 
 def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
     if in_dynamic_or_pir_mode():
-        place = _current_expected_place()
+        place = (
+            _current_expected_place()
+            if not in_pir_mode()
+            else paddle.base.core.Place()
+        )
         if force_cpu:
             place = core.CPUPlace()
         if isinstance(shape, (list, tuple)):
diff --git a/test/legacy_test/prim_op_test.py b/test/legacy_test/prim_op_test.py
index e472c70813c73..f28957cdc89be 100644
--- a/test/legacy_test/prim_op_test.py
+++ b/test/legacy_test/prim_op_test.py
@@ -22,7 +22,8 @@
 
 import paddle
 from paddle.autograd.ir_backward import grad as ir_grad
-from paddle.base import core
+from paddle.base import Scope, core
+from paddle.base.executor import scope_guard
 from paddle.base.framework import (
     OpProtoHolder,
     _dygraph_tracer,
@@ -409,7 +410,8 @@ def check(self):
                 self.check_jit_comp_with_cinn()
         else:
             if self.enable_check_static_comp:
-                self.check_static_comp()
+                with scope_guard(Scope()):
+                    self.check_static_comp()
 
     def get_kernel_sig(self):
         with dygraph_guard():
@@ -870,7 +872,8 @@ def check(self):
                 self.check_jit_comp_with_cinn()
         else:
             if self.enable_check_static_comp:
-                self.check_static_comp()
+                with scope_guard(Scope()):
+                    self.check_static_comp()
 
     def get_output_dict(self, np_outputs, api_outputs, outputs_sig):
         assert len(api_outputs) <= len(outputs_sig), (
diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py
index 8b16ee5750eac..8d1ee1ac5091a 100644
--- a/test/legacy_test/test_activation_op.py
+++ b/test/legacy_test/test_activation_op.py
@@ -693,9 +693,21 @@ def test_check_grad(self):
             return
         # TODO(ScottWong98): set `check_prim=False` when `fill_any_like` supports `complex` dtype
         if self.dtype == np.complex64 or self.dtype == np.complex128:
-            self.check_grad(['X'], 'Out', check_prim=False, check_new_ir=False)
+            self.check_grad(
+                ['X'],
+                'Out',
+                check_prim=False,
+                check_prim_pir=False,
+                check_new_ir=False,
+            )
         else:
-            self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
+            self.check_grad(
+                ['X'],
+                'Out',
+                check_prim=True,
+                check_new_ir=True,
+                check_prim_pir=True,
+            )
 
     def init_dtype(self):
         # TODO If dtype is float64, the output (Out) has diff at CPUPlace
@@ -1615,7 +1627,9 @@ def if_enable_cinn(self):
         pass
 
     def test_check_output(self):
-        self.check_output(check_prim=True, check_new_ir=True)
+        self.check_output(
+            check_prim=True, check_new_ir=True, check_prim_pir=True
+        )
 
     def test_check_grad(self):
         if self.dtype == np.float16:
@@ -1626,6 +1640,7 @@ def test_check_grad(self):
             max_relative_error=0.0005,
             check_prim=True,
             check_new_ir=True,
+            check_prim_pir=True,
         )
 
 
@@ -2480,12 +2495,22 @@ def setUp(self):
         self.cinn_atol = 1e-8
 
     def test_check_output(self):
-        self.check_output(check_prim=True, check_new_ir=True)
+        self.check_output(
+            check_prim=True,
+            check_new_ir=True,
+            check_prim_pir=False,
+        )
 
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_new_ir=True,
+            check_prim_pir=True,
+        )
 
 
 class TestGelu(TestActivation):
@@ -2518,12 +2543,20 @@ def if_enable_cinn(self):
         pass
 
     def test_check_output(self):
-        self.check_output(check_prim=True, check_new_ir=True)
+        self.check_output(
+            check_prim=True, check_new_ir=True, check_prim_pir=False
+        )
 
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_new_ir=True,
+            check_prim_pir=True,
+        )
 
 
 class TestGelu_ZeroDim(TestGelu):
@@ -3575,12 +3608,20 @@ def if_enable_cinn(self):
         pass
 
     def test_check_output(self):
-        self.check_output(check_prim=True, check_new_ir=True)
+        self.check_output(
+            check_prim=True, check_prim_pir=True, check_new_ir=True
+        )
 
     def test_check_grad(self):
         if self.dtype == np.float16:
             return
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
+        )
 
 
 class TestPow_ZeroDim(TestPow):
@@ -4397,6 +4438,7 @@ def create_test_act_fp16_class(
     grad_check=True,
     check_dygraph=True,
     check_prim=False,
+    check_prim_pir=False,
     enable_cinn=False,
     grad_atol=1e-2,
     **kwargs
@@ -4425,6 +4467,7 @@ def test_check_output(self):
                     atol=atol,
                     check_dygraph=check_dygraph,
                     check_prim=check_prim,
+                    check_prim_pir=check_prim_pir,
                 )
 
         def test_check_grad(self):
@@ -4437,6 +4480,7 @@ def test_check_grad(self):
                     'Out',
                     check_dygraph=check_dygraph,
                     check_prim=check_prim,
+                    check_prim_pir=check_prim_pir,
                     max_relative_error=grad_atol,
                 )
 
@@ -4451,7 +4495,9 @@ def test_check_grad(self):
 create_test_act_fp16_class(TestSigmoid, check_prim=True, enable_cinn=True)
 create_test_act_fp16_class(TestSilu, check_prim=True, enable_cinn=True)
 create_test_act_fp16_class(TestLogSigmoid)
-create_test_act_fp16_class(TestTanh, check_prim=True, enable_cinn=True)
+create_test_act_fp16_class(
+    TestTanh, check_prim=True, check_prim_pir=True, enable_cinn=True
+)
 create_test_act_fp16_class(TestTanhshrink)
 create_test_act_fp16_class(TestHardShrink)
 create_test_act_fp16_class(TestSoftshrink)
@@ -4478,6 +4524,7 @@ def test_check_grad(self):
 create_test_act_fp16_class(
     TestGelu,
     check_prim=True,
+    check_prim_pir=True,
     check_new_ir=True,
     enable_cinn=True,
     rev_comp_rtol=1e-3,
@@ -4499,7 +4546,7 @@ def test_check_grad(self):
 create_test_act_fp16_class(TestLog10)
 create_test_act_fp16_class(TestLog1p)
 create_test_act_fp16_class(TestSquare)
-create_test_act_fp16_class(TestPow, check_prim=True)
+create_test_act_fp16_class(TestPow, check_prim=True, check_prim_pir=True)
 create_test_act_fp16_class(TestPow_API)
 create_test_act_fp16_class(TestSTanh)
 create_test_act_fp16_class(TestSoftplus)
@@ -4521,7 +4568,11 @@ def test_check_grad(self):
 )
 create_test_act_fp16_class(TestLeakyRelu_ZeroDim, check_prim=True)
 create_test_act_fp16_class(
-    TestRsqrt, check_prim=True, enable_cinn=True, check_new_ir=True
+    TestRsqrt,
+    check_prim=True,
+    enable_cinn=True,
+    check_new_ir=True,
+    check_prim_pir=True,
 )
 
 
@@ -4645,7 +4696,9 @@ def test_check_grad(self):
 create_test_act_bf16_class(TestLeakyReluAlpha2, check_prim=True)
 create_test_act_bf16_class(TestLeakyReluAlpha3, check_prim=True)
 create_test_act_bf16_class(TestLeakyRelu_ZeroDim, check_prim=True)
-create_test_act_bf16_class(TestRsqrt, check_prim=True, check_new_ir=True)
+create_test_act_bf16_class(
+    TestRsqrt, check_prim=True, check_new_ir=True, check_prim_pir=True
+)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py
index 47bc23d76f601..448629431d0b1 100644
--- a/test/legacy_test/test_cast_op.py
+++ b/test/legacy_test/test_cast_op.py
@@ -52,10 +52,16 @@ def init_shapes(self):
         self.input_shape = [10, 10]
 
     def test_check_output(self):
-        self.check_output(check_new_ir=True)
+        self.check_output(check_prim_pir=True, check_new_ir=True)
 
     def test_grad(self):
-        self.check_grad(['X'], ['Out'], check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['X'],
+            ['Out'],
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
+        )
 
 
 class TestCastOpFp32ToFp64_ZeroDim(TestCastOpFp32ToFp64):
diff --git a/test/legacy_test/test_concat_op.py b/test/legacy_test/test_concat_op.py
index dc9702beeb014..153e1cc06d308 100644
--- a/test/legacy_test/test_concat_op.py
+++ b/test/legacy_test/test_concat_op.py
@@ -61,18 +61,51 @@ def test_check_grad(self):
         if self.dtype == np.uint16:
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
-                place, ['x0'], 'Out', check_prim=True, check_new_ir=True
+                place,
+                ['x0'],
+                'Out',
+                check_prim=True,
+                check_new_ir=True,
+                check_prim_pir=True,
             )
             self.check_grad_with_place(
-                place, ['x1'], 'Out', check_prim=True, check_new_ir=True
+                place,
+                ['x1'],
+                'Out',
+                check_prim=True,
+                check_new_ir=True,
+                check_prim_pir=True,
             )
             self.check_grad_with_place(
-                place, ['x2'], 'Out', check_prim=True, check_new_ir=True
+                place,
+                ['x2'],
+                'Out',
+                check_prim=True,
+                check_new_ir=True,
+                check_prim_pir=True,
             )
         else:
-            self.check_grad(['x0'], 'Out', check_prim=True, check_new_ir=True)
-            self.check_grad(['x1'], 'Out', check_prim=True, check_new_ir=True)
-            self.check_grad(['x2'], 'Out', check_prim=True, check_new_ir=True)
+            self.check_grad(
+                ['x0'],
+                'Out',
+                check_prim=True,
+                check_new_ir=True,
+                check_prim_pir=True,
+            )
+            self.check_grad(
+                ['x1'],
+                'Out',
+                check_prim=True,
+                check_new_ir=True,
+                check_prim_pir=True,
+            )
+            self.check_grad(
+                ['x2'],
+                'Out',
+                check_prim=True,
+                check_new_ir=True,
+                check_prim_pir=True,
+            )
 
     def init_test_data(self):
         if self.dtype == np.uint16:
@@ -213,9 +246,27 @@ def test_check_output(self):
         self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['x0'], 'Out', check_prim=True, check_new_ir=True)
-        self.check_grad(['x1'], 'Out', check_prim=True, check_new_ir=True)
-        self.check_grad(['x2'], 'Out', check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['x0'],
+            'Out',
+            check_prim=True,
+            check_new_ir=True,
+            check_prim_pir=True,
+        )
+        self.check_grad(
+            ['x1'],
+            'Out',
+            check_prim=True,
+            check_new_ir=True,
+            check_prim_pir=True,
+        )
+        self.check_grad(
+            ['x2'],
+            'Out',
+            check_prim=True,
+            check_new_ir=True,
+            check_prim_pir=True,
+        )
 
     def init_test_data(self):
         if self.dtype == np.uint16:
@@ -301,8 +352,10 @@ def create_test_fp16(parent):
     class TestConcatFp16(parent):
         def setUp(self):
             self.op_type = "concat"
+            self.prim_op_type = "prim"
             self.python_api = paddle.concat
             self.public_python_api = paddle.concat
+            self.enable_cinn = False
             self.dtype = self.get_dtype()
             self.init_test_data()
             self.inputs = {
@@ -332,18 +385,51 @@ def test_check_grad(self):
             if self.dtype == np.uint16:
                 place = core.CUDAPlace(0)
                 self.check_grad_with_place(
-                    place, ['x0'], 'Out', check_new_ir=True
+                    place,
+                    ['x0'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
                 )
                 self.check_grad_with_place(
-                    place, ['x1'], 'Out', check_new_ir=True
+                    place,
+                    ['x1'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
                 )
                 self.check_grad_with_place(
-                    place, ['x2'], 'Out', check_new_ir=True
+                    place,
+                    ['x2'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
                 )
             else:
-                self.check_grad(['x0'], 'Out', check_new_ir=True)
-                self.check_grad(['x1'], 'Out', check_new_ir=True)
-                self.check_grad(['x2'], 'Out', check_new_ir=True)
+                self.check_grad(
+                    ['x0'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
+                )
+                self.check_grad(
+                    ['x1'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
+                )
+                self.check_grad(
+                    ['x2'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
+                )
 
         def get_dtype(self):
             return np.float16
@@ -371,6 +457,7 @@ def create_test_bf16(parent):
     class TestConcatBf16(parent):
         def setUp(self):
             self.op_type = "concat"
+            self.prim_op_type = "prim"
             self.python_api = paddle.concat
             self.public_python_api = paddle.concat
             self.enable_cinn = False
@@ -403,18 +490,51 @@ def test_check_grad(self):
             if self.dtype == np.uint16:
                 place = core.CUDAPlace(0)
                 self.check_grad_with_place(
-                    place, ['x0'], 'Out', check_new_ir=True
+                    place,
+                    ['x0'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
                 )
                 self.check_grad_with_place(
-                    place, ['x1'], 'Out', check_new_ir=True
+                    place,
+                    ['x1'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
                 )
                 self.check_grad_with_place(
-                    place, ['x2'], 'Out', check_new_ir=True
+                    place,
+                    ['x2'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
                 )
             else:
-                self.check_grad(['x0'], 'Out', check_new_ir=True)
-                self.check_grad(['x1'], 'Out', check_new_ir=True)
-                self.check_grad(['x2'], 'Out', check_new_ir=True)
+                self.check_grad(
+                    ['x0'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
+                )
+                self.check_grad(
+                    ['x1'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
+                )
+                self.check_grad(
+                    ['x2'],
+                    'Out',
+                    check_new_ir=True,
+                    check_prim=True,
+                    check_prim_pir=True,
+                )
 
         def get_dtype(self):
             return np.uint16
diff --git a/test/legacy_test/test_elementwise_add_op.py b/test/legacy_test/test_elementwise_add_op.py
index 8bacfc9a45cfd..546e9d2555421 100644
--- a/test/legacy_test/test_elementwise_add_op.py
+++ b/test/legacy_test/test_elementwise_add_op.py
@@ -57,6 +57,7 @@ def test_check_output(self):
         self.check_output(
             check_dygraph=self.check_dygraph(),
             check_prim=self.check_prim,
+            check_prim_pir=self.check_dygraph(),
             check_new_ir=self.check_dygraph(),
         )
 
@@ -69,6 +70,7 @@ def test_check_grad_normal(self):
             'Out',
             check_dygraph=self.check_dygraph(),
             check_prim=self.check_prim,
+            check_prim_pir=self.check_dygraph(),
             check_new_ir=self.check_dygraph(),
         )
 
@@ -82,6 +84,7 @@ def test_check_grad_ingore_x(self):
             no_grad_set=set("X"),
             check_dygraph=self.check_dygraph(),
             check_prim=self.check_prim,
+            check_prim_pir=self.check_dygraph(),
             check_new_ir=self.check_dygraph(),
         )
 
@@ -95,6 +98,7 @@ def test_check_grad_ingore_y(self):
             no_grad_set=set('Y'),
             check_dygraph=self.check_dygraph(),
             check_prim=self.check_prim,
+            check_prim_pir=self.check_dygraph(),
             check_new_ir=self.check_dygraph(),
         )
 
@@ -152,6 +156,7 @@ def test_check_output(self):
             atol=1e-3,
             check_dygraph=self.check_dygraph(),
             check_prim=self.check_prim,
+            check_prim_pir=self.check_dygraph(),
             check_new_ir=self.check_dygraph(),
         )
 
@@ -167,6 +172,7 @@ def test_check_grad_ingore_x(self):
             'Out',
             no_grad_set=set("X"),
             check_prim=True,
+            check_prim_pir=True,
             check_new_ir=True,
         )
 
@@ -178,6 +184,7 @@ def test_check_grad_ingore_y(self):
             'Out',
             no_grad_set=set('Y'),
             check_prim=True,
+            check_prim_pir=True,
             check_new_ir=True,
         )
 
@@ -221,6 +228,7 @@ def test_check_grad_normal(self):
             ['X', 'Y'],
             'Out',
             check_prim=True,
+            check_prim_pir=True,
             check_new_ir=True,
         )
 
@@ -232,6 +240,7 @@ def test_check_grad_ingore_x(self):
             'Out',
             no_grad_set=set("X"),
             check_prim=True,
+            check_prim_pir=True,
             check_new_ir=True,
         )
 
@@ -243,6 +252,7 @@ def test_check_grad_ingore_y(self):
             'Out',
             no_grad_set=set('Y'),
             check_prim=True,
+            check_prim_pir=True,
             check_new_ir=True,
         )
 
diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py
index 8013eb0baaf15..fde11e09fbe14 100644
--- a/test/legacy_test/test_elementwise_mul_op.py
+++ b/test/legacy_test/test_elementwise_mul_op.py
@@ -49,6 +49,7 @@ def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         self.check_output(
             check_dygraph=(not self.use_mkldnn),
+            check_prim_pir=(not self.use_mkldnn),
             check_new_ir=(not self.use_mkldnn),
         )
 
@@ -59,6 +60,7 @@ def test_check_grad_normal(self):
             'Out',
             check_dygraph=(not self.use_mkldnn),
             check_prim=True,
+            check_prim_pir=(not self.use_mkldnn),
             check_new_ir=(not self.use_mkldnn),
         )
 
@@ -70,6 +72,7 @@ def test_check_grad_ingore_x(self):
             no_grad_set=set("X"),
             check_dygraph=(not self.use_mkldnn),
             check_prim=True,
+            check_prim_pir=(not self.use_mkldnn),
             check_new_ir=(not self.use_mkldnn),
         )
 
@@ -81,6 +84,7 @@ def test_check_grad_ingore_y(self):
             no_grad_set=set('Y'),
             check_dygraph=(not self.use_mkldnn),
             check_prim=True,
+            check_prim_pir=(not self.use_mkldnn),
             check_new_ir=(not self.use_mkldnn),
         )
 
@@ -102,6 +106,7 @@ def if_enable_cinn(self):
 class TestComplexElementwiseMulOpWithCheckGrad(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
+        self.prim_op_type = "prim"
         self.python_api = paddle.multiply
         self.public_python_api = paddle.multiply
         self.dtype = np.complex128
@@ -188,7 +193,13 @@ def test_check_output(self):
         self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['X', 'Y'],
+            'Out',
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
+        )
 
     def test_check_grad_ingore_x(self):
         self.check_grad(
@@ -196,6 +207,7 @@ def test_check_grad_ingore_x(self):
             'Out',
             no_grad_set=set("X"),
             check_prim=True,
+            check_prim_pir=True,
             check_new_ir=True,
         )
 
@@ -205,6 +217,7 @@ def test_check_grad_ingore_y(self):
             'Out',
             no_grad_set=set('Y'),
             check_prim=True,
+            check_prim_pir=True,
             check_new_ir=True,
         )
 
@@ -420,6 +433,7 @@ def test_check_grad_normal(self):
             'Out',
             check_dygraph=(not self.use_mkldnn),
             check_prim=True,
+            check_prim_pir=(not self.use_mkldnn),
             check_new_ir=(not self.use_mkldnn),
         )
 
@@ -431,6 +445,7 @@ def test_check_grad_ingore_x(self):
             no_grad_set=set("X"),
             check_dygraph=(not self.use_mkldnn),
             check_prim=True,
+            check_prim_pir=(not self.use_mkldnn),
             check_new_ir=(not self.use_mkldnn),
         )
 
@@ -442,6 +457,7 @@ def test_check_grad_ingore_y(self):
             no_grad_set=set('Y'),
             check_dygraph=(not self.use_mkldnn),
             check_prim=True,
+            check_prim_pir=(not self.use_mkldnn),
             check_new_ir=(not self.use_mkldnn),
         )
 
@@ -496,6 +512,7 @@ def setUp(self):
 class TestComplexElementwiseMulOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_mul"
+        self.prim_op_type = "prim"
         self.python_api = paddle.multiply
         self.init_base_dtype()
         self.init_input_output()
diff --git a/test/legacy_test/test_elementwise_pow_op.py b/test/legacy_test/test_elementwise_pow_op.py
index e406845960abc..c718ce16292b9 100644
--- a/test/legacy_test/test_elementwise_pow_op.py
+++ b/test/legacy_test/test_elementwise_pow_op.py
@@ -44,7 +44,7 @@ def test_check_output(self):
         if hasattr(self, 'attrs'):
             self.check_output(check_dygraph=False)
         else:
-            self.check_output(check_new_ir=True)
+            self.check_output(check_prim_pir=True, check_new_ir=True)
 
     def test_check_grad_normal(self):
         if hasattr(self, 'attrs'):
@@ -53,7 +53,11 @@ def test_check_grad_normal(self):
             )
         else:
             self.check_grad(
-                ['X', 'Y'], 'Out', check_prim=True, check_new_ir=True
+                ['X', 'Y'],
+                'Out',
+                check_prim=True,
+                check_prim_pir=True,
+                check_new_ir=True,
             )
 
 
@@ -190,6 +194,8 @@ class TestElementwisePowOpInt(OpTest):
     def setUp(self):
         self.op_type = "elementwise_pow"
         self.python_api = paddle.pow
+        self.public_python_api = paddle.pow
+        self.prim_op_type = "prim"
 
         self.inputs = {'X': np.asarray([1, 3, 6]), 'Y': np.asarray([1, 1, 1])}
         self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
@@ -198,7 +204,7 @@ def test_check_output(self):
         if hasattr(self, 'attrs'):
             self.check_output(check_dygraph=False)
         else:
-            self.check_output(check_new_ir=True)
+            self.check_output(check_prim_pir=True, check_new_ir=True)
 
 
 class TestElementwisePowGradOpInt(unittest.TestCase):
@@ -254,7 +260,7 @@ def test_check_output(self):
         if hasattr(self, 'attrs'):
             self.check_output(check_dygraph=False)
         else:
-            self.check_output(check_new_ir=True)
+            self.check_output(check_prim_pir=True, check_new_ir=True)
 
     def test_check_grad(self):
         self.check_grad(
@@ -264,6 +270,7 @@ def test_check_grad(self):
                 self.inputs['X'], self.inputs['Y'], 1 / self.inputs['X'].size
             ),
             check_prim=True,
+            check_prim_pir=True,
             check_new_ir=True,
         )
 
@@ -290,7 +297,7 @@ def setUp(self):
         self.outputs = {'Out': convert_float_to_uint16(out)}
 
     def test_check_output(self):
-        self.check_output(check_new_ir=True)
+        self.check_output(check_prim_pir=True, check_new_ir=True)
 
     def test_check_grad(self):
         self.check_grad(['X', 'Y'], 'Out')
@@ -301,7 +308,7 @@ def test_check_grad(self):
                 'Out',
                 check_prim=True,
                 only_check_prim=True,
-                check_new_ir=True,
+                check_prim_pir=True,
             )
 
 
diff --git a/test/legacy_test/test_layer_norm_op.py b/test/legacy_test/test_layer_norm_op.py
index b023ff6488e48..3fb01bb3d0b62 100644
--- a/test/legacy_test/test_layer_norm_op.py
+++ b/test/legacy_test/test_layer_norm_op.py
@@ -141,8 +141,9 @@ def test_check_output(self):
             no_check_set=["Mean", "Variance"],
             atol=self.ori_atol,
             rtol=self.ori_rtol,
-            check_prim=True,
-            check_new_ir=True,
+            check_prim=self.check_prim,
+            check_prim_pir=self.check_prim_pir,
+            check_new_ir=self.check_new_ir,
         )
 
     def test_check_grad(self):
@@ -150,8 +151,9 @@ def test_check_grad(self):
             self.check_grad_input_list,
             ['Y'],
             max_relative_error=self.max_relative_error,
-            check_prim=True,
-            check_new_ir=True,
+            check_prim=self.check_prim,
+            check_prim_pir=self.check_prim_pir,
+            check_new_ir=self.check_new_ir,
         )
 
     def initConfig(self):
@@ -173,6 +175,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = True
         self.has_bias = True
+        self.check_prim = True
+        self.check_prim_pir = True
+        self.check_new_ir = True
 
     def initTestCase(self):
         np.random.seed(123)
@@ -240,8 +245,9 @@ def test_check_output(self):
             no_check_set=["Mean", "Variance"],
             atol=self.ori_atol,
             rtol=self.ori_rtol,
-            check_prim=True,
-            check_new_ir=True,
+            check_prim=self.check_prim,
+            check_prim_pir=self.check_prim_pir,
+            check_new_ir=self.check_new_ir,
         )
 
     def test_check_grad(self):
@@ -250,8 +256,9 @@ def test_check_grad(self):
             self.check_grad_input_list,
             ['Y'],
             max_relative_error=self.max_relative_error,
-            check_prim=True,
-            check_new_ir=True,
+            check_prim=self.check_prim,
+            check_prim_pir=self.check_prim_pir,
+            check_new_ir=self.check_new_ir,
         )
 
     def initConfig(self):
@@ -266,6 +273,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = True
         self.has_bias = True
+        self.check_prim = True
+        self.check_prim_pir = True
+        self.check_new_ir = True
 
     def initTestCase(self):
         np.random.seed(123)
@@ -335,6 +345,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = False
         self.has_bias = False
+        self.check_prim = False
+        self.check_prim_pir = False
+        self.check_new_ir = True
 
 
 @unittest.skipIf(
@@ -356,6 +369,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = False
         self.has_bias = False
+        self.check_prim = False
+        self.check_prim_pir = False
+        self.check_new_ir = True
 
 
 @unittest.skipIf(
@@ -382,6 +398,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = True
         self.has_bias = False
+        self.check_prim = False
+        self.check_prim_pir = False
+        self.check_new_ir = True
 
 
 @unittest.skipIf(
@@ -403,6 +422,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = True
         self.has_bias = False
+        self.check_prim = False
+        self.check_prim_pir = False
+        self.check_new_ir = True
 
 
 @unittest.skipIf(
@@ -429,6 +451,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = False
         self.has_bias = True
+        self.check_prim = False
+        self.check_prim_pir = False
+        self.check_new_ir = True
 
 
 @unittest.skipIf(
@@ -450,6 +475,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = False
         self.has_bias = True
+        self.check_prim = False
+        self.check_prim_pir = False
+        self.check_new_ir = True
 
 
 class TestLayerNormOpByOpTestFP32(TestLayerNormOpByOpTest):
@@ -467,6 +495,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = True
         self.has_bias = True
+        self.check_prim = True
+        self.check_prim_pir = True
+        self.check_new_ir = True
 
 
 class TestLayerNormOpByOpTestFP32_case2(TestLayerNormOpByOpTest):
@@ -484,6 +515,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = False
         self.has_bias = False
+        self.check_prim = False
+        self.check_prim_pir = False
+        self.check_new_ir = True
 
 
 class TestLayerNormOpByOpTestFP32_case3(TestLayerNormOpByOpTest):
@@ -501,6 +535,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = True
         self.has_bias = False
+        self.check_prim = False
+        self.check_prim_pir = False
+        self.check_new_ir = True
 
 
 class TestLayerNormOpByOpTestFP32_case4(TestLayerNormOpByOpTest):
@@ -518,6 +555,9 @@ def initConfig(self):
         self.begin_norm_axis = 1
         self.has_scale = False
         self.has_bias = True
+        self.check_prim = False
+        self.check_prim_pir = False
+        self.check_new_ir = True
 
 
 class TestLayerNormOp(unittest.TestCase):
diff --git a/test/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py
index c9ab6baf41ef6..0a9132ca55b49 100755
--- a/test/legacy_test/test_reshape_op.py
+++ b/test/legacy_test/test_reshape_op.py
@@ -15,7 +15,7 @@
 import unittest
 
 import numpy as np
-from op_test import OpTest, convert_float_to_uint16
+from op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci
 
 import paddle
 from paddle import base
@@ -43,11 +43,17 @@ def init_data(self):
         self.new_shape = (12, 10)
         self.infered_shape = (12, 10)
 
-    def test_check_output(self):
+    def _test_check_output(self):
         self.check_output(no_check_set=['XShape'], check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out", check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ["X"],
+            "Out",
+            check_prim=True,
+            check_new_ir=True,
+            check_prim_pir=True,
+        )
 
 
 class TestReshapeOp_ZeroDim1(TestReshapeOp):
@@ -120,7 +126,7 @@ def test_check_output(self):
         self.check_output(no_check_set=['XShape'])
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out", check_prim=True)
+        self.check_grad(["X"], "Out", check_prim=True, check_prim_pir=True)
 
 
 class TestReshapeFP16Op(OpTest):
@@ -148,7 +154,7 @@ def test_check_output(self):
         self.check_output(no_check_set=['XShape'])
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out", check_prim=True)
+        self.check_grad(["X"], "Out", check_prim=True, check_prim_pir=True)
 
 
 class TestReshapeOpDimInfer1(TestReshapeOp):
@@ -340,6 +346,9 @@ def init_dtype(self):
         self.dtype = np.uint8
 
 
+@skip_check_grad_ci(
+    "we don't need to check grad for the bool type of reshape op"
+)
 class TestReshapeOpBool(TestReshapeOp):
     def setUp(self):
         self.init_data()
diff --git a/test/legacy_test/test_split_op.py b/test/legacy_test/test_split_op.py
index 964e127aafb81..92dfe72f8443e 100644
--- a/test/legacy_test/test_split_op.py
+++ b/test/legacy_test/test_split_op.py
@@ -61,7 +61,11 @@ def test_check_output(self):
 
     def test_check_grad(self):
         self.check_grad(
-            ['X'], ['out0', 'out1', 'out2'], check_prim=True, check_new_ir=True
+            ['X'],
+            ['out0', 'out1', 'out2'],
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
         )
 
 
@@ -117,7 +121,11 @@ def test_check_output(self):
 
     def test_check_grad(self):
         self.check_grad(
-            ['X'], ['out0', 'out1', 'out2'], check_prim=True, check_new_ir=True
+            ['X'],
+            ['out0', 'out1', 'out2'],
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
         )
 
 
@@ -243,7 +251,11 @@ def test_check_output(self):
 
     def test_check_grad(self):
         self.check_grad(
-            ['X'], ['out0', 'out1', 'out2'], check_prim=True, check_new_ir=True
+            ['X'],
+            ['out0', 'out1', 'out2'],
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
         )
 
 
@@ -291,7 +303,12 @@ def test_check_output(self):
         def test_check_grad(self):
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
-                place, ['X'], 'out2', check_prim=True, check_new_ir=True
+                place,
+                ['X'],
+                'out2',
+                check_prim=True,
+                check_prim_pir=True,
+                check_new_ir=True,
             )
 
     cls_name = "{}_{}".format(parent.__name__, "BF16Op")
diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py
index 63a68442936ab..c154625fb51f4 100644
--- a/test/legacy_test/test_sum_op.py
+++ b/test/legacy_test/test_sum_op.py
@@ -58,11 +58,20 @@ def init_kernel_type(self):
         self.dtype = np.float64
 
     def test_check_output(self):
-        self.check_output(check_prim=True, check_cinn=True, check_new_ir=True)
+        self.check_output(
+            check_prim=True,
+            check_cinn=True,
+            check_new_ir=True,
+        )
 
     def test_check_grad(self):
         self.check_grad(
-            ['x0'], 'Out', check_prim=True, check_cinn=True, check_new_ir=True
+            ['x0'],
+            'Out',
+            check_prim=True,
+            check_cinn=True,
+            check_prim_pir=True,
+            check_new_ir=True,
         )
 
 
@@ -304,7 +313,13 @@ def test_check_output(self):
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad(['x0'], 'Out', check_cinn=True, check_new_ir=True)
+            self.check_grad(
+                ['x0'],
+                'Out',
+                check_cinn=True,
+                check_prim_pir=True,
+                check_new_ir=True,
+            )
 
 
 def create_test_sum_fp16_class(parent):
@@ -330,7 +345,9 @@ def test_w_is_selected_rows(self):
 class TestSumBF16Op(OpTest):
     def setUp(self):
         self.op_type = "sum"
+        self.prim_op_type = "prim"
         self.python_api = paddle.add_n
+        self.public_python_api = paddle.add_n
         self.init_kernel_type()
         x0 = np.random.random((3, 40)).astype(np.float32)
         x1 = np.random.random((3, 40)).astype(np.float32)
@@ -354,7 +371,13 @@ def test_check_output(self):
 
     def test_check_grad(self):
         # new dynamic graph mode does not support unit16 type
-        self.check_grad(['x0'], 'Out', check_dygraph=False, check_new_ir=True)
+        self.check_grad(
+            ['x0'],
+            'Out',
+            check_dygraph=False,
+            check_prim_pir=True,
+            check_new_ir=True,
+        )
 
 
 class API_Test_Add_n(unittest.TestCase):
diff --git a/test/legacy_test/test_transpose_op.py b/test/legacy_test/test_transpose_op.py
index c8d91f59f8c49..52f85ef1e0a70 100644
--- a/test/legacy_test/test_transpose_op.py
+++ b/test/legacy_test/test_transpose_op.py
@@ -52,7 +52,13 @@ def test_check_output(self):
         self.check_output(no_check_set=['XShape'], check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_new_ir=True,
+            check_prim_pir=True,
+        )
 
     def if_enable_cinn(self):
         pass
@@ -209,7 +215,13 @@ def test_check_output(self):
         base.core.disable_autotune()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
+        )
 
 
 class TestAutoTuneTransposeFP16Op(OpTest):
@@ -246,7 +258,13 @@ def test_check_output(self):
         base.core.disable_autotune()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
+        )
 
 
 class TestAutoTuneTransposeBF16Op(OpTest):
@@ -290,7 +308,13 @@ def test_check_output(self):
         base.core.disable_autotune()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
+        )
 
 
 class TestTransposeFP16Op(OpTest):
@@ -325,7 +349,13 @@ def test_check_output(self):
         self.check_output(no_check_set=['XShape'], check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
+        self.check_grad(
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
+        )
 
     def initTestCase(self):
         self.shape = (3, 40)

From 6e5c978878e401b9d383de91078f82520fa40cf1 Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Thu, 21 Sep 2023 18:35:15 +0800
Subject: [PATCH 038/115] =?UTF-8?q?=E3=80=90pir=E3=80=91Modify=20comment?=
 =?UTF-8?q?=20of=20pr57478=20and=20pr56873=20(#57520)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* tmp

* reply comment

* code style
---
 .../fluid/pir/dialect/op_generator/api_gen.py |  2 +-
 .../pir/dialect/op_generator/python_c_gen.py  |  2 +-
 .../pir/dialect/operator/ir/manual_api.cc     | 23 ++++++++++---------
 .../pir/dialect/operator/ir/manual_api.h      | 21 +++++++++--------
 .../pir/dialect/operator/ir/manual_op_vjp.cc  |  4 +++-
 .../primitive/backend/manual/manual_backend.h |  1 -
 6 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py
index d7e74f72b652f..851f318e9bc47 100644
--- a/paddle/fluid/pir/dialect/op_generator/api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py
@@ -150,7 +150,7 @@ def _gen_api_inputs(self, op_info):
         assert len(name_list) == len(type_list)
         ret = []
         for name, type in zip(name_list, type_list):
-            ret.append(f'{self._type_map[type]} {name}')
+            ret.append(f'const {self._type_map[type]}& {name}')
         return ', '.join(ret)
 
     def _gen_api_attrs(
diff --git a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py
index 440f656b99964..adb5270e975e6 100644
--- a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py
@@ -174,7 +174,7 @@
 """
 
 BUILTIN_STACK_OP_TEMPLATE = """
-            {name} = paddle::dialect::stack({name}_tmp, 0);
+            {name} = paddle::dialect::stack({name}_tmp, /*axis*/0);
 """
 TYPE_TO_FUNC_MAP = {
     "bool": "CastPyArg2Boolean",
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
index 24e7a94b66650..eb5acbf2388ea 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc
@@ -28,8 +28,8 @@ pir::OpResult builtin_combine(const std::vector<pir::Value>& x) {
   return combine_op.out();
 }
 
-std::vector<pir::OpResult> add_n_grad(std::vector<pir::Value> inputs,
-                                      pir::Value out_grad) {
+std::vector<pir::OpResult> add_n_grad(const std::vector<pir::Value>& inputs,
+                                      const pir::Value& out_grad) {
   std::vector<pir::OpResult> inputs_grad;
   for (size_t i = 0; i < inputs.size(); i++) {
     paddle::dialect::ScaleOp scale_op =
@@ -40,8 +40,8 @@ std::vector<pir::OpResult> add_n_grad(std::vector<pir::Value> inputs,
   return inputs_grad;
 }
 
-pir::OpResult zeros_like(pir::Value x,
-                         phi::DataType dtype,
+pir::OpResult zeros_like(const pir::Value& x,
+                         const phi::DataType dtype,
                          const Place& place) {
   return paddle::dialect::full_like(x, 0, dtype, place);
 }
@@ -54,7 +54,7 @@ pir::OpResult get_parameter(const std::string& name) {
   return get_parameter_op.result(0);
 }
 
-void set_parameter(pir::Value parameter, const std::string& name) {
+void set_parameter(const pir::Value& parameter, const std::string& name) {
   std::unique_ptr<pir::Parameter> param(
       new pir::Parameter(nullptr, 0, parameter.type()));
   APIBuilder::Instance().SetParameter(name, std::move(param));
@@ -62,9 +62,9 @@ void set_parameter(pir::Value parameter, const std::string& name) {
                                                                   name);
 }
 
-pir::OpResult embedding_grad(pir::Value x,
-                             pir::Value weight,
-                             pir::Value out_grad,
+pir::OpResult embedding_grad(const pir::Value& x,
+                             const pir::Value& weight,
+                             const pir::Value& out_grad,
                              int64_t padding_idx,
                              bool sparse) {
   if (weight.type().isa<paddle::dialect::DenseTensorType>()) {
@@ -81,7 +81,8 @@ pir::OpResult embedding_grad(pir::Value x,
   }
 }
 
-pir::OpResult split_with_num_grad(std::vector<pir::Value> out_grad, int axis) {
+pir::OpResult split_with_num_grad(const std::vector<pir::Value>& out_grad,
+                                  int axis) {
   auto out_grad_combine_op =
       APIBuilder::Instance().GetBuilder()->Build<pir::CombineOp>(out_grad);
   paddle::dialect::SplitGradOp split_grad_op =
@@ -90,8 +91,8 @@ pir::OpResult split_with_num_grad(std::vector<pir::Value> out_grad, int axis) {
   return split_grad_op.result(0);
 }
 
-pir::OpResult split_with_num_grad(std::vector<pir::Value> out_grad,
-                                  pir::Value axis) {
+pir::OpResult split_with_num_grad(const std::vector<pir::Value>& out_grad,
+                                  const pir::Value& axis) {
   auto out_grad_combine_op =
       APIBuilder::Instance().GetBuilder()->Build<pir::CombineOp>(out_grad);
   paddle::dialect::SplitGradOp split_grad_op =
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h
index c919448f1ddb0..fe579295ad5a0 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_api.h
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h
@@ -25,26 +25,27 @@ namespace dialect {
 
 pir::OpResult builtin_combine(const std::vector<pir::Value>& x);
 
-std::vector<pir::OpResult> add_n_grad(std::vector<pir::Value> inputs,
-                                      pir::Value out_grad);
+std::vector<pir::OpResult> add_n_grad(const std::vector<pir::Value>& inputs,
+                                      const pir::Value& out_grad);
 
-pir::OpResult zeros_like(pir::Value x,
+pir::OpResult zeros_like(const pir::Value& x,
                          phi::DataType dtype = phi::DataType::UNDEFINED,
                          const Place& place = {});
 
 pir::OpResult get_parameter(const std::string& name);
 
-void set_parameter(pir::Value parameter, const std::string& name);
+void set_parameter(const pir::Value& parameter, const std::string& name);
 
-pir::OpResult embedding_grad(pir::Value x,
-                             pir::Value weight,
-                             pir::Value out_grad,
+pir::OpResult embedding_grad(const pir::Value& x,
+                             const pir::Value& weight,
+                             const pir::Value& out_grad,
                              int64_t padding_idx = -1,
                              bool sparse = false);
 
-pir::OpResult split_with_num_grad(std::vector<pir::Value> out_grad, int axis);
+pir::OpResult split_with_num_grad(const std::vector<pir::Value>& out_grad,
+                                  int axis);
 
-pir::OpResult split_with_num_grad(std::vector<pir::Value> out_grad,
-                                  pir::Value axis);
+pir::OpResult split_with_num_grad(const std::vector<pir::Value>& out_grad,
+                                  const pir::Value& axis);
 }  // namespace dialect
 }  // namespace paddle
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc
index b6d131e5411fb..80c13ac89def1 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc
@@ -34,7 +34,9 @@ std::vector<std::vector<pir::OpResult>> AddNOp::Vjp(
   AddNOp op_obj = op->dyn_cast<AddNOp>();
 
   VLOG(6) << "Prepare inputs of add_n_grad";
-
+  PADDLE_ENFORCE(
+      op_obj.inputs() != nullptr,
+      paddle::platform::errors::Fatal("addn op's inputs can't be null"));
   pir::CombineOp combine_op_obj = op_obj.inputs()
                                       .dyn_cast<pir::OpResult>()
                                       .owner()
diff --git a/paddle/fluid/primitive/backend/manual/manual_backend.h b/paddle/fluid/primitive/backend/manual/manual_backend.h
index 16c1facbd5354..3c9340164ac01 100644
--- a/paddle/fluid/primitive/backend/manual/manual_backend.h
+++ b/paddle/fluid/primitive/backend/manual/manual_backend.h
@@ -18,7 +18,6 @@
 #include <vector>
 
 #include "paddle/phi/api/include/tensor.h"
-#include "paddle/utils/optional.h"
 
 namespace paddle {
 namespace primitive {

From 69ad1735436555288b1adb88f731cd67ef8240d9 Mon Sep 17 00:00:00 2001
From: Nyakku Shigure <sigure.qaq@gmail.com>
Date: Thu, 21 Sep 2023 18:39:12 +0800
Subject: [PATCH 039/115] [SOT][3.11] fix eval frame for python 3.11 (#57490)

* [SOT] fix eval frame for python 3.11

* fix missing `()`

* fix no Paddle_PyInterpreterFrameProxyType in < 3.11

* `Paddle_PyInterpreterFrameProxy` -> `PyInterpreterFrameProxy`

* compat for eval_custom_code

* clean callback result is None logic

* refine internal API name

* refine comments
---
 paddle/fluid/pybind/jit.cc | 364 ++++++++++++++++++++++++++++---------
 1 file changed, 275 insertions(+), 89 deletions(-)

diff --git a/paddle/fluid/pybind/jit.cc b/paddle/fluid/pybind/jit.cc
index 69b32fca9cd75..688fe7c670370 100644
--- a/paddle/fluid/pybind/jit.cc
+++ b/paddle/fluid/pybind/jit.cc
@@ -21,7 +21,14 @@ limitations under the License. */
 #include <code.h>
 #endif
 #if PY_VERSION_HEX >= 0x030b0000
+#include <internal/pycore_code.h>
 #include <internal/pycore_frame.h>
+#define Py_BUILD_CORE       // internal/pycore_opcode.h need this macro
+#define NEED_OPCODE_TABLES  // To get _PyOpcode_Caches and _PyOpcode_Deopt
+#include <internal/pycore_opcode.h>
+#undef NEED_OPCODE_TABLES
+#undef Py_BUILD_CORE
+#include <opcode.h>
 #endif
 
 #include <object.h>
@@ -49,64 +56,181 @@ namespace pybind {
 // that we don't need any modification in eval_frame functions.
 typedef _PyInterpreterFrame FrameObject;
 #define CALL_STAT_INC(name) ((void)0)
-PyFrameObject *Paddle_PyFrame_New_NoTrack(PyCodeObject *code) {
-  CALL_STAT_INC(frame_objects_created);
-  int slots = code->co_nlocalsplus + code->co_stacksize;
-  PyFrameObject *f = PyObject_GC_NewVar(PyFrameObject, &PyFrame_Type, slots);
-  if (f == NULL) {
-    return NULL;
+
+// clang-format off
+// Define a proxy PyObject to access _PyInterpreterFrame's properties.
+// It will be passed as an argument to the eval frame's callback.
+typedef struct PyInterpreterFrameProxy {
+  PyObject_HEAD
+  _PyInterpreterFrame *frame;
+} PyInterpreterFrameProxy;
+// clang-format on
+
+#define DECLARE_PROXY_PROPERTY(name)                        \
+  static PyObject *PyInterpreterFrameProxy_property_##name( \
+      PyInterpreterFrameProxy *self, void *closure) {       \
+    Py_XINCREF(self->frame->name);                          \
+    return reinterpret_cast<PyObject *>(self->frame->name); \
+  }
+
+// clang-format off
+#define REGISTER_PROXY_PROPERTY(name)                                         \
+  {                                                                           \
+    #name, (getter)PyInterpreterFrameProxy_property_##name, nullptr, nullptr, \
+        nullptr                                                               \
+  }
+// clang-format on
+
+DECLARE_PROXY_PROPERTY(f_code)
+DECLARE_PROXY_PROPERTY(f_locals)
+DECLARE_PROXY_PROPERTY(f_globals)
+DECLARE_PROXY_PROPERTY(f_builtins)
+
+static PyGetSetDef PyInterpreterFrameProxy_properties[] = {
+    REGISTER_PROXY_PROPERTY(f_code),
+    REGISTER_PROXY_PROPERTY(f_locals),
+    REGISTER_PROXY_PROPERTY(f_globals),
+    REGISTER_PROXY_PROPERTY(f_builtins),
+    {nullptr} /* Sentinel */
+};
+
+// clang-format off
+static PyTypeObject PyInterpreterFrameProxyType = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "paddle.framework.core.PyInterpreterFrameProxy",
+    .tp_doc = PyDoc_STR("A proxy object for _PyInterpreterFrame, "
+                        "it's only define all properties we need."),
+    .tp_basicsize = sizeof(PyInterpreterFrameProxy),
+    .tp_itemsize = 0,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_getset = PyInterpreterFrameProxy_properties,
+};
+// clang-format on
+
+PyInterpreterFrameProxy *PyInterpreterFrameProxy_New(
+    _PyInterpreterFrame *frame) {
+  PyTypeObject *type = &PyInterpreterFrameProxyType;
+  PyInterpreterFrameProxy *self =
+      reinterpret_cast<PyInterpreterFrameProxy *>(type->tp_alloc(type, 0));
+  if (!self) {
+    VLOG(7) << "Failed to allocate PyInterpreterFrameProxy";
+    return nullptr;
   }
-  f->f_back = NULL;
-  f->f_trace = NULL;
-  f->f_trace_lines = 1;
-  f->f_trace_opcodes = 0;
-  f->f_fast_as_locals = 0;
-  f->f_lineno = 0;
-  return f;
+  self->frame = frame;
+  return self;
 }
 
-static inline bool Paddle_PyFrame_IsIncomplete(_PyInterpreterFrame *frame) {
-  return frame->owner != FRAME_OWNED_BY_GENERATOR &&
-         frame->prev_instr <
-             _PyCode_CODE(frame->f_code) + frame->f_code->_co_firsttraceable;
+// We copy some cpython internal API from cpython project.
+// To avoid name conflict, we use "Internal_" prefix to mark them.
+static int Internal_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame,
+                                         int opcode,
+                                         int oparg) {
+  // This only works when opcode is a non-quickened form:
+  assert(_PyOpcode_Deopt[opcode] == opcode);
+  int check_oparg = 0;
+  for (_Py_CODEUNIT *instruction = _PyCode_CODE(frame->f_code);
+       instruction < frame->prev_instr;
+       instruction++) {
+    int check_opcode = _PyOpcode_Deopt[_Py_OPCODE(*instruction)];
+    check_oparg |= _Py_OPARG(*instruction);
+    if (check_opcode == opcode && check_oparg == oparg) {
+      return 1;
+    }
+    if (check_opcode == EXTENDED_ARG) {
+      check_oparg <<= 8;
+    } else {
+      check_oparg = 0;
+    }
+    instruction += _PyOpcode_Caches[check_opcode];
+  }
+  return 0;
 }
 
-PyFrameObject *Paddle_PyFrame_MakeAndSetFrameObject(
-    _PyInterpreterFrame *frame) {
-  assert(frame->frame_obj == NULL);
-  PyObject *error_type, *error_value, *error_traceback;
-  PyErr_Fetch(&error_type, &error_value, &error_traceback);
-
-  PyFrameObject *f = Paddle_PyFrame_New_NoTrack(frame->f_code);
-  if (f == NULL) {
-    Py_XDECREF(error_type);
-    Py_XDECREF(error_value);
-    Py_XDECREF(error_traceback);
-    return NULL;  // NOLINT
+int Internal_PyFrame_FastToLocalsWithError(_PyInterpreterFrame *frame) {
+  /* Merge fast locals into f->f_locals */
+  PyObject *locals;
+  PyObject **fast;
+  PyCodeObject *co;
+  locals = frame->f_locals;
+  if (locals == NULL) {
+    locals = frame->f_locals = PyDict_New();
+    if (locals == NULL) return -1;
   }
-  PyErr_Restore(error_type, error_value, error_traceback);
-  if (frame->frame_obj) {
-    f->f_frame = (_PyInterpreterFrame *)f->_f_frame_data;  // NOLINT
-    f->f_frame->owner = FRAME_CLEARED;
-    f->f_frame->frame_obj = f;
-    Py_DECREF(f);
-    return frame->frame_obj;
+  co = frame->f_code;
+  fast = _PyFrame_GetLocalsArray(frame);
+  // COPY_FREE_VARS has no quickened forms, so no need to use _PyOpcode_Deopt
+  // here:
+  int lasti = _PyInterpreterFrame_LASTI(frame);
+  if (lasti < 0 && _Py_OPCODE(_PyCode_CODE(co)[0]) == COPY_FREE_VARS) {
+    /* Free vars have not been initialized -- Do that */
+    PyCodeObject *co = frame->f_code;
+    PyObject *closure = frame->f_func->func_closure;
+    int offset = co->co_nlocals + co->co_nplaincellvars;
+    for (int i = 0; i < co->co_nfreevars; ++i) {
+      PyObject *o = PyTuple_GET_ITEM(closure, i);
+      Py_INCREF(o);
+      frame->localsplus[offset + i] = o;
+    }
+    // COPY_FREE_VARS doesn't have inline CACHEs, either:
+    frame->prev_instr = _PyCode_CODE(frame->f_code);
   }
-  assert(frame->owner != FRAME_OWNED_BY_FRAME_OBJECT);
-  assert(frame->owner != FRAME_CLEARED);
-  f->f_frame = frame;
-  frame->frame_obj = f;
-  return f;
-}
+  for (int i = 0; i < co->co_nlocalsplus; i++) {
+    _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i);
+
+    /* If the namespace is unoptimized, then one of the
+       following cases applies:
+       1. It does not contain free variables, because it
+          uses import * or is a top-level namespace.
+       2. It is a class namespace.
+       We don't want to accidentally copy free variables
+       into the locals dict used by the class.
+    */
+    if (kind & CO_FAST_FREE && !(co->co_flags & CO_OPTIMIZED)) {
+      continue;
+    }
 
-static inline PyFrameObject *Paddle_PyFrame_GetFrameObject(
-    _PyInterpreterFrame *frame) {
-  assert(!Paddle_PyFrame_IsIncomplete(frame));
-  PyFrameObject *res = frame->frame_obj;
-  if (res != NULL) {
-    return res;
+    PyObject *name = PyTuple_GET_ITEM(co->co_localsplusnames, i);
+    PyObject *value = fast[i];
+    if (frame->stacktop) {
+      if (kind & CO_FAST_FREE) {
+        // The cell was set by COPY_FREE_VARS.
+        assert(value != NULL && PyCell_Check(value));
+        value = PyCell_GET(value);
+      } else if (kind & CO_FAST_CELL) {
+        // Note that no *_DEREF ops can happen before MAKE_CELL
+        // executes.  So there's no need to duplicate the work
+        // that MAKE_CELL would otherwise do later, if it hasn't
+        // run yet.
+        if (value != NULL) {
+          if (PyCell_Check(value) &&
+              Internal_PyFrame_OpAlreadyRan(frame, MAKE_CELL, i)) {
+            // (likely) MAKE_CELL must have executed already.
+            value = PyCell_GET(value);
+          }
+          // (likely) Otherwise it it is an arg (kind & CO_FAST_LOCAL),
+          // with the initial value set when the frame was created...
+          // (unlikely) ...or it was set to some initial value by
+          // an earlier call to PyFrame_LocalsToFast().
+        }
+      }
+    } else {
+      assert(value == NULL);
+    }
+    if (value == NULL) {
+      if (PyObject_DelItem(locals, name) != 0) {
+        if (PyErr_ExceptionMatches(PyExc_KeyError)) {
+          PyErr_Clear();
+        } else {
+          return -1;
+        }
+      }
+    } else {
+      if (PyObject_SetItem(locals, name, value) != 0) {
+        return -1;
+      }
+    }
   }
-  return Paddle_PyFrame_MakeAndSetFrameObject(frame);
+  return 0;
 }
 
 #else
@@ -145,37 +269,84 @@ inline static PyObject *eval_frame_default(PyThreadState *tstate,
 #endif
 }
 
-// Start a new frame and run code in this frame.
-// Execute a piece of code by default frame-hook.
-inline static PyObject *eval_custom_code(PyThreadState *tstate,
-                                         FrameObject *frame,
-                                         PyCodeObject *code,
-                                         int throw_flag) {
+#if PY_VERSION_HEX >= 0x030b0000
+
+inline static PyObject *eval_custom_code_py311_plus(PyThreadState *tstate,
+                                                    FrameObject *frame,
+                                                    PyCodeObject *code,
+                                                    int throw_flag) {
+  // Create a new PyInterpreterFrame. Refer to CALL.
+  // PyInterpreterFrame has a head section calls "specials". It follows
+  // a contiguous section containing localplus and interpreter stack space.
+  size_t size = code->co_nlocalsplus + code->co_stacksize + FRAME_SPECIALS_SIZE;
+  CALL_STAT_INC(frames_pushed);
+  _PyInterpreterFrame *shadow = reinterpret_cast<_PyInterpreterFrame *>(
+      malloc(sizeof(PyObject *) * size));
+  if (shadow == nullptr) {
+    VLOG(7) << "Failed to allocate memory for shadow frame.";
+    return nullptr;
+  }
+  // Create a new function object from code object. Refer to MAKE_FUNCTION.
+  PyFunctionObject *func = reinterpret_cast<PyFunctionObject *>(
+      PyFunction_New(reinterpret_cast<PyObject *>(code), frame->f_globals));
+  _PyFrame_InitializeSpecials(shadow, func, nullptr, code->co_nlocalsplus);
+
+  PyObject **fastlocals_old = frame->localsplus;
+  PyObject **fastlocals_new = shadow->localsplus;
+
+  for (size_t i = 0; i < code->co_nlocalsplus; ++i) {
+    fastlocals_new[i] = nullptr;
+  }
+
+  // The namemap to map the name to index in new frame localsplus.
+  PyObject *namemap = PyDict_New();
+  if (namemap == nullptr) {
+    VLOG(7) << "Failed to create namemap.";
+    free(shadow);
+    return nullptr;
+  }
+  for (size_t i = 0; i < code->co_nlocalsplus; ++i) {
+    PyObject *name = PyTuple_GET_ITEM(code->co_localsplusnames, i);
+    PyObject *index = PyLong_FromSize_t(i);
+    PyDict_SetItem(namemap, name, index);
+  }
+  for (size_t i = 0; i < frame->f_code->co_nlocalsplus; ++i) {
+    PyObject *name = PyTuple_GET_ITEM(frame->f_code->co_localsplusnames, i);
+    PyObject *index = PyDict_GetItem(namemap, name);
+    if (index == nullptr) {
+      continue;
+    }
+    Py_XINCREF(fastlocals_old[i]);
+    fastlocals_new[PyLong_AsSize_t(index)] = fastlocals_old[i];
+  }
+
+  PyObject *result = eval_frame_default(tstate, shadow, throw_flag);
+  free(shadow);
+  Py_DECREF(namemap);
+  return result;
+}
+
+#else
+
+inline static PyObject *eval_custom_code_py310_minus(PyThreadState *tstate,
+                                                     FrameObject *frame,
+                                                     PyCodeObject *code,
+                                                     int throw_flag) {
   Py_ssize_t ncells = 0;
   Py_ssize_t nfrees = 0;
   Py_ssize_t nlocals_new = code->co_nlocals;
   Py_ssize_t nlocals_old = frame->f_code->co_nlocals;
 
-#if PY_VERSION_HEX >= 0x030b0000
-  ncells = code->co_ncellvars;
-  nfrees = code->co_nfreevars;
-#else
   ncells = PyTuple_GET_SIZE(code->co_cellvars);
   nfrees = PyTuple_GET_SIZE(code->co_freevars);
-#endif
 
   PyFrameObject *shadow = PyFrame_New(tstate, code, frame->f_globals, nullptr);
   if (shadow == nullptr) {
     return nullptr;
   }
 
-#if PY_VERSION_HEX >= 0x030b0000
-  PyObject **fastlocals_old = frame->localsplus;
-  PyObject **fastlocals_new = shadow->f_frame->localsplus;
-#else
   PyObject **fastlocals_old = frame->f_localsplus;
   PyObject **fastlocals_new = shadow->f_localsplus;
-#endif
 
   for (Py_ssize_t i = 0; i < nlocals_old; i++) {
     Py_XINCREF(fastlocals_old[i]);
@@ -187,15 +358,26 @@ inline static PyObject *eval_custom_code(PyThreadState *tstate,
     fastlocals_new[nlocals_new + i] = fastlocals_old[nlocals_old + i];
   }
 
-#if PY_VERSION_HEX >= 0x030b0000
-  PyObject *result = eval_frame_default(tstate, shadow->f_frame, throw_flag);
-#else
   PyObject *result = eval_frame_default(tstate, shadow, throw_flag);
-#endif
   Py_DECREF(shadow);
   return result;
 }
 
+#endif
+
+// Start a new frame and run code in this frame.
+// Execute a piece of code by default frame-hook.
+inline static PyObject *eval_custom_code(PyThreadState *tstate,
+                                         FrameObject *frame,
+                                         PyCodeObject *code,
+                                         int throw_flag) {
+#if PY_VERSION_HEX >= 0x030b0000
+  return eval_custom_code_py311_plus(tstate, frame, code, throw_flag);
+#else
+  return eval_custom_code_py310_minus(tstate, frame, code, throw_flag);
+#endif
+}
+
 static PyObject *_custom_eval_frame(PyThreadState *tstate,
                                     FrameObject *frame,
                                     int throw_flag,
@@ -203,13 +385,16 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate,
 // https://peps.python.org/pep-0558/#fast-locals-proxy-implementation-details
 // https://devguide.python.org/internals/interpreter/#all-sorts-of-variables
 #if PY_VERSION_HEX >= 0x030b0000
-  // _PyFrame_GetFrameObject(frame) # this function should be the right answer,
-  // but nm libpython.so | grep _PyFrame_MakeAndSetFrameObject is a `t' symbol,
-  // which means it's local to library. we will get a link error if we use it.
   if (frame->owner == FRAME_OWNED_BY_GENERATOR) {
     return eval_frame_default(tstate, frame, throw_flag);
   }
-  if (PyFrame_FastToLocalsWithError(Paddle_PyFrame_GetFrameObject(frame)) < 0) {
+  // PyFrame_FastToLocalsWithError receives a PyFrameObject, but if we created a
+  // PyFrameObject from a PyInterpreterFrame, it will changes the original
+  // PyInterpreterFrame and causes a Segmentation Fault when Fallback to run
+  // original frame. So we pass a PyInterpreterFrame to
+  // _PyFrame_FastToLocalsWithError directly. But this is an internal API, so we
+  // copy many code from CPython project into our project.
+  if (Internal_PyFrame_FastToLocalsWithError(frame) < 0) {
 #else
   if (PyFrame_FastToLocalsWithError(frame) < 0) {
 #endif
@@ -236,39 +421,38 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate,
   eval_frame_callback_set(Py_None);
 
 #if PY_VERSION_HEX >= 0x030b0000
-  PyObject *args = Py_BuildValue("(O)", Paddle_PyFrame_GetFrameObject(frame));
+  PyObject *args = Py_BuildValue("(O)", PyInterpreterFrameProxy_New(frame));
 #else
   PyObject *args = Py_BuildValue("(O)", frame);
 #endif
   PyObject *result = PyObject_CallObject(callback, args);
   Py_DECREF(args);
   VLOG(7) << "After call eval_frame_function and decrease frame.";
-  // result: GuardedCode
+  // class CustomCode(Protocal):
+  //     code: CodeType | None
+  //     disable_eval_frame: bool
+  // result: CustomCode
   if (result == nullptr) {
     // internal exception
     VLOG(7) << "Error happened.";
     return nullptr;
-  } else if (result != Py_None) {
+  } else {
     //  NOTE: Cache is not supported now
     PyCodeObject *code = reinterpret_cast<PyCodeObject *>(
         PyObject_GetAttrString(result, "code"));
     PyObject *disable_eval_frame =
         PyObject_GetAttrString(result, "disable_eval_frame");
+    PyObject *out;
+    VLOG(7) << "Start eval new frame and code.";
     if (disable_eval_frame != Py_True) {
       // Re-enable custom behavior
       eval_frame_callback_set(callback);
-      VLOG(7) << "Start eval new frame and code.";
-      PyObject *out;
       if (reinterpret_cast<PyObject *>(code) != Py_None) {
         out = eval_custom_code(tstate, frame, code, throw_flag);
       } else {
         out = eval_frame_default(tstate, frame, throw_flag);
       }
-      Py_DECREF(result);
-      Py_DECREF(code);
-      return out;
     } else {
-      PyObject *out;
       if (reinterpret_cast<PyObject *>(code) != Py_None) {
         out = eval_custom_code(tstate, frame, code, throw_flag);
       } else {
@@ -276,14 +460,10 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate,
       }
       // Re-enable custom behavior
       eval_frame_callback_set(callback);
-      Py_DECREF(result);
-      Py_DECREF(code);
-      return out;
     }
-  } else {
-    // Re-enable custom behavior
-    eval_frame_callback_set(callback);
-    return eval_frame_default(tstate, frame, throw_flag);
+    Py_DECREF(result);
+    Py_DECREF(code);
+    return out;
   }
 }
 
@@ -414,6 +594,12 @@ void BindEvalFrame(pybind11::module *m) {
         return obj;
       },
       py::arg("callback"));
+#if PY_VERSION_HEX >= 0x030b0000
+  if (PyType_Ready(&PyInterpreterFrameProxyType) < 0) {
+    VLOG(7) << "PyInterpreterFrameProxyType has not been ready!";
+  }
+  Py_INCREF(&PyInterpreterFrameProxyType);
+#endif
 }
 
 }  // namespace pybind

From 177c1397ec774a286fa6a203dd0fa249b685d963 Mon Sep 17 00:00:00 2001
From: kangguangli <kangguangli@hotmail.com>
Date: Thu, 21 Sep 2023 20:20:07 +0800
Subject: [PATCH 040/115] [PIR] register fused_attention in pir (#57557)

* register fused_attention in pir

* fix

* fix
---
 .../pir/dialect/op_generator/ops_api_gen.py   |   1 +
 paddle/fluid/pir/dialect/operator/ir/ops.yaml |   9 +
 paddle/phi/api/yaml/op_compat.yaml            |  35 +++
 paddle/phi/infermeta/multiary.cc              | 247 ++++++++++++++++++
 paddle/phi/infermeta/multiary.h               |  49 ++++
 test/white_list/new_ir_op_test_white_list     |   1 +
 6 files changed, 342 insertions(+)

diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
index 9f04a9b2fd4b2..e11b2ad1c1bf1 100644
--- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -95,6 +95,7 @@
     'c_allreduce_max',
     'c_allgather',
     'seed',
+    "fused_attention",
 ]
 
 
diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
index 8babc4635b8fb..d3cbc31c2e490 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
@@ -170,3 +170,12 @@
   args : (Tensor i, Tensor x)
   output : Tensor[](out)
   backward: write_to_array_grad
+
+- op: fused_attention
+  args: (Tensor x, Tensor ln_scale, Tensor ln_bias, Tensor qkv_weight, Tensor qkv_bias, Tensor cache_kv, Tensor src_mask, Tensor out_linear_weight, Tensor out_linear_bias, Tensor ln_scale_2, Tensor ln_bias_2, int num_heads, bool transpose_qkv_wb, bool pre_layer_norm, float epsilon, float attn_dropout_rate, bool is_test, bool attn_dropout_fix_seed, int attn_dropout_seed, str attn_dropout_implementation, float dropout_rate, bool dropout_fix_seed, int dropout_seed, str dropout_implementation, float ln_epsilon, bool add_residual, int ring_id)
+  output: Tensor(ln_mean), Tensor(ln_var), Tensor(ln_out), Tensor(qkv_out), Tensor(qkv_bias_out), Tensor(transpose_out_2), Tensor(qk_out), Tensor(qktv_out), Tensor(softmax_out), Tensor(attn_dropout_mask_out), Tensor(attn_dropout_out), Tensor(src_mask_out), Tensor(fmha_out), Tensor(out_linear_out), Tensor(dropout_mask_out), Tensor(ln_mean_2), Tensor(ln_var_2), Tensor(bias_dropout_residual_out), Tensor(cache_kv_out), Tensor(out)
+  kernel:
+    func: fused_attention
+  infer_meta:
+    func: FusedAttentionInferMeta
+  optional: cache_kv, ln_scale, ln_bias, qkv_bias, src_mask, out_linear_bias, ln_scale_2, ln_bias_2
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 8a85147a66da0..63093631e4347 100755
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -1181,6 +1181,41 @@
       data_type : float
       support_tensor : true
 
+- op : fused_attention
+  inputs:
+    x: X
+    ln_scale: LnScale
+    ln_bias: LnBias
+    qkv_weight: QKVW
+    qkv_bias: QKVBias
+    cache_kv: CacheKV
+    src_mask: SrcMask
+    out_linear_weight: OutLinearW
+    out_linear_bias: OutLinearBias
+    ln_scale_2: Ln2Scale
+    ln_bias_2: Ln2Bias
+  outputs:
+    ln_mean: LnMean
+    ln_var: LnVariance
+    ln_out: LnOut
+    qkv_out: QKVOut
+    qkv_bias_out: QKVBiasOut
+    transpose_out_2: TransposeOut2
+    qk_out: QKOut
+    qktv_out: QKTVOut
+    softmax_out: SoftmaxOut
+    attn_dropout_mask_out: AttnDropoutMaskOut
+    attn_dropout_out: AttnDropoutOut
+    src_mask_out: SrcMaskOut
+    fmha_out: FMHAOut
+    out_linear_out: OutLinearOut
+    dropout_mask_out: DropoutMaskOut
+    ln_mean_2: Ln2Mean
+    ln_var_2: Ln2Variance
+    bias_dropout_residual_out: BiasDropoutResidualOut
+    cache_kv_out: CacheKVOut
+    out: Y
+
 - op : fused_batch_norm_act
   backward : fused_batch_norm_act_grad
   inputs:
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 8de465867273c..6b09dd22db263 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -1604,6 +1604,253 @@ void FusedBiasActInferMeta(const MetaTensor& x,
   out->set_layout(x.layout());
 }
 
+void FusedAttentionInferMeta(const MetaTensor& x,
+                             const MetaTensor& ln_scale,
+                             const MetaTensor& ln_bias,
+                             const MetaTensor& qkv_weight,
+                             const MetaTensor& qkv_bias,
+                             const MetaTensor& cache_kv,
+                             const MetaTensor& src_mask,
+                             const MetaTensor& out_linear_weight,
+                             const MetaTensor& out_linear_bias,
+                             const MetaTensor& ln_scale_2,
+                             const MetaTensor& ln_bias_2,
+                             int num_heads,
+                             bool transpose_qkv_wb,
+                             bool pre_layer_norm,
+                             float epsilon,
+                             float attn_dropout_rate,
+                             bool is_test,
+                             bool attn_dropout_fix_seed,
+                             int attn_dropout_seed,
+                             const std::string& attn_dropout_implementation,
+                             float dropout_rate,
+                             bool dropout_fix_seed,
+                             int dropout_seed,
+                             const std::string& dropout_implementation,
+                             float ln_epsilon,
+                             bool add_residual,
+                             int ring_id,
+                             MetaTensor* ln_mean,
+                             MetaTensor* ln_var,
+                             MetaTensor* ln_out,
+                             MetaTensor* qkv_out,
+                             MetaTensor* qkv_bias_out,
+                             MetaTensor* transpose_out_2,
+                             MetaTensor* qk_out,
+                             MetaTensor* qktv_out,
+                             MetaTensor* softmax_out,
+                             MetaTensor* attn_dropout_mask_out,
+                             MetaTensor* attn_dropout_out,
+                             MetaTensor* src_mask_out,
+                             MetaTensor* fmha_out,
+                             MetaTensor* out_linear_out,
+                             MetaTensor* dropout_mask_out,
+                             MetaTensor* ln_mean_2,
+                             MetaTensor* ln_var_2,
+                             MetaTensor* bias_dropout_residual_out,
+                             MetaTensor* cache_kv_out,
+                             MetaTensor* out,
+                             MetaConfig config) {
+  auto x_dim = x.dims();
+  auto y_dim = qkv_weight.dims();
+
+  int dim_head = 0;
+  int hidden_size = 0;
+  int nranks = 1;
+  if (transpose_qkv_wb) {
+    PADDLE_ENFORCE_EQ(y_dim.size(),
+                      2,
+                      phi::errors::InvalidArgument(
+                          "The dimensions of qkv_weight must be 2 if enable"
+                          "transpose_qkv_wb: (dim_embed, 3 * dim_embed),"
+                          "but received dimensions of"
+                          "Input is [%d]",
+                          y_dim.size()));
+    PADDLE_ENFORCE_GT(num_heads,
+                      0,
+                      phi::errors::InvalidArgument(
+                          "The num_heads must be provided and greater than 0 "
+                          "if enable transpose_qkv_wb, but we got %d.",
+                          num_heads));
+    PADDLE_ENFORCE_EQ(y_dim[0] % num_heads,
+                      0,
+                      phi::errors::InvalidArgument(
+                          "First dim of qkv_w must be divisible by num heads "
+                          "if enable transpose_qkv_wb, but receive first "
+                          "dim of qkv_w is %d and num_heads is %d.",
+                          y_dim[0],
+                          num_heads));
+    if (ring_id == -1) {
+      PADDLE_ENFORCE_EQ(
+          y_dim[0] * 3,
+          y_dim[1],
+          phi::errors::InvalidArgument("The dimensions of qkv_weight must be 2"
+                                       "(dim_embed, 3 * dim_embed)."));
+    } else {
+      // compute the mp nranks
+      nranks = (y_dim[0] * 3) / y_dim[1];
+    }
+    dim_head = y_dim[0] / (num_heads * nranks);
+    hidden_size = y_dim[0];
+  } else {
+    PADDLE_ENFORCE_EQ(y_dim.size(),
+                      4,
+                      phi::errors::InvalidArgument(
+                          "The dimensions of qkv_weight must be 4 if not"
+                          "enable transpose_qkv_wb: (3, num_head, dim_head, "
+                          "dim_embed), but received [%d]",
+                          y_dim.size()));
+    PADDLE_ENFORCE_EQ(
+        y_dim[0],
+        3,
+        phi::errors::InvalidArgument("First dim of qkv_w must be 3 if disable "
+                                     "transpose_qkv_wb, but we got %d.",
+                                     y_dim[0]));
+    if (ring_id == -1) {
+      PADDLE_ENFORCE_EQ(
+          y_dim[1] * y_dim[2],
+          y_dim[3],
+          phi::errors::InvalidArgument("The dimensions of qkv_weight must be 4"
+                                       "(3, num_head, dim_head, dim_embed),"
+                                       "and must satisfy the limitations: "
+                                       "(num_head * dim_head == dim_embed)"));
+    }
+    num_heads = y_dim[1];
+    dim_head = y_dim[2];
+    hidden_size = y_dim[3];
+  }
+
+  PADDLE_ENFORCE_EQ(
+      x_dim.size(),
+      3,
+      phi::errors::InvalidArgument("The dimensions of x must be 3"
+                                   "(batch_size, seq_len, dim_embed),"
+                                   "but received dimensions of"
+                                   "Input is [%d]",
+                                   x_dim.size()));
+
+  PADDLE_ENFORCE_EQ(x_dim[2],
+                    hidden_size,
+                    phi::errors::InvalidArgument(
+                        "ShapeError: the dimension of x_dim[2] and y_dim[3] "
+                        "(y_dim[1] if enable transpose_qkv_w) "
+                        "must be equal. But received: the shape "
+                        "of input x = [%s], and the shape of "
+                        "input qkv_weight = [%s]",
+                        x_dim,
+                        y_dim));
+
+  if (pre_layer_norm) {
+    ln_mean->set_dims({x_dim[0] * x_dim[1]});
+    ln_var->set_dims({x_dim[0] * x_dim[1]});
+    ln_out->set_dims(x.dims());
+  } else {
+    ln_mean_2->set_dims({x_dim[0] * x_dim[1]});
+    ln_var_2->set_dims({x_dim[0] * x_dim[1]});
+    bias_dropout_residual_out->set_dims(x.dims());
+  }
+
+  if (transpose_qkv_wb) {
+    // [batch_size, seq_len, 3 * num_heads * dim_head]
+    qkv_out->set_dims({x_dim[0], x_dim[1], 3 * num_heads * dim_head});
+
+    if (qkv_bias) {
+      qkv_bias_out->set_dims({x_dim[0], x_dim[1], 3 * num_heads * dim_head});
+    }
+  } else {
+    // [batch_size, seq_len, 3, num_head, head_size]
+    qkv_out->set_dims({x_dim[0], x_dim[1], 3, num_heads, dim_head});
+
+    if (qkv_bias) {
+      qkv_bias_out->set_dims({x_dim[0], x_dim[1], 3, num_heads, dim_head});
+    }
+  }
+
+  // [3, batch_size, num_head, seq_len, head_size]
+  transpose_out_2->set_dims({3, x_dim[0], num_heads, x_dim[1], dim_head});
+
+  // cache_seq_len + seq_len if cache else seq_len
+  auto out_seq_len = x_dim[1];
+  if (cache_kv) {
+    // [2, batch_size, num_head, cache_seq_len, head_size]
+    auto c_dim = cache_kv.dims();
+
+    PADDLE_ENFORCE_EQ(
+        c_dim.size(),
+        5,
+        phi::errors::InvalidArgument("The CacheKV must be 5 dims, but got %d",
+                                     c_dim.size()));
+    PADDLE_ENFORCE_EQ(c_dim[0],
+                      2,
+                      phi::errors::InvalidArgument(
+                          "The first dim of CacheKV must be 2, but got %d",
+                          c_dim[0]));  // 2
+    PADDLE_ENFORCE_EQ(c_dim[1],
+                      x_dim[0],
+                      phi::errors::InvalidArgument(
+                          "The second dim of CacheKV must be equal with "
+                          "batch size %d, but got %d",
+                          x_dim[0],
+                          c_dim[1]));  // batch_size
+    PADDLE_ENFORCE_EQ(c_dim[2],
+                      num_heads,
+                      phi::errors::InvalidArgument(
+                          "The third dim of CacheKV must be equal with num "
+                          "head %d, but got %d",
+                          num_heads,
+                          c_dim[2]));  // num_head
+    // In compile stage, input seq_len can be -1, in that case
+    // c_dim[3] may < 0 in while
+    if (config.is_runtime) {
+      PADDLE_ENFORCE_GE(
+          c_dim[3],
+          0,
+          phi::errors::InvalidArgument(
+              "The forth dim of CacheKV must be greater than 0, but got %d",
+              c_dim[3]));  // cache_seq_len
+    }
+
+    PADDLE_ENFORCE_EQ(c_dim[4],
+                      dim_head,
+                      phi::errors::InvalidArgument(
+                          "The fifth dim of CacheKV must be equal with head "
+                          "size %d, but got %d",
+                          dim_head,
+                          c_dim[4]));  // head_size
+
+    out_seq_len += c_dim[3];
+    // [3, batch_size, num_head, cache_seq_len + seq_len, head_size]
+    cache_kv_out->set_dims(
+        {c_dim[0], c_dim[1], c_dim[2], out_seq_len, c_dim[4]});
+  }
+  // [batch, num_head, seq_len, out_seq_len]
+  qk_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len});
+
+  if (src_mask) {
+    src_mask_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len});
+  }
+  // the same as QKOut's shape.
+  attn_dropout_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len});
+  if (is_test) {
+    attn_dropout_mask_out->set_dims(
+        {x_dim[0], num_heads, x_dim[1], out_seq_len});
+  }
+  softmax_out->set_dims({x_dim[0], num_heads, x_dim[1], out_seq_len});
+  // [batch_size, num_heads, seq_len, head_dim]
+  qktv_out->set_dims({x_dim[0], num_heads, x_dim[1], dim_head});
+  // [batch_size, seq_len, number of heads*head size]
+  fmha_out->set_dims({x_dim[0], x_dim[1], num_heads, dim_head});
+
+  out_linear_out->set_dims(x.dims());
+
+  if (is_test == false) {
+    dropout_mask_out->set_dims(x.dims());
+  }
+
+  out->set_dims(x.dims());
+}
+
 void FusedLayerNormInferMeta(const MetaTensor& x,
                              const MetaTensor& bias,
                              const MetaTensor& residual,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index ee62d6d51d655..aaa4787968538 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -334,6 +334,55 @@ void FusedBiasActInferMeta(const MetaTensor& x,
                            MetaTensor* out,
                            MetaConfig config = MetaConfig());
 
+void FusedAttentionInferMeta(const MetaTensor& x,
+                             const MetaTensor& ln_scale,
+                             const MetaTensor& ln_bias,
+                             const MetaTensor& qkv_weight,
+                             const MetaTensor& qkv_bias,
+                             const MetaTensor& cache_kv,
+                             const MetaTensor& src_mask,
+                             const MetaTensor& out_linear_weight,
+                             const MetaTensor& out_linear_bias,
+                             const MetaTensor& ln_scale_2,
+                             const MetaTensor& ln_bias_2,
+                             int num_heads,
+                             bool transpose_qkv_wb,
+                             bool pre_layer_norm,
+                             float epsilon,
+                             float attn_dropout_rate,
+                             bool is_test,
+                             bool attn_dropout_fix_seed,
+                             int attn_dropout_seed,
+                             const std::string& attn_dropout_implementation,
+                             float dropout_rate,
+                             bool dropout_fix_seed,
+                             int dropout_seed,
+                             const std::string& dropout_implementation,
+                             float ln_epsilon,
+                             bool add_residual,
+                             int ring_id,
+                             MetaTensor* ln_mean,
+                             MetaTensor* ln_var,
+                             MetaTensor* ln_out,
+                             MetaTensor* qkv_out,
+                             MetaTensor* qkv_bias_out,
+                             MetaTensor* transpose_out_2,
+                             MetaTensor* qk_out,
+                             MetaTensor* qktv_out,
+                             MetaTensor* softmax_out,
+                             MetaTensor* attn_dropout_mask_out,
+                             MetaTensor* attn_dropout_out,
+                             MetaTensor* src_mask_out,
+                             MetaTensor* fmha_out,
+                             MetaTensor* out_linear_out,
+                             MetaTensor* dropout_mask_out,
+                             MetaTensor* ln_mean_2,
+                             MetaTensor* ln_var_2,
+                             MetaTensor* bias_dropout_residual_out,
+                             MetaTensor* cache_kv_out,
+                             MetaTensor* out,
+                             MetaConfig config = MetaConfig());
+
 void FusedLayerNormInferMeta(const MetaTensor& x,
                              const MetaTensor& bias,
                              const MetaTensor& residual,
diff --git a/test/white_list/new_ir_op_test_white_list b/test/white_list/new_ir_op_test_white_list
index b85c88fa6bb18..3dc336a187718 100644
--- a/test/white_list/new_ir_op_test_white_list
+++ b/test/white_list/new_ir_op_test_white_list
@@ -88,6 +88,7 @@ test_fmax_op
 test_fmin_op
 test_fold_op
 test_frame_op
+test_fused_attention_op_api
 test_gather_tree_op
 test_gaussian_random_op
 test_generate_proposals_v2_op

From 1c7c6879d8bf4ad01db3ef0da61eccb52e43a180 Mon Sep 17 00:00:00 2001
From: winter-wang <78149749+winter-wang@users.noreply.github.com>
Date: Fri, 22 Sep 2023 08:30:17 +0800
Subject: [PATCH 041/115] [PIR] support verify in trait or interface. (#57550)

* [PIR] support verify in trait or interface.

* [PIR] fix typo error
---
 paddle/fluid/pybind/ir.cc            |  14 +-
 paddle/pir/core/op_base.h            |  15 +-
 paddle/pir/core/op_result.h          |   2 +-
 paddle/pir/core/operation_utils.h    |   6 +
 test/cpp/pir/core/CMakeLists.txt     |   9 +-
 test/cpp/pir/core/ir_op_test.cc      | 206 ++++-----------------------
 test/cpp/pir/tools/CMakeLists.txt    |   2 +-
 test/cpp/pir/tools/test_dialect.cc   |  19 ++-
 test/cpp/pir/tools/test_dialect.h    |   7 +-
 test/cpp/pir/tools/test_interface.cc |  15 ++
 test/cpp/pir/tools/test_interface.h  |  65 +++++++++
 test/cpp/pir/tools/test_op.cc        |  34 ++++-
 test/cpp/pir/tools/test_op.h         |  36 ++++-
 test/cpp/pir/tools/test_trait.cc     |  30 ++++
 test/cpp/pir/tools/test_trait.h      |  37 +++++
 15 files changed, 291 insertions(+), 206 deletions(-)
 create mode 100644 test/cpp/pir/tools/test_interface.cc
 create mode 100644 test/cpp/pir/tools/test_interface.h
 create mode 100644 test/cpp/pir/tools/test_trait.cc
 create mode 100644 test/cpp/pir/tools/test_trait.h

diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index 80ecad93997db..e5e237bf7fe34 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -638,7 +638,7 @@ Operation *BuildOpFrom(
                  std::back_inserter(to_create_argument.inputs),
                  [&value_map](const pir::OpOperand &operand) {
                    // Operand -> OpResult
-                   return OpResult::dyn_cast_from(value_map[operand.source()]);
+                   return value_map[operand.source()];
                  });
   auto *cloned_op = Operation::Create(std::move(to_create_argument));
 
@@ -834,11 +834,8 @@ SplitedResult ForwardBackwardSplit(
          pir::StrAttribute::get(
              ctx, std::string("output_") + std::to_string(counter))},
     };
-    pir::Operation *operation =
-        pir::Operation::Create({OpResult::dyn_cast_from(forward_value_map[v])},
-                               attribute_map,
-                               {},
-                               op_info);
+    pir::Operation *operation = pir::Operation::Create(
+        {forward_value_map[v]}, attribute_map, {}, op_info);
     forward_program->block()->push_back(operation);
     counter += 1;
   };
@@ -857,10 +854,7 @@ SplitedResult ForwardBackwardSplit(
              ctx, std::string("output_") + std::to_string(counter))},
     };
     pir::Operation *operation = pir::Operation::Create(
-        {OpResult::dyn_cast_from(backward_value_map.at(v))},
-        attribute_map,
-        {},
-        op_info);
+        {backward_value_map.at(v)}, attribute_map, {}, op_info);
     backward_program->block()->push_back(operation);
     counter += 1;
   };
diff --git a/paddle/pir/core/op_base.h b/paddle/pir/core/op_base.h
index 538b48bed6a9c..314dbe3f3706e 100644
--- a/paddle/pir/core/op_base.h
+++ b/paddle/pir/core/op_base.h
@@ -103,6 +103,18 @@ class OpInterfaceBase : public OpBase {
   }
 };
 
+template <typename, typename = void>
+struct VerifyTraitOrInterface {
+  static void call(Operation *) {}
+};
+
+template <typename T>
+struct VerifyTraitOrInterface<T,
+                              decltype(T::Verify(
+                                  std::declval<Operation *>()))> {
+  static void call(Operation *op) { T::Verify(op); }
+};
+
 template <typename ConcreteOp, class... TraitOrInterface>
 class Op : public OpBase {
  public:
@@ -139,12 +151,13 @@ class Op : public OpBase {
     class EmptyOp : public Op<EmptyOp, TraitOrInterface...> {};
     return sizeof(ConcreteOp) == sizeof(EmptyOp);
   }
-
   // Implementation of `VerifyInvariantsFn` OperationName hook.
   static void VerifyInvariants(Operation *op) {
     static_assert(HasNoDataMembers(),
                   "Op class shouldn't define new data members");
     op->dyn_cast<ConcreteOp>().Verify();
+    (void)std::initializer_list<int>{
+        0, (VerifyTraitOrInterface<TraitOrInterface>::call(op), 0)...};
   }
 };
 
diff --git a/paddle/pir/core/op_result.h b/paddle/pir/core/op_result.h
index 781ed93148103..8860473fe3339 100644
--- a/paddle/pir/core/op_result.h
+++ b/paddle/pir/core/op_result.h
@@ -32,7 +32,6 @@ class IR_API OpResult : public Value {
   Operation *owner() const;
   uint32_t index() const;
   bool operator==(const OpResult &other) const;
-  static OpResult dyn_cast_from(Value value);
 
  private:
   friend Operation;
@@ -40,6 +39,7 @@ class IR_API OpResult : public Value {
   // Access classof annd dyn_cast_from.
   friend Value;
   static bool classof(Value value);
+  static OpResult dyn_cast_from(Value value);
 };
 
 }  // namespace pir
diff --git a/paddle/pir/core/operation_utils.h b/paddle/pir/core/operation_utils.h
index c868731ca4753..36dcca7bd0d53 100644
--- a/paddle/pir/core/operation_utils.h
+++ b/paddle/pir/core/operation_utils.h
@@ -84,6 +84,12 @@ struct OperationArgument {
   /// Add an array of named attributes.
   template <class InputIt>
   void AddAttributes(InputIt first, InputIt last);
+
+  template <class AttrContainer>
+  void AddAttributes(const AttrContainer& attr_container) {
+    AddAttributes(std::begin(attr_container), std::end(attr_container));
+  }
+
   /// Get the context held by this operation state.
   IrContext* getContext() const { return info.ir_context(); }
 
diff --git a/test/cpp/pir/core/CMakeLists.txt b/test/cpp/pir/core/CMakeLists.txt
index a131f84fe313c..355738d3baef5 100644
--- a/test/cpp/pir/core/CMakeLists.txt
+++ b/test/cpp/pir/core/CMakeLists.txt
@@ -8,7 +8,14 @@ cc_test_old(
   pd_op_dialect)
 cc_test_old(ir_attribute_test SRCS ir_attribute_test.cc DEPS pir gtest)
 cc_test_old(ir_value_test SRCS ir_value_test.cc DEPS pir gtest)
-cc_test_old(ir_op_test SRCS ir_op_test.cc DEPS pir gtest)
+cc_test_old(
+  ir_op_test
+  SRCS
+  ir_op_test.cc
+  DEPS
+  pir
+  gtest
+  test_dialect)
 cc_test_old(ir_region_test SRCS ir_region_test.cc DEPS pir gtest)
 cc_test_old(ir_builder_test SRCS ir_builder_test.cc DEPS pir gtest)
 cc_test_old(
diff --git a/test/cpp/pir/core/ir_op_test.cc b/test/cpp/pir/core/ir_op_test.cc
index bfc03e66944e9..0a5317c36cc4f 100644
--- a/test/cpp/pir/core/ir_op_test.cc
+++ b/test/cpp/pir/core/ir_op_test.cc
@@ -27,49 +27,8 @@
 #include "paddle/pir/core/op_base.h"
 #include "paddle/pir/core/program.h"
 #include "paddle/pir/core/region.h"
-
-/// \brief Define built-in Trait, derived from OpTraitBase.
-class ReadOnlyTrait : public pir::OpTraitBase<ReadOnlyTrait> {
- public:
-  explicit ReadOnlyTrait(pir::Operation *op)
-      : pir::OpTraitBase<ReadOnlyTrait>(op) {}
-};
-IR_DECLARE_EXPLICIT_TYPE_ID(ReadOnlyTrait)
-IR_DEFINE_EXPLICIT_TYPE_ID(ReadOnlyTrait)
-
-/// \brief Define built-in Interface, derived from OpInterfaceBase. Concepts and
-/// Models need to be defined within the class. Concept defines abstract
-/// interface functions, and Model is a template class that defines the specific
-/// implementation of interface functions based on template parameters.
-class InferShapeInterface : public pir::OpInterfaceBase<InferShapeInterface> {
- public:
-  struct Concept {
-    explicit Concept(void (*infer_shape)(pir::Operation *))
-        : infer_shape_(infer_shape) {}
-    void (*infer_shape_)(pir::Operation *);
-  };
-
-  template <class ConcreteOp>
-  struct Model : public Concept {
-    static void InferShape(pir::Operation *op) {
-      ConcreteOp concret_op = ConcreteOp(op);
-      if (concret_op == nullptr) throw("concret_op is nullptr");
-      concret_op.InferShape();
-    }
-
-    Model() : Concept(InferShape) {}
-  };
-
-  InferShapeInterface(pir::Operation *op, Concept *impl)
-      : pir::OpInterfaceBase<InferShapeInterface>(op), impl_(impl) {}
-
-  void InferShape() { impl_->infer_shape_(operation()); }
-
- private:
-  Concept *impl_;
-};
-IR_DECLARE_EXPLICIT_TYPE_ID(InferShapeInterface)
-IR_DEFINE_EXPLICIT_TYPE_ID(InferShapeInterface)
+#include "test/cpp/pir/tools/test_dialect.h"
+#include "test/cpp/pir/tools/test_op.h"
 
 pir::AttributeMap CreateAttributeMap(
     const std::vector<std::string> &attribute_names,
@@ -84,139 +43,15 @@ pir::AttributeMap CreateAttributeMap(
   return attr_map;
 }
 
-// Define op1.
-class Operation1 : public pir::Op<Operation1> {
- public:
-  using Op::Op;
-  static const char *name() { return "test.operation1"; }
-  static constexpr uint32_t attributes_num = 2;
-  static const char *attributes_name[attributes_num];  // NOLINT
-  void Verify() {
-    auto &attributes = this->attributes();
-    if (attributes.count("op1_attr1") == 0 ||
-        !attributes.at("op1_attr1").isa<pir::StrAttribute>()) {
-      throw("Type of attribute: parameter_name is not right.");
-    }
-    if (attributes.count("op1_attr2") == 0 ||
-        !attributes.at("op1_attr2").isa<pir::StrAttribute>()) {
-      throw("Type of attribute: parameter_name is not right.");
-    }
-  }
-  static void Build(const pir::Builder &builder,
-                    pir::OperationArgument &argument) {  // NOLINT
-    std::vector<pir::Type> output_types = {
-        pir::Float32Type::get(builder.ir_context())};
-    std::unordered_map<std::string, pir::Attribute> attributes =
-        CreateAttributeMap({"op1_attr1", "op1_attr2"},
-                           {"op1_attr1", "op1_attr2"});
-    argument.AddOutputs(output_types.begin(), output_types.end());
-    argument.AddAttributes(attributes.begin(), attributes.end());
-  }
-};
-const char *Operation1::attributes_name[attributes_num] = {  // NOLINT
-    "op1_attr1",
-    "op1_attr2"};
-
-IR_DECLARE_EXPLICIT_TYPE_ID(Operation1)
-IR_DEFINE_EXPLICIT_TYPE_ID(Operation1)
-
-// Define op2.
-class Operation2
-    : public pir::Op<Operation2, ReadOnlyTrait, InferShapeInterface> {
- public:
-  using Op::Op;
-  static const char *name() { return "test.operation2"; }
-  static constexpr uint32_t attributes_num = 2;
-  static const char *attributes_name[attributes_num];  // NOLINT
-  void Verify() {
-    auto &attributes = this->attributes();
-    if (attributes.count("op2_attr1") == 0 ||
-        (!attributes.at("op2_attr1").isa<pir::StrAttribute>())) {
-      throw("Type of attribute: parameter_name is not right.");
-    }
-    if (attributes.count("op2_attr2") == 0 ||
-        (!attributes.at("op2_attr2").isa<pir::StrAttribute>())) {
-      throw("Type of attribute: parameter_name is not right.");
-    }
-  }
-  static void InferShape() { VLOG(2) << "This is op2's InferShape interface."; }
-};
-const char *Operation2::attributes_name[attributes_num] = {  // NOLINT
-    "op2_attr1",
-    "op2_attr2"};
-IR_DECLARE_EXPLICIT_TYPE_ID(Operation2)
-IR_DEFINE_EXPLICIT_TYPE_ID(Operation2)
-
-// Define a dialect, op1 and op2 will be registered by this dialect.
-class TestDialect : public pir::Dialect {
- public:
-  explicit TestDialect(pir::IrContext *context)
-      : pir::Dialect(name(), context, pir::TypeId::get<TestDialect>()) {
-    initialize();
-  }
-  static const char *name() { return "test"; }
-
-  void PrintOperation(pir::Operation *op,
-                      pir::IrPrinter &printer) const override {
-    printer.PrintOpResult(op);
-    printer.os << " =";
-
-    printer.os << " \"" << op->name() << "\"";
-    printer.PrintOpOperands(op);
-  }
-
- private:
-  void initialize() { RegisterOps<Operation1, Operation2>(); }
-};
-IR_DECLARE_EXPLICIT_TYPE_ID(TestDialect)
-IR_DEFINE_EXPLICIT_TYPE_ID(TestDialect)
-
-TEST(op_test, op_test) {
-  // (1) Register Dialect, Operation1, Operation2 into IrContext.
-  pir::IrContext *ctx = pir::IrContext::Instance();
-  pir::Dialect *test_dialect = ctx->GetOrRegisterDialect<TestDialect>();
-  EXPECT_EQ(test_dialect != nullptr, true);
-
-  // (2) Get registered operations.
-  std::string op1_name = Operation1::name();
-  pir::OpInfo op1_info = ctx->GetRegisteredOpInfo(op1_name);
-  EXPECT_TRUE(op1_info);
-  std::string op2_name = Operation2::name();
-  pir::OpInfo op2_info = ctx->GetRegisteredOpInfo(op2_name);
-  EXPECT_TRUE(op2_info);
-  EXPECT_EQ(op1_info.HasTrait<ReadOnlyTrait>(), false);
-  EXPECT_EQ(op1_info.HasInterface<InferShapeInterface>(), false);
-  EXPECT_EQ(op2_info.HasTrait<ReadOnlyTrait>(), true);
-  EXPECT_EQ(op2_info.HasInterface<InferShapeInterface>(), true);
-
-  // (3) Test uses for op.
-  std::vector<pir::Value> op_inputs = {};
-  std::vector<pir::Type> op_output_types = {pir::Float32Type::get(ctx)};
-  pir::Operation *op2 =
-      pir::Operation::Create(op_inputs,
-                             CreateAttributeMap({"op2_attr1", "op2_attr2"},
-                                                {"op2_attr1", "op2_attr2"}),
-                             op_output_types,
-                             op2_info);
-
-  ReadOnlyTrait trait = op2->dyn_cast<ReadOnlyTrait>();
-  EXPECT_EQ(trait.operation(), op2);
-  InferShapeInterface interface = op2->dyn_cast<InferShapeInterface>();
-  interface.InferShape();
-  Operation2 Op2 = op2->dyn_cast<Operation2>();
-  EXPECT_EQ(Op2.operation(), op2);
-  op2->Destroy();
-}
-
 TEST(op_test, region_test) {
   // (1) Register Dialect, Operation1, Operation2 into IrContext.
   pir::IrContext *ctx = pir::IrContext::Instance();
-  pir::Dialect *test_dialect = ctx->GetOrRegisterDialect<TestDialect>();
+  pir::Dialect *test_dialect = ctx->GetOrRegisterDialect<test::TestDialect>();
   EXPECT_EQ(test_dialect != nullptr, true);
 
   // (2) Get registered operations.
-  pir::OpInfo op1_info = ctx->GetRegisteredOpInfo(Operation1::name());
-  pir::OpInfo op2_info = ctx->GetRegisteredOpInfo(Operation2::name());
+  pir::OpInfo op1_info = ctx->GetRegisteredOpInfo(test::Operation1::name());
+  pir::OpInfo op2_info = ctx->GetRegisteredOpInfo(test::Operation2::name());
 
   pir::Operation *op1 =
       pir::Operation::Create({},
@@ -224,16 +59,10 @@ TEST(op_test, region_test) {
                                                 {"op1_attr1", "op1_attr2"}),
                              {pir::Float32Type::get(ctx)},
                              op1_info);
-  pir::Operation *op1_2 =
-      pir::Operation::Create({},
-                             CreateAttributeMap({"op1_attr1", "op1_attr2"},
-                                                {"op1_attr1", "op1_attr2"}),
-                             {pir::Float32Type::get(ctx)},
-                             op1_info);
+  pir::Operation *op_2 =
+      pir::Operation::Create({}, {}, {pir::Float32Type::get(ctx)}, op2_info);
 
   pir::OperationArgument argument(op2_info);
-  argument.attributes = CreateAttributeMap({"op2_attr1", "op2_attr2"},
-                                           {"op2_attr1", "op2_attr2"});
   argument.output_types = {pir::Float32Type::get(ctx)};
   argument.num_regions = 1;
 
@@ -252,7 +81,7 @@ TEST(op_test, region_test) {
   region.insert(region.begin(), new pir::Block());
   pir::Block *block = region.front();
   block->push_front(op1);
-  block->insert(block->begin(), op1_2);
+  block->insert(block->begin(), op_2);
   op3->Destroy();
 }
 
@@ -279,3 +108,22 @@ TEST(op_test, module_op_death) {
   program.module_op()->set_attribute("program",
                                      pir::PointerAttribute::get(ctx, &program));
 }
+
+TEST(op_test, trait_and_interface) {
+  pir::IrContext ctx;
+  ctx.GetOrRegisterDialect<test::TestDialect>();
+  pir::Program program(&ctx);
+  auto block = program.block();
+  pir::Builder builder(&ctx, block);
+  auto op1 = builder.Build<test::Operation1>();
+  auto op2 = builder.Build<test::Operation2>();
+
+  EXPECT_EQ(op1->HasTrait<test::ReadOnlyTrait>(), false);
+  EXPECT_EQ(op1->HasInterface<test::InferShapeInterface>(), false);
+  EXPECT_EQ(op2->HasTrait<test::ReadOnlyTrait>(), true);
+  EXPECT_EQ(op2->HasInterface<test::InferShapeInterface>(), true);
+
+  pir::OperationArgument argument(&ctx, "test.region");
+  argument.num_regions = 2u;
+  EXPECT_THROW(builder.Build(argument), pir::IrNotMetException);
+}
diff --git a/test/cpp/pir/tools/CMakeLists.txt b/test/cpp/pir/tools/CMakeLists.txt
index 64e5b97243620..5a1f073698833 100644
--- a/test/cpp/pir/tools/CMakeLists.txt
+++ b/test/cpp/pir/tools/CMakeLists.txt
@@ -1,4 +1,4 @@
 cc_library(
   test_dialect
-  SRCS test_dialect.cc test_op.cc
+  SRCS test_dialect.cc test_op.cc test_trait.cc test_interface.cc
   DEPS pir)
diff --git a/test/cpp/pir/tools/test_dialect.cc b/test/cpp/pir/tools/test_dialect.cc
index bf94e8db3dce1..49fb4a6951dd7 100644
--- a/test/cpp/pir/tools/test_dialect.cc
+++ b/test/cpp/pir/tools/test_dialect.cc
@@ -12,8 +12,25 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "test/cpp/pir/tools/test_dialect.h"
+#include "paddle/pir/core/ir_printer.h"
 #include "test/cpp/pir/tools/test_op.h"
 namespace test {
-void TestDialect::initialize() { RegisterOps<RegionOp, BranchOp>(); }
+
+TestDialect::TestDialect(pir::IrContext *context)
+    : pir::Dialect(name(), context, pir::TypeId::get<TestDialect>()) {
+  initialize();
+}
+void TestDialect::initialize() {
+  RegisterOps<RegionOp, BranchOp, Operation1, Operation2>();
+}
+
+void TestDialect::PrintOperation(pir::Operation *op,
+                                 pir::IrPrinter &printer) const {
+  printer.PrintOpResult(op);
+  printer.os << " =";
+
+  printer.os << " \"" << op->name() << "\"";
+  printer.PrintOpOperands(op);
+}
 }  // namespace test
 IR_DEFINE_EXPLICIT_TYPE_ID(test::TestDialect)
diff --git a/test/cpp/pir/tools/test_dialect.h b/test/cpp/pir/tools/test_dialect.h
index 8b259c5563c4b..c3594273b5355 100644
--- a/test/cpp/pir/tools/test_dialect.h
+++ b/test/cpp/pir/tools/test_dialect.h
@@ -19,11 +19,10 @@
 namespace test {
 class TestDialect : public pir::Dialect {
  public:
-  explicit TestDialect(pir::IrContext *context)
-      : pir::Dialect(name(), context, pir::TypeId::get<TestDialect>()) {
-    initialize();
-  }
+  explicit TestDialect(pir::IrContext *context);
   static const char *name() { return "test"; }
+  void PrintOperation(pir::Operation *op,
+                      pir::IrPrinter &printer) const override;
 
  private:
   void initialize();
diff --git a/test/cpp/pir/tools/test_interface.cc b/test/cpp/pir/tools/test_interface.cc
new file mode 100644
index 0000000000000..b0d72b48baa20
--- /dev/null
+++ b/test/cpp/pir/tools/test_interface.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "test/cpp/pir/tools/test_interface.h"
+IR_DEFINE_EXPLICIT_TYPE_ID(test::InferShapeInterface)
diff --git a/test/cpp/pir/tools/test_interface.h b/test/cpp/pir/tools/test_interface.h
new file mode 100644
index 0000000000000..a2de7e1bb6972
--- /dev/null
+++ b/test/cpp/pir/tools/test_interface.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <gtest/gtest.h>
+#include <sstream>
+
+#include "paddle/pir/core/block.h"
+#include "paddle/pir/core/builder.h"
+#include "paddle/pir/core/builtin_attribute.h"
+#include "paddle/pir/core/builtin_op.h"
+#include "paddle/pir/core/builtin_type.h"
+#include "paddle/pir/core/dialect.h"
+#include "paddle/pir/core/enforce.h"
+#include "paddle/pir/core/ir_context.h"
+#include "paddle/pir/core/ir_printer.h"
+#include "paddle/pir/core/op_base.h"
+#include "paddle/pir/core/program.h"
+#include "paddle/pir/core/region.h"
+
+namespace test {
+/// \brief Define built-in Interface, derived from OpInterfaceBase. Concepts and
+/// Models need to be defined within the class. Concept defines abstract
+/// interface functions, and Model is a template class that defines the specific
+/// implementation of interface functions based on template parameters.
+class InferShapeInterface : public pir::OpInterfaceBase<InferShapeInterface> {
+ public:
+  struct Concept {
+    explicit Concept(void (*infer_shape)(pir::Operation *))
+        : infer_shape(infer_shape) {}
+    void (*infer_shape)(pir::Operation *);
+  };
+
+  template <class ConcreteOp>
+  struct Model : public Concept {
+    static void InferShape(pir::Operation *op) {
+      ConcreteOp concret_op = ConcreteOp(op);
+      if (concret_op == nullptr) throw("concret_op is nullptr");
+      concret_op.InferShape();
+    }
+
+    Model() : Concept(InferShape) {}
+  };
+
+  InferShapeInterface(pir::Operation *op, Concept *impl)
+      : pir::OpInterfaceBase<InferShapeInterface>(op), impl_(impl) {}
+
+  void InferShape() { impl_->infer_shape(operation()); }
+
+ private:
+  Concept *impl_;
+};
+
+}  // namespace test
+IR_DECLARE_EXPLICIT_TYPE_ID(test::InferShapeInterface)
diff --git a/test/cpp/pir/tools/test_op.cc b/test/cpp/pir/tools/test_op.cc
index 9adce7ea402e9..99515ecf2e2e1 100644
--- a/test/cpp/pir/tools/test_op.cc
+++ b/test/cpp/pir/tools/test_op.cc
@@ -12,17 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "test/cpp/pir/tools/test_op.h"
+#include "paddle/pir/core/builtin_attribute.h"
+#include "paddle/pir/core/enforce.h"
 
 namespace test {
+
 void RegionOp::Build(pir::Builder &builder, pir::OperationArgument &argument) {
   argument.num_regions = 1;
 }
-void RegionOp::Verify() const {
-  auto num_regions = (*this)->num_regions();
-  IR_ENFORCE(num_regions == 1u,
-             "The region's number in Region Op must be 1, but current is %d",
-             num_regions);
-}
 
 void BranchOp::Build(pir::Builder &builder,  // NOLINT
                      pir::OperationArgument &argument,
@@ -38,7 +35,32 @@ void BranchOp::Verify() const {
   IR_ENFORCE((*this)->successor(0), "successor[0] can't be nullptr");
 }
 
+const char *Operation1::attributes_name[2] = {  // NOLINT
+    "op1_attr1",
+    "op1_attr2"};
+
+void Operation1::Build(pir::Builder &builder,               // NOLINT
+                       pir::OperationArgument &argument) {  // NOLINT
+  std::unordered_map<std::string, pir::Attribute> attributes{
+      {"op1_attr1", builder.str_attr("op1_attr2")},
+      {"op1_attr2", builder.str_attr("op1_attr2")}};
+  argument.AddOutput(builder.float32_type());
+  argument.AddAttributes(attributes);
+}
+void Operation1::Verify() const {
+  auto &attributes = this->attributes();
+  if (attributes.count("op1_attr1") == 0 ||
+      !attributes.at("op1_attr1").isa<pir::StrAttribute>()) {
+    throw("Type of attribute: parameter_name is not right.");
+  }
+  if (attributes.count("op1_attr2") == 0 ||
+      !attributes.at("op1_attr2").isa<pir::StrAttribute>()) {
+    throw("Type of attribute: parameter_name is not right.");
+  }
+}
 }  // namespace test
 
 IR_DEFINE_EXPLICIT_TYPE_ID(test::RegionOp)
 IR_DEFINE_EXPLICIT_TYPE_ID(test::BranchOp)
+IR_DEFINE_EXPLICIT_TYPE_ID(test::Operation1)
+IR_DEFINE_EXPLICIT_TYPE_ID(test::Operation2)
diff --git a/test/cpp/pir/tools/test_op.h b/test/cpp/pir/tools/test_op.h
index 9e0f9f1e933b2..8d4ccd49a38ed 100644
--- a/test/cpp/pir/tools/test_op.h
+++ b/test/cpp/pir/tools/test_op.h
@@ -15,13 +15,17 @@
 #pragma once
 
 #include "paddle/pir/core/builder.h"
+#include "paddle/pir/core/builtin_type.h"
 #include "paddle/pir/core/op_base.h"
+#include "paddle/pir/core/operation_utils.h"
+#include "test/cpp/pir/tools/test_interface.h"
+#include "test/cpp/pir/tools/test_trait.h"
 
 namespace test {
 ///
 /// \brief TestRegionOp
 ///
-class RegionOp : public pir::Op<RegionOp> {
+class RegionOp : public pir::Op<RegionOp, OneRegionTrait> {
  public:
   using Op::Op;
   static const char *name() { return "test.region"; }
@@ -29,7 +33,7 @@ class RegionOp : public pir::Op<RegionOp> {
   static constexpr const char **attributes_name = nullptr;
   static void Build(pir::Builder &builder,              // NOLINT
                     pir::OperationArgument &argument);  // NOLINT
-  void Verify() const;
+  void Verify() const {}
 };
 
 ///
@@ -48,7 +52,35 @@ class BranchOp : public pir::Op<BranchOp> {
   void Verify() const;
 };
 
+// Define case op1.
+class Operation1 : public pir::Op<Operation1> {
+ public:
+  using Op::Op;
+  static const char *name() { return "test.operation1"; }
+  static constexpr uint32_t attributes_num = 2;
+  static const char *attributes_name[attributes_num];   // NOLINT
+  static void Build(pir::Builder &builder,              // NOLINT
+                    pir::OperationArgument &argument);  // NOLINT
+  void Verify() const;
+};
+
+// Define op2.
+class Operation2
+    : public pir::Op<Operation2, ReadOnlyTrait, InferShapeInterface> {
+ public:
+  using Op::Op;
+  static const char *name() { return "test.operation2"; }
+  static constexpr uint32_t attributes_num = 0;
+  static constexpr const char **attributes_name = nullptr;  // NOLINT
+  static void Build(pir::Builder &builder,                  // NOLINT
+                    pir::OperationArgument &argument) {}    // NOLINT
+  void Verify() const {}
+  static void InferShape() { VLOG(2) << "This is op2's InferShape interface."; }
+};
+
 }  // namespace test
 
 IR_DECLARE_EXPLICIT_TYPE_ID(test::RegionOp)
 IR_DECLARE_EXPLICIT_TYPE_ID(test::BranchOp)
+IR_DECLARE_EXPLICIT_TYPE_ID(test::Operation1)
+IR_DECLARE_EXPLICIT_TYPE_ID(test::Operation2)
diff --git a/test/cpp/pir/tools/test_trait.cc b/test/cpp/pir/tools/test_trait.cc
new file mode 100644
index 0000000000000..1fa5dd0bba911
--- /dev/null
+++ b/test/cpp/pir/tools/test_trait.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "test/cpp/pir/tools/test_trait.h"
+#include "glog/logging.h"
+
+#include "paddle/pir/core/enforce.h"
+
+namespace test {
+void OneRegionTrait::Verify(pir::Operation *op) {
+  VLOG(1) << "here";
+  IR_ENFORCE(op->num_regions() == 1u,
+             "%s op has one region trait, but its region size is %u",
+             op->name(),
+             op->num_regions());
+}
+}  // namespace test
+
+IR_DEFINE_EXPLICIT_TYPE_ID(test::ReadOnlyTrait)
+IR_DEFINE_EXPLICIT_TYPE_ID(test::OneRegionTrait)
diff --git a/test/cpp/pir/tools/test_trait.h b/test/cpp/pir/tools/test_trait.h
new file mode 100644
index 0000000000000..cc002081dddc2
--- /dev/null
+++ b/test/cpp/pir/tools/test_trait.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <gtest/gtest.h>
+#include <sstream>
+
+#include "paddle/pir/core/op_base.h"
+
+namespace test {
+
+class ReadOnlyTrait : public pir::OpTraitBase<ReadOnlyTrait> {
+ public:
+  explicit ReadOnlyTrait(pir::Operation *op)
+      : pir::OpTraitBase<ReadOnlyTrait>(op) {}
+};
+
+class OneRegionTrait : public pir::OpTraitBase<OneRegionTrait> {
+ public:
+  explicit OneRegionTrait(pir::Operation *op)
+      : pir::OpTraitBase<OneRegionTrait>(op) {}
+  static void Verify(pir::Operation *op);
+};
+
+}  // namespace test
+IR_DECLARE_EXPLICIT_TYPE_ID(test::ReadOnlyTrait)
+IR_DECLARE_EXPLICIT_TYPE_ID(test::OneRegionTrait)

From 71704ae4288371a65090d1cb03adce38f8cc6115 Mon Sep 17 00:00:00 2001
From: wanghuancoder <wanghuan29@baidu.com>
Date: Fri, 22 Sep 2023 09:22:38 +0800
Subject: [PATCH 042/115] speed up in_dynamic_or_pir_mode (#57586)

---
 python/paddle/base/framework.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py
index d3f17ea6435e9..83f3ee734b8f2 100644
--- a/python/paddle/base/framework.py
+++ b/python/paddle/base/framework.py
@@ -345,7 +345,7 @@ def in_dynamic_or_pir_mode():
             True
 
     """
-    return in_dygraph_mode() or in_pir_mode()
+    return global_var._dygraph_tracer_ is not None or global_var._use_pir_api_
 
 
 global_ipu_index = -1

From c2ea73e5c1ec5e0c3c2d718096c0598505f14a57 Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Fri, 22 Sep 2023 09:31:53 +0800
Subject: [PATCH 043/115] =?UTF-8?q?=E3=80=90PIR=E3=80=91modify=20Subtract?=
 =?UTF-8?q?=20optest=20(#57608)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* modify ci bug

* add sub test

* modify pd name
---
 paddle/fluid/pir/transforms/inplace_pass.cc |  3 +-
 python/paddle/autograd/ir_backward.py       |  2 +-
 test/legacy_test/test_elementwise_sub_op.py | 52 +++++++++++++++------
 3 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/pir/transforms/inplace_pass.cc b/paddle/fluid/pir/transforms/inplace_pass.cc
index 3f419f20fb99a..6010af208fae6 100644
--- a/paddle/fluid/pir/transforms/inplace_pass.cc
+++ b/paddle/fluid/pir/transforms/inplace_pass.cc
@@ -204,7 +204,8 @@ static std::unordered_map<pir::Operation*, std::string> GetInplaceOps(
     // NOTE(zhangbo): add_grad cpu kernel can't do inplace, for the reason shown
     // in the function: CommonElementwiseBroadcastBackward
     // (paddle/phi/kernels/funcs/elementwise_grad_base.h)
-    if ((upper_op_name == "pd_op.add_grad") &&
+    if ((upper_op_name == "pd_op.add_grad" ||
+         upper_op_name == "pd_op.subtract_grad") &&
         (upper_op_attrs.at("kernel_key")
              .dyn_cast<paddle::dialect::KernelAttribute>()
              .data()
diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py
index f8a2aae71b0cd..bb73c1d670cee 100644
--- a/python/paddle/autograd/ir_backward.py
+++ b/python/paddle/autograd/ir_backward.py
@@ -288,7 +288,7 @@ def inverse_sort_op(ops):
     sorted_list = []
     for op in ops:
         for x in op.operands():
-            if x.source().get_defining_op() in ops_set:
+            if x.source() and x.source().get_defining_op() in ops_set:
                 pending_count[x.source().get_defining_op()] += 1
 
     queue = collections.deque()
diff --git a/test/legacy_test/test_elementwise_sub_op.py b/test/legacy_test/test_elementwise_sub_op.py
index d3e96d158d98c..62176d28595fc 100644
--- a/test/legacy_test/test_elementwise_sub_op.py
+++ b/test/legacy_test/test_elementwise_sub_op.py
@@ -44,10 +44,12 @@ def init_dtype(self):
         self.dtype = np.float64
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', check_prim=self.check_prim)
+        self.check_grad(
+            ['X', 'Y'], 'Out', check_prim=self.check_prim, check_new_ir=True
+        )
 
     def test_check_grad_ingore_x(self):
         self.check_grad(
@@ -56,6 +58,7 @@ def test_check_grad_ingore_x(self):
             max_relative_error=0.005,
             no_grad_set=set("X"),
             check_prim=self.check_prim,
+            check_new_ir=True,
         )
 
     def test_check_grad_ingore_y(self):
@@ -65,6 +68,7 @@ def test_check_grad_ingore_y(self):
             max_relative_error=0.005,
             no_grad_set=set('Y'),
             check_prim=self.check_prim,
+            check_new_ir=True,
         )
 
     def if_check_prim(self):
@@ -116,7 +120,12 @@ def test_check_grad_normal(self):
     def test_check_grad_ingore_x(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['Y'], 'Out', no_grad_set=set("X"), max_relative_error=0.1
+            place,
+            ['Y'],
+            'Out',
+            no_grad_set=set("X"),
+            max_relative_error=0.1,
+            check_new_ir=True,
         )
 
     def test_check_grad_ingore_y(self):
@@ -128,6 +137,7 @@ def test_check_grad_ingore_y(self):
             no_grad_set=set('Y'),
             max_relative_error=0.1,
             check_prim=True,
+            check_new_ir=True,
         )
 
 
@@ -372,10 +382,12 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output(check_dygraph=False)
+        self.check_output(check_dygraph=False, check_new_ir=False)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', check_dygraph=False)
+        self.check_grad(
+            ['X', 'Y'], 'Out', check_dygraph=False, check_new_ir=False
+        )
 
     def test_check_grad_ingore_x(self):
         self.check_grad(
@@ -384,6 +396,7 @@ def test_check_grad_ingore_x(self):
             max_relative_error=0.005,
             no_grad_set=set("X"),
             check_dygraph=False,
+            check_new_ir=False,
         )
 
     def test_check_grad_ingore_y(self):
@@ -393,6 +406,7 @@ def test_check_grad_ingore_y(self):
             max_relative_error=0.005,
             no_grad_set=set('Y'),
             check_dygraph=False,
+            check_new_ir=False,
         )
 
 
@@ -427,24 +441,36 @@ def setUp(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place, check_dygraph=False)
+        self.check_output_with_place(
+            place, check_dygraph=False, check_new_ir=False
+        )
 
     def test_check_grad_normal(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['X', 'Y'], 'Out', check_dygraph=False
+            place, ['X', 'Y'], 'Out', check_dygraph=False, check_new_ir=False
         )
 
     def test_check_grad_ingore_x(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['Y'], 'Out', no_grad_set=set("X"), check_dygraph=False
+            place,
+            ['Y'],
+            'Out',
+            no_grad_set=set("X"),
+            check_dygraph=False,
+            check_new_ir=False,
         )
 
     def test_check_grad_ingore_y(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['X'], 'Out', no_grad_set=set('Y'), check_dygraph=False
+            place,
+            ['X'],
+            'Out',
+            no_grad_set=set('Y'),
+            check_dygraph=False,
+            check_new_ir=False,
         )
 
 
@@ -810,13 +836,11 @@ def init_input_output(self):
         self.out = self.x - self.y
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=False)
 
     def test_check_grad_normal(self):
         self.check_grad(
-            ['X', 'Y'],
-            'Out',
-            check_prim=self.check_prim,
+            ['X', 'Y'], 'Out', check_prim=self.check_prim, check_new_ir=False
         )
 
     def test_check_grad_ingore_x(self):
@@ -825,6 +849,7 @@ def test_check_grad_ingore_x(self):
             'Out',
             no_grad_set=set("X"),
             check_prim=self.check_prim,
+            check_new_ir=False,
         )
 
     def test_check_grad_ingore_y(self):
@@ -833,6 +858,7 @@ def test_check_grad_ingore_y(self):
             'Out',
             no_grad_set=set('Y'),
             check_prim=self.check_prim,
+            check_new_ir=False,
         )
 
     def if_enable_cinn(self):

From bab444ec6c994d297b7ce3b53eb9569bfef744a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?=
 <39978853+zhoutianzi666@users.noreply.github.com>
Date: Fri, 22 Sep 2023 09:42:19 +0800
Subject: [PATCH 044/115] Support blip2-opt2.7b (#57167)

support fused_bias_act_kernel with relu
---
 .../fluid/operators/fused/fused_multi_transformer_op.cu.h | 4 ++++
 paddle/phi/infermeta/multiary.cc                          | 2 +-
 paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu    | 5 +++++
 paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h      | 8 ++++++++
 .../phi/kernels/fusion/gpu/masked_multihead_attention.cu  | 5 +++++
 5 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
index 9587f9a61096c..f27644f1abd0d 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
@@ -1256,6 +1256,10 @@ void fmha(const phi::GPUContext &dev_ctx,
     case 64:
       fmha_launch_kernel<T, 64, 64>(params, dev_ctx.stream());
       break;
+    // opt model
+    case 80:
+      fmha_launch_kernel<T, 80, 128>(params, dev_ctx.stream());
+      break;
     case 96:
       fmha_launch_kernel<T, 96, 128>(params, dev_ctx.stream());
       break;
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 6b09dd22db263..d426924e26abb 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -1510,7 +1510,7 @@ void FusedBiasActInferMeta(const MetaTensor& x,
             "The seconde dimension of x must be even, but receive %d", dim));
     dim /= 2;
     out->set_dims(phi::make_ddim({token_num, dim}));
-  } else if (act_method == "gelu") {
+  } else if (act_method == "gelu" || act_method == "relu") {
     out->set_dims(phi::make_ddim({token_num, dim}));
   } else {
     PADDLE_THROW(
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu
index 8f75d91fc682d..2629ee4fdd6b9 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu
@@ -221,6 +221,11 @@ void ComputeImpl(const Context &dev_ctx,
       LaunchBiasAct<T, Context, GeluFunctor<T>, LoadFunc, StoreFunc, LoadT>(
           dev_ctx, bias_data, rows, cols, load_func, store_func);
     }
+  } else if (act_method == "relu") {
+    VLOG(8) << "Doing RELU";
+    // for opt model
+    LaunchBiasAct<T, Context, ReluFunctor<T>, LoadFunc, StoreFunc, LoadT>(
+        dev_ctx, bias_data, rows, cols, load_func, store_func);
   } else {
     PADDLE_THROW(phi::errors::Unimplemented(
         "Currently Only Support GeGLU, SwiGLU, GeLU"));
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h b/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h
index 93ed50ec4e0df..96d159e091f14 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_act_utils.h
@@ -100,6 +100,14 @@ struct GeluFunctor {
   }
 };
 
+template <typename T>
+struct ReluFunctor {
+  inline __host__ __device__ T operator()(const T x) const {
+    T zero = static_cast<T>(0.0);
+    return x > zero ? x : zero;
+  }
+};
+
 template <typename T>
 struct FastGeluFunctor {
   inline __device__ T operator()(const T x) const {
diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention.cu
index 312f81ae31a61..8554378d3d4b1 100644
--- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention.cu
+++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention.cu
@@ -964,6 +964,11 @@ void fmha_impl(const phi::GPUContext &dev_ctx,
       fmha_launch_kernel<T, 64, 64>(
           params, dev_ctx.stream(), load_func, store_func);
       break;
+    // for opt model
+    case 80:
+      fmha_launch_kernel<T, 80, 128>(
+          params, dev_ctx.stream(), load_func, store_func);
+      break;
     case 96:
       fmha_launch_kernel<T, 96, 128>(
           params, dev_ctx.stream(), load_func, store_func);

From 9796bb86f4c46a179855d94f3ef682e6d5759752 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 22 Sep 2023 09:42:30 +0800
Subject: [PATCH 045/115] [AutoParallel] Generate replicated spmd for PHI API
 and verify DP MP strategy  (#57505)

* generate forward defalut spmd

* generate bwd default spmd rule

* test relu and mse forward success

* test mse loss fwd and bwd

* updarte replicated rule name

* update single strategy test

* add unittests

* polish details

* remove useless seed

* fix dist branch test error
---
 paddle/fluid/eager/tensor_wrapper.h           |  13 +-
 paddle/fluid/eager/utils.h                    |  32 ++-
 paddle/phi/api/lib/api_gen_utils.cc           |  18 +-
 paddle/phi/api/lib/api_gen_utils.h            |  11 +-
 paddle/phi/api/lib/data_transform.cc          |  83 +++++--
 paddle/phi/api/lib/data_transform.h           |   6 +
 paddle/phi/api/yaml/generator/dist_api_gen.py | 190 +++++++++++-----
 .../phi/api/yaml/generator/dist_bw_api_gen.py |  40 +++-
 .../distributed/auto_parallel/dist_tensor.cc  |  10 +-
 .../distributed/auto_parallel/dist_tensor.h   |   4 +
 .../auto_parallel/p_to_r_reshard_function.cc  |   3 +
 .../auto_parallel/r_to_p_reshard_function.cc  |   3 +
 .../auto_parallel/r_to_s_reshard_function.cc  |   1 +
 .../distributed/auto_parallel/reshard_utils.h |   9 +
 .../auto_parallel/s_to_r_reshard_function.cc  |   1 +
 .../auto_parallel/s_to_s_reshard_function.cc  |   3 +
 .../phi/core/distributed/store/tcp_store.cc   |  42 ++--
 test/auto_parallel/CMakeLists.txt             |   6 +-
 .../semi_auto_parallel_for_matmul.py          |  29 ++-
 .../semi_auto_parallel_for_replicated_spmd.py | 111 +++++++++
 .../semi_auto_parallel_simple_net.py          | 210 ++++++++++++++++++
 test/auto_parallel/test_api_dist_branch.py    |  60 ++++-
 test/auto_parallel/test_dist_tensor.py        |  70 ------
 .../test_semi_auto_parallel_basic.py          |  22 +-
 ...test_semi_auto_parallel_single_strategy.py |  40 ++++
 25 files changed, 799 insertions(+), 218 deletions(-)
 create mode 100644 test/auto_parallel/semi_auto_parallel_for_replicated_spmd.py
 create mode 100644 test/auto_parallel/semi_auto_parallel_simple_net.py
 create mode 100644 test/auto_parallel/test_semi_auto_parallel_single_strategy.py

diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h
index f94ec416d4f3c..7aa6ec8e6cddb 100644
--- a/paddle/fluid/eager/tensor_wrapper.h
+++ b/paddle/fluid/eager/tensor_wrapper.h
@@ -69,15 +69,16 @@ class TensorWrapper {
             std::make_shared<phi::Allocation>(nullptr, 0, tensor.place()),
             dense_tensor->meta()));
       } else if (phi::distributed::DistTensor::classof(tensor.impl().get())) {
-        // Only Copy Meta
+        // Copy Global dims, DistAttr and DenseTensorMeta
         phi::distributed::DistTensor* dist_tensor =
             static_cast<phi::distributed::DistTensor*>(tensor.impl().get());
-        intermidiate_tensor_.set_impl(
+        auto no_buffer_dist_tensor =
             std::make_shared<phi::distributed::DistTensor>(
-                phi::DenseTensor(std::make_shared<phi::Allocation>(
-                                     nullptr, 0, tensor.place()),
-                                 dist_tensor->value().meta()),
-                dist_tensor->dist_attr()));
+                dist_tensor->dims(), dist_tensor->dist_attr());
+        *no_buffer_dist_tensor->unsafe_mutable_value() = phi::DenseTensor(
+            std::make_shared<phi::Allocation>(nullptr, 0, tensor.place()),
+            dist_tensor->value().meta());
+        intermidiate_tensor_.set_impl(no_buffer_dist_tensor);
       } else {
         PADDLE_THROW(paddle::platform::errors::Fatal(
             "Unrecognized tensor type for no_need_buffer feature"));
diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h
index 7f2fcbb9ba771..35fa60d87c2ff 100644
--- a/paddle/fluid/eager/utils.h
+++ b/paddle/fluid/eager/utils.h
@@ -308,10 +308,10 @@ class EagerUtils {
         "Type: %s, Dtype: %s, Place: %s, Shape: %s, DistAttr: %s";
     std::string tensor_info_str = "";
     if (t.defined()) {
-      if (t.initialized()) {
-        if (t.is_dist_tensor()) {
-          auto dist_t =
-              std::static_pointer_cast<phi::distributed::DistTensor>(t.impl());
+      if (t.is_dist_tensor()) {
+        auto dist_t =
+            std::static_pointer_cast<phi::distributed::DistTensor>(t.impl());
+        if (t.initialized()) {
           tensor_info_str += paddle::string::Sprintf(
               TENSOR_INFO_TEMPLATE,
               t.impl()->type_info().name(),
@@ -321,6 +321,16 @@ class EagerUtils {
                   "%s, Local Shape: %s", t.dims(), dist_t->local_dims()),
               dist_t->dist_attr());
         } else {
+          tensor_info_str +=
+              paddle::string::Sprintf(TENSOR_INFO_TEMPLATE,
+                                      t.impl()->type_info().name(),
+                                      "Unknown",
+                                      "Unknown",
+                                      t.dims(),
+                                      dist_t->dist_attr());
+        }
+      } else {
+        if (t.initialized()) {
           tensor_info_str +=
               paddle::string::Sprintf(TENSOR_INFO_TEMPLATE,
                                       t.impl()->type_info().name(),
@@ -328,13 +338,15 @@ class EagerUtils {
                                       t.place().DebugString(),
                                       t.dims(),
                                       "Unknown");
+        } else {
+          tensor_info_str +=
+              paddle::string::Sprintf(TENSOR_INFO_TEMPLATE,
+                                      t.impl()->type_info().name(),
+                                      "Unknown",
+                                      "Unknown",
+                                      "Unknown",
+                                      "Unknown");
         }
-      } else {
-        tensor_info_str += paddle::string::Sprintf(TENSOR_INFO_TEMPLATE,
-                                                   t.impl()->type_info().name(),
-                                                   "Unknown",
-                                                   "Unknown",
-                                                   "Unknown");
       }
     } else {
       tensor_info_str += "Unknown";
diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc
index 2e914b3aa9653..3098145b801c7 100644
--- a/paddle/phi/api/lib/api_gen_utils.cc
+++ b/paddle/phi/api/lib/api_gen_utils.cc
@@ -551,8 +551,12 @@ phi::distributed::DistTensor* SetKernelDistOutput(
 }
 
 std::shared_ptr<phi::distributed::DistTensor> CreateKernelDistOutput(
-    const phi::distributed::TensorDistAttr& dist_attr) {
-  return std::make_shared<phi::distributed::DistTensor>(phi::DDim(), dist_attr);
+    Tensor* out, const phi::distributed::TensorDistAttr& dist_attr) {
+  if (out) {
+    return std::make_shared<phi::distributed::DistTensor>(phi::DDim(),
+                                                          dist_attr);
+  }
+  return nullptr;
 }
 
 std::vector<phi::distributed::DistTensor*> SetKernelDistOutput(
@@ -617,6 +621,16 @@ std::vector<phi::distributed::DistTensor*> SetKernelDistInplaceOptionalOutput(
   }
   return results;
 }
+void SetReplicatedDistAttrForOutput(
+    phi::distributed::DistTensor* out,
+    const phi::distributed::ProcessMesh& process_mesh) {
+  if (out) {
+    auto dist_attr =
+        phi::distributed::TensorDistAttr(phi::vectorize(out->dims()));
+    dist_attr.set_process_mesh(process_mesh);
+    out->unsafe_set_dist_attr(dist_attr);
+  }
+}
 
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h
index 1cec467630aca..a57d951ce738f 100644
--- a/paddle/phi/api/lib/api_gen_utils.h
+++ b/paddle/phi/api/lib/api_gen_utils.h
@@ -145,7 +145,9 @@ phi::distributed::DistTensor* SetKernelDistOutput(
         phi::distributed::TensorDistAttr());
 
 std::shared_ptr<phi::distributed::DistTensor> CreateKernelDistOutput(
-    const phi::distributed::TensorDistAttr& dist_attr);
+    Tensor* out,
+    const phi::distributed::TensorDistAttr& dist_attr =
+        phi::distributed::TensorDistAttr());
 
 std::vector<phi::distributed::DistTensor*> SetKernelDistOutput(
     std::vector<Tensor*> out);
@@ -159,5 +161,12 @@ std::vector<phi::distributed::DistTensor*> SetKernelDistInplaceOutput(
 std::vector<phi::distributed::DistTensor*> SetKernelDistInplaceOptionalOutput(
     size_t out_size, paddle::optional<std::vector<Tensor>> out);
 
+// DistTensor need to set initial dist attr after the dims setted, it is
+// constructed based dims and current process mesh, beforce calling this
+// function, the out should hold correct dims
+void SetReplicatedDistAttrForOutput(
+    phi::distributed::DistTensor* out,
+    const phi::distributed::ProcessMesh& process_mesh);
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc
index afa029cf2e7e5..8c9a57f264db4 100644
--- a/paddle/phi/api/lib/data_transform.cc
+++ b/paddle/phi/api/lib/data_transform.cc
@@ -601,6 +601,14 @@ void TransDataBackend(const phi::SelectedRows* tensor,
 
 /* ------------------ for auto parallel ----------------------- */
 
+static bool ReshardIsNeeded(
+    const phi::distributed::TensorDistAttr& in_dist_attr,
+    const phi::distributed::TensorDistAttr& out_dist_attr) {
+  return (in_dist_attr.process_mesh() != out_dist_attr.process_mesh() ||
+          in_dist_attr.dims_mapping() != out_dist_attr.dims_mapping() ||
+          in_dist_attr.partial_status() != out_dist_attr.partial_status());
+}
+
 std::string ReshardDebugInfo(
     const phi::distributed::DistTensor& src_tensor,
     const phi::distributed::TensorDistAttr& dist_attr) {
@@ -620,8 +628,8 @@ std::shared_ptr<phi::distributed::DistTensor> ReshardApiInputToKernelInput(
   if (tensor_in) {
     phi::distributed::DistTensor* dist_tensor =
         static_cast<phi::distributed::DistTensor*>(tensor_in.get());
-    if (dist_tensor->dist_attr() != dist_attr) {
-      VLOG(6) << "FwdAPI ApiIn to KernelIn - "
+    if (ReshardIsNeeded(dist_tensor->dist_attr(), dist_attr)) {
+      VLOG(6) << "ApiIn to KernelIn - "
               << ReshardDebugInfo(*dist_tensor, dist_attr);
       auto* func = phi::distributed::ChooseProperReshardFunction(*dist_tensor,
                                                                  dist_attr);
@@ -632,6 +640,36 @@ std::shared_ptr<phi::distributed::DistTensor> ReshardApiInputToKernelInput(
   return nullptr;
 }
 
+std::shared_ptr<phi::distributed::DistTensor>
+ReshardApiInputToReplicatedKernelInput(
+    phi::DeviceContext* dev_ctx,
+    const Tensor& tensor,
+    const phi::distributed::TensorDistAttr& dist_attr) {
+  auto tensor_in = tensor.impl();
+  if (tensor_in) {
+    phi::distributed::DistTensor* dist_tensor =
+        static_cast<phi::distributed::DistTensor*>(tensor_in.get());
+    if (ReshardIsNeeded(dist_tensor->dist_attr(), dist_attr)) {
+      VLOG(6) << "ApiIn to Replicated KernelIn - "
+              << ReshardDebugInfo(*dist_tensor, dist_attr);
+      if (dist_tensor->initialized()) {
+        auto* func = phi::distributed::ChooseProperReshardFunction(*dist_tensor,
+                                                                   dist_attr);
+        return func->Eval(dev_ctx, *dist_tensor, dist_attr);
+      } else {
+        // when no tensor data need to be reshard, we still need to set correct
+        // replicated dist attr and local dims for output
+        dist_tensor->unsafe_set_dist_attr(dist_attr);
+        auto dense_tensor_meta = dist_tensor->value().meta();
+        dense_tensor_meta.dims = dist_tensor->dims();
+        dist_tensor->unsafe_mutable_value()->set_meta(dense_tensor_meta);
+      }
+    }
+    return std::static_pointer_cast<phi::distributed::DistTensor>(tensor_in);
+  }
+  return nullptr;
+}
+
 void ReshardOutputPartialAxisToReplicated(
     phi::DeviceContext* dev_ctx, phi::distributed::DistTensor* out_tensor) {
   if (out_tensor->dist_attr().is_partial()) {
@@ -649,25 +687,30 @@ void ReshardKernelOutputToApiOutput(
     phi::DeviceContext* dev_ctx,
     const std::shared_ptr<phi::distributed::DistTensor>& src_tensor,
     Tensor* dst_tensor) {
-  auto tensor_out = dst_tensor->impl();
-  PADDLE_ENFORCE_NE(
-      tensor_out,
-      nullptr,
-      phi::errors::InvalidArgument("The output tensor is nullptr."));
-  phi::distributed::DistTensor* dist_tensor =
-      static_cast<phi::distributed::DistTensor*>(tensor_out.get());
-  dist_tensor->unsafe_set_dims(src_tensor->dims());
-  if (src_tensor->dist_attr() != dist_tensor->dist_attr()) {
-    VLOG(6) << "BwdAPI KernelOut to ApiOut - "
-            << ReshardDebugInfo(*src_tensor, dist_tensor->dist_attr());
-    auto* func = phi::distributed::ChooseProperReshardFunction(
-        *src_tensor, dist_tensor->dist_attr());
-    func->Eval(dev_ctx, *src_tensor, dist_tensor->dist_attr(), dist_tensor);
+  if (dst_tensor) {
+    auto tensor_out = dst_tensor->impl();
+    PADDLE_ENFORCE_NE(
+        tensor_out,
+        nullptr,
+        phi::errors::InvalidArgument("The output tensor is nullptr."));
+    phi::distributed::DistTensor* dist_tensor =
+        static_cast<phi::distributed::DistTensor*>(tensor_out.get());
+    dist_tensor->unsafe_set_dims(src_tensor->dims());
+    if (ReshardIsNeeded(src_tensor->dist_attr(), dist_tensor->dist_attr())) {
+      VLOG(6) << "BwdAPI KernelOut to ApiOut - "
+              << ReshardDebugInfo(*src_tensor, dist_tensor->dist_attr());
+      auto* func = phi::distributed::ChooseProperReshardFunction(
+          *src_tensor, dist_tensor->dist_attr());
+      func->Eval(dev_ctx, *src_tensor, dist_tensor->dist_attr(), dist_tensor);
+    } else {
+      // TODO(chenweihang): add dist attr compare and default copy rule to
+      // avoid add branch here
+      // shallow copy dense tensor
+      *dist_tensor->unsafe_mutable_value() = src_tensor->value();
+    }
   } else {
-    // TODO(chenweihang): add dist attr compare and default copy rule to
-    // avoid add branch here
-    // shallow copy dense tensor
-    *dist_tensor->unsafe_mutable_value() = src_tensor->value();
+    VLOG(3) << "The output tensor is nullptr when call "
+               "ReshardKernelOutputToApiOutput.";
   }
 }
 
diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h
index faa9d572899eb..25c0e4137aa7f 100644
--- a/paddle/phi/api/lib/data_transform.h
+++ b/paddle/phi/api/lib/data_transform.h
@@ -180,6 +180,12 @@ std::shared_ptr<phi::distributed::DistTensor> ReshardApiInputToKernelInput(
     const Tensor& tensor,
     const phi::distributed::TensorDistAttr& dist_attr);
 
+std::shared_ptr<phi::distributed::DistTensor>
+ReshardApiInputToReplicatedKernelInput(
+    phi::DeviceContext* dev_ctx,
+    const Tensor& tensor,
+    const phi::distributed::TensorDistAttr& dist_attr);
+
 void ReshardOutputPartialAxisToReplicated(
     phi::DeviceContext* dev_ctx, phi::distributed::DistTensor* out_tensor);
 
diff --git a/paddle/phi/api/yaml/generator/dist_api_gen.py b/paddle/phi/api/yaml/generator/dist_api_gen.py
index 4d51c72a0639c..c9885dec64c97 100644
--- a/paddle/phi/api/yaml/generator/dist_api_gen.py
+++ b/paddle/phi/api/yaml/generator/dist_api_gen.py
@@ -46,7 +46,7 @@
   if ({}) {{
     // 1. InferSpmd (Infer DistAttr of Inputs&Outputs){}
     // 2. Create API Output & Prepare Dist and Dense Output{}
-    // 3. Infer DistTensor's Global Shape{}
+    // 3. Infer DistTensor's Global Shape{}\n
     // 4. Select Kernel{}
     // 5. Reshard Input{}\n
     // 6. PrepareData (DataTransform & Prepare Dense Input){}
@@ -63,10 +63,16 @@
 
 # 1. InferSPMD
 SINGLE_DIST_META_IN_TEMPLATE = """
-    auto meta_dist_{} = MakeDistMetaTensor(*{}.impl());"""
+    auto meta_dist_input_{} = MakeDistMetaTensor(*{}.impl());"""
 INFER_SPMD_TEMPLATE = """
     auto spmd_info = phi::distributed::{}({});
 """
+GENERAL_INFER_SPMD_TEMPLATE = """
+    auto spmd_info = phi::distributed::VariadicReplicatedInferSpmd({});
+"""
+UNSUPPORTED_INFER_SPMD_COMMENT_TEMPLATE = """
+    // API `{}` does not support InferSpmd now
+"""
 
 # 2. Create API Outputs
 API_OUT_CREATION_TEMPLATE = """
@@ -162,6 +168,9 @@
 INFER_GLOBAL_SHAPE_TEMPLATE = """
     phi::{}({}{});
 """
+# Dist Branch will not generated in the API that doesn't have input tensor.
+SET_SINGLE_OUT_REPLICATED_DIST_ATTR = """
+    SetReplicatedDistAttrForOutput({}, spmd_info.first[0].process_mesh());"""
 
 # 4. Select Kernel
 KERNEL_SELECTION_TEMPLATE = """
@@ -176,6 +185,11 @@
 # 5. Reshard Input
 SINGLE_INPUT_RESHARD_TEMPLATE = """
     auto dist_input_{arg} = ReshardApiInputToKernelInput(dev_ctx, {arg}, spmd_info.first[{idx}]);"""
+SINGLE_GENERAL_INPUT_RESHARD_TEMPLATE = """
+    auto dist_input_{arg} = ReshardApiInputToReplicatedKernelInput(dev_ctx, {arg}, spmd_info.first[{idx}]);"""
+UNSUPPORTED_RESHARD_INPUT_COMMENT_TEMPLATE = """
+    // API `{}` does not need to support ReshardInput at this time
+"""
 
 # 6. PrepareData
 SINGLE_PREPARE_DATA_TEMPLATE = """
@@ -286,6 +300,9 @@
     ReshardOutputPartialAxisToReplicated(dev_ctx, dist_out);"""
 RESHARD_P2R_MULTI_SINGLE_OUTPUT_TEMPLATE = """
     ReshardOutputPartialAxisToReplicated(dev_ctx, dist_out_{});"""
+UNSUPPORTED_RESHARD_OUTPUT_COMMENT_TEMPLATE = """
+    // API `{}` does not need to support ReshardOutput now
+"""
 
 # BaseAPI members:
 # inputs:
@@ -335,6 +352,8 @@ def init_dist_api_members(self):
         self.inplace_flag = False
         self.dist_output_args = []
         self.dense_output_args = []
+        self.generate_infer_spmd = False
+        self.generate_general_infer_spmd = False
 
     # override BaseAPI's method
     def parse_infer_meta(self, infer_meta_config):
@@ -382,48 +401,103 @@ def generate_if_condition_code(self) -> str:
             input_args = input_args[:-2]
         return AUTO_PARALLEL_COND_TEMPLATE.format(input_args)
 
-    def generate_infer_spmd_code(self) -> str:
-        if self.infer_meta['spmd_rule'] is not None:
-            input_names = self.inputs['names']
-            attr_names = self.attrs['names']
-            kernel_param = self.kernel['param']
-            if kernel_param is None:
-                kernel_param = input_names + attr_names
-
-            input_decl_code = ""
-            input_args_code = ""
-            for param in kernel_param:
-                if param in input_names:
-                    if self.inputs['input_info'][param] == "const Tensor&":
-                        input_decl_code += SINGLE_DIST_META_IN_TEMPLATE.format(
-                            param, param
-                        )
-                        input_args_code += "meta_dist_" + param + ", "
-                    else:
-                        raise ValueError(
-                            f"{self.api} : Param of infer_spmd error : {self.inputs['input_info'][param]} type is not supported."
-                        )
-                elif param in attr_names:
-                    input_args_code = input_args_code + param + ", "
-                elif isinstance(param, str):
-                    input_args_code = input_args_code + "\"" + param + "\", "
-                elif isinstance(param, bool):
-                    input_args_code = (
-                        input_args_code + str(param).lower() + ", "
+    def generate_specialized_infer_spmd_code(self) -> str:
+        input_names = self.inputs['names']
+        attr_names = self.attrs['names']
+
+        # TODO(chenweihang): here we need to use infer_meta params,
+        # if it is inconsistent, you need to change the infermeta func
+        kernel_params = self.kernel['param']
+        if kernel_params is None:
+            kernel_params = input_names + attr_names
+
+        input_decl_code = ""
+        input_args_code = ""
+        for param in kernel_params:
+            if param in input_names:
+                if self.inputs['input_info'][param] == "const Tensor&":
+                    input_decl_code += SINGLE_DIST_META_IN_TEMPLATE.format(
+                        param, param
                     )
+                    input_args_code += "meta_dist_input_" + param + ", "
                 else:
-                    input_args_code = input_args_code + str(param) + ", "
+                    raise ValueError(
+                        f"{self.api} : Param of infer_spmd error : {self.inputs['input_info'][param]} type is not supported."
+                    )
+            elif param in attr_names:
+                input_args_code = input_args_code + param + ", "
+            elif isinstance(param, str):
+                input_args_code = input_args_code + "\"" + param + "\", "
+            elif isinstance(param, bool):
+                input_args_code = input_args_code + str(param).lower() + ", "
+            else:
+                input_args_code = input_args_code + str(param) + ", "
 
-            # TODO(chenweihang): add general spmd rule later
-            infer_spmd_code = ""
-            infer_spmd_func_code = self.infer_meta['spmd_rule']
-            infer_spmd_code = INFER_SPMD_TEMPLATE.format(
-                infer_spmd_func_code, input_args_code[:-2]
-            )
+        infer_spmd_code = ""
+        infer_spmd_func_code = self.infer_meta['spmd_rule']
+        infer_spmd_code = INFER_SPMD_TEMPLATE.format(
+            infer_spmd_func_code, input_args_code[:-2]
+        )
+        self.generate_infer_spmd = True
+
+        return input_decl_code + infer_spmd_code
+
+    def generate_general_infer_spmd_code(self) -> str:
+        input_names = self.inputs['names']
+        attr_names = self.attrs['names']
 
-            return input_decl_code + infer_spmd_code
+        # TODO(chenweihang): here we need use infer_meta params,
+        # if it is inconsistent, you need to change the infermeta func
+        kernel_params = self.kernel['param']
+        if kernel_params is None:
+            kernel_params = input_names + attr_names
+
+        input_decl_code = ""
+        input_args_code = ""
+        for param in kernel_params:
+            if param in input_names:
+                if self.inputs['input_info'][param] == "const Tensor&":
+                    input_decl_code += SINGLE_DIST_META_IN_TEMPLATE.format(
+                        param, param
+                    )
+                    input_args_code += "meta_dist_input_" + param + ", "
+                elif (
+                    self.inputs['input_info'][param]
+                    == "const std::vector<Tensor>&"
+                    or self.inputs['input_info'][param]
+                    == "const paddle::optional<Tensor>&"
+                    or self.inputs['input_info'][param]
+                    == "const paddle::optional<std::vector<Tensor>>&"
+                ):
+                    # TODO(chenweihang): support other input type later,
+                    # now only support single tensor input api
+                    input_decl_code = ""
+                    input_args_code = ""
+                    break
+                else:
+                    raise ValueError(
+                        f"{self.api} : Param of infer_spmd error : {self.inputs['input_info'][param]} type is not supported."
+                    )
+            else:
+                # do nothing
+                pass
+
+        if input_decl_code == "":
+            return UNSUPPORTED_INFER_SPMD_COMMENT_TEMPLATE.format(self.api)
+
+        infer_spmd_code = GENERAL_INFER_SPMD_TEMPLATE.format(
+            input_args_code[:-2]
+        )
+        self.generate_infer_spmd = True
+        self.generate_general_infer_spmd = True
+
+        return input_decl_code + infer_spmd_code
+
+    def generate_infer_spmd_code(self) -> str:
+        if self.infer_meta['spmd_rule'] is not None:
+            return self.generate_specialized_infer_spmd_code()
         else:
-            return ""
+            return self.generate_general_infer_spmd_code()
 
     def generate_output_creation_code(self) -> str:
         # forward api need to generate api and kernel outputs
@@ -601,6 +675,7 @@ def generate_infer_global_shape_code(self) -> str:
         # 3. get meta tensor output args
         output_decl_code = ""
         output_args_code = ""
+        set_out_dist_attr_code = ""
         for i, out_name in enumerate(self.dist_output_args):
             if self.outputs['types'][i] == 'std::vector<Tensor>':
                 output_decl_code += VECTOR_GLOBAL_META_OUT_DECL_TEMPLATE.format(
@@ -617,6 +692,10 @@ def generate_infer_global_shape_code(self) -> str:
                     output_args_code += (
                         f"{out_name} ? &meta_{out_name} : nullptr, "
                     )
+                if self.generate_general_infer_spmd is True:
+                    set_out_dist_attr_code += (
+                        SET_SINGLE_OUT_REPLICATED_DIST_ATTR.format(out_name)
+                    )
         output_args_code = output_args_code[:-2]
 
         return (
@@ -625,6 +704,7 @@ def generate_infer_global_shape_code(self) -> str:
             + INFER_GLOBAL_SHAPE_TEMPLATE.format(
                 infer_meta_func_code, input_args_code, output_args_code
             )
+            + set_out_dist_attr_code
         )
 
     def generate_kernel_selection_code(self) -> str:
@@ -634,7 +714,7 @@ def generate_kernel_selection_code(self) -> str:
 
     def generate_reshard_input_code(self) -> str:
         input_reshard_code = ""
-        if self.infer_meta['spmd_rule'] is not None:
+        if self.generate_infer_spmd is True:
             input_names = self.inputs['names']
 
             kernel_params = (
@@ -642,14 +722,22 @@ def generate_reshard_input_code(self) -> str:
                 if self.kernel['param'] is not None
                 else input_names
             )
+
             for i, param in enumerate(kernel_params):
                 if param in input_names:
                     if self.inputs['input_info'][param] == "const Tensor&":
-                        input_reshard_code += (
-                            SINGLE_INPUT_RESHARD_TEMPLATE.format(
-                                arg=param, idx=i
+                        if self.generate_general_infer_spmd is True:
+                            input_reshard_code += (
+                                SINGLE_GENERAL_INPUT_RESHARD_TEMPLATE.format(
+                                    arg=param, idx=i
+                                )
+                            )
+                        else:
+                            input_reshard_code += (
+                                SINGLE_INPUT_RESHARD_TEMPLATE.format(
+                                    arg=param, idx=i
+                                )
                             )
-                        )
                     else:
                         raise ValueError(
                             f"{self.api} : Param of reshard input error : {self.inputs['input_info'][param]} type is not supported."
@@ -658,8 +746,10 @@ def generate_reshard_input_code(self) -> str:
                     # do nothing
                     pass
         else:
-            # do nothingd
-            pass
+            input_reshard_code = (
+                UNSUPPORTED_RESHARD_INPUT_COMMENT_TEMPLATE.format(self.api)
+            )
+
         return input_reshard_code
 
     def generate_single_dense_input(
@@ -674,7 +764,7 @@ def generate_single_dense_input(
         if kernel_param is None:
             kernel_param = input_names + attr_names
 
-        if self.infer_meta['spmd_rule'] is not None:
+        if self.generate_infer_spmd is True:
             input_tensor_code += SINGLE_PREPARE_DATA_TEMPLATE.format(
                 arg=input_name,
                 idx=kernel_param.index(input_name),
@@ -802,7 +892,6 @@ def generate_prepare_data_code(self) -> str:
     def generate_infer_meta_code(self) -> str:
         input_names = self.inputs['names']
         attr_names = self.attrs['names']
-        output_names = self.outputs['names']
 
         # 1. get infer meta func name
         infer_meta = self.infer_meta
@@ -989,8 +1078,9 @@ def generate_reshard_partial_out_to_replicated_code(self) -> str:
                     )
                 )
         else:
-            # do nothing
-            pass
+            reshard_p2r_code = (
+                UNSUPPORTED_RESHARD_OUTPUT_COMMENT_TEMPLATE.format(self.api)
+            )
 
         return reshard_p2r_code
 
diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py
index 656efba0bc387..e5c89a2ad54e4 100644
--- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py
+++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py
@@ -27,7 +27,7 @@
   if ({}) {{
     // 1. InferSpmd (Infer DistAttr of Inputs&Outputs){}
     // 2. Create Temporary Output & Prepare Dist and Dense Output{}
-    // 3. Infer DistTensor's Global Shape{}
+    // 3. Infer DistTensor's Global Shape{}\n
     // 4. Select Kernel{}
     // 5. Reshard Input{}\n
     // 6. PrepareData (DataTransform & Prepare Dense Input){}
@@ -44,9 +44,15 @@
     auto dist_out = SetKernelDistOutput({});
     auto dense_out = dist_out->unsafe_mutable_value();
 """
+SINGLE_OUT_CREATION_TEMPLATE_WITH_SPMD = """
+    std::shared_ptr<phi::distributed::DistTensor> shared_dist_out =
+        CreateKernelDistOutput({}, spmd_info.second[0]);
+    phi::distributed::DistTensor* dist_out = shared_dist_out.get();
+    phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value();
+"""
 SINGLE_OUT_CREATION_TEMPLATE = """
     std::shared_ptr<phi::distributed::DistTensor> shared_dist_out =
-        CreateKernelDistOutput(spmd_info.second[0]);
+        CreateKernelDistOutput({});
     phi::distributed::DistTensor* dist_out = shared_dist_out.get();
     phi::DenseTensor* dense_out = dist_out->unsafe_mutable_value();
 """
@@ -71,17 +77,17 @@
     auto dist_out_{idx} = SetKernelDistOutput({name});
     auto dense_out_{idx} = dist_out_{idx}->unsafe_mutable_value();
 """
-MULTI_SINGLE_OUT_CREATION_TEMPLATE = """
+MULTI_SINGLE_OUT_CREATION_TEMPLATE_WITH_SPMD = """
     std::shared_ptr<phi::distributed::DistTensor> shared_dist_out_{idx} =
-        CreateKernelDistOutput(spmd_info.second[{idx}]);
+        CreateKernelDistOutput({name}, spmd_info.second[{idx}]);
     phi::distributed::DistTensor* dist_out_{idx} = shared_dist_out_{idx}.get();
-    phi::DenseTensor* dense_out_{idx} = dist_out_{idx}->unsafe_mutable_value();
+    phi::DenseTensor* dense_out_{idx} = dist_out_{idx} ? dist_out_{idx}->unsafe_mutable_value() : nullptr;
 """
-
-# 4. PrepareData (DataTransform & Prepare Dist and Dense Input)
-SINGLE_PREPARE_DATA_TEMPLATE = """
-    auto dist_input_{arg} = PrepareDataForDistTensor({arg}, GetKernelInputArgDef(kernel.InputAt({idx}), kernel_backend), {flag}, kernel_result.is_stride_kernel);
-    auto input_{arg} = &dist_input_{}->value();
+MULTI_SINGLE_OUT_CREATION_TEMPLATE = """
+    std::shared_ptr<phi::distributed::DistTensor> shared_dist_out_{idx} =
+        CreateKernelDistOutput({name});
+    phi::distributed::DistTensor* dist_out_{idx} = shared_dist_out_{idx}.get();
+    phi::DenseTensor* dense_out_{idx} = dist_out_{idx} ? dist_out_{idx}->unsafe_mutable_value() : nullptr;
 """
 MULTI_VECTOR_OUT_CREATION_TEMPLATE = """
     auto dist_out_{i} = SetKernelDistOutput({name});
@@ -113,6 +119,12 @@ def generate_output_creation_code(self) -> str:
             self.dense_output_args.append('dense_out')
             if self.outputs['types'][0] == 'Tensor':
                 if self.infer_meta['spmd_rule'] is not None:
+                    output_creation_code += (
+                        SINGLE_OUT_CREATION_TEMPLATE_WITH_SPMD.format(
+                            self.outputs['names'][0]
+                        )
+                    )
+                elif self.generate_general_infer_spmd is True:
                     output_creation_code += SINGLE_OUT_CREATION_TEMPLATE.format(
                         self.outputs['names'][0]
                     )
@@ -134,6 +146,12 @@ def generate_output_creation_code(self) -> str:
                 self.dense_output_args.append(f'dense_out_{i}')
                 if out_type == 'Tensor':
                     if self.infer_meta['spmd_rule'] is not None:
+                        output_creation_code += (
+                            MULTI_SINGLE_OUT_CREATION_TEMPLATE_WITH_SPMD.format(
+                                name=self.outputs['names'][i], idx=i
+                            )
+                        )
+                    elif self.generate_general_infer_spmd is True:
                         output_creation_code += (
                             MULTI_SINGLE_OUT_CREATION_TEMPLATE.format(
                                 name=self.outputs['names'][i], idx=i
@@ -208,7 +226,7 @@ def gene_api_declaration(self) -> str:
 
     def generate_reshard_output_code(self):
         reshard_output_code = ""
-        if self.infer_meta['spmd_rule'] is not None:
+        if self.generate_infer_spmd is True:
             output_num = len(self.outputs['types'])
             if output_num == 1:
                 if self.outputs['types'][0] == 'Tensor':
diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
index b9103a00c9d02..6edc0bf188ee5 100644
--- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
+++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
@@ -35,7 +35,7 @@ inline void check_defined(const DistTensor& dist_tensor,
 DistTensor::DistTensor(const phi::DenseTensor& global_value,
                        const TensorDistAttr& dist_attr)
     : dims_(global_value.dims()), dist_attr_(dist_attr), value_(global_value) {
-  if (!dist_attr.is_replicated()) {
+  if (value_.initialized() && !dist_attr.is_replicated()) {
     // 1. create replicated global tensor
     int64_t dims_size = global_value.dims().size();
     std::vector<int64_t> dims_mapping(dims_size, -1);
@@ -63,6 +63,14 @@ void DistTensor::unsafe_set_dims(const DDim& dims) {
   dims_ = dims;
 }
 
+void DistTensor::unsafe_set_dist_attr(const TensorDistAttr& dist_attr) {
+  if (this->initialized()) {
+    VLOG(3) << "You try to set an initialized DistTensor's dist attr. "
+               "Make sure you are aware of where you change its dist attr.";
+  }
+  dist_attr_ = dist_attr;
+}
+
 int64_t DistTensor::numel() const {
   check_defined(*this, "numel");
   return value_.numel();
diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h
index 1289a23b1be8c..c965733a7e0e8 100644
--- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h
+++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h
@@ -62,6 +62,10 @@ class DistTensor final
   /// \return The TensorDistAttr's const reference
   const TensorDistAttr& dist_attr() const { return dist_attr_; }
 
+  /// \brief Set the dist attr of current dist tensor.
+  /// \return void
+  void unsafe_set_dist_attr(const TensorDistAttr& dist_attr);
+
   /// \brief Returns the dense tensor value's const reference in dist tensor.
   /// \return The DenseTensor value's const reference
   const DenseTensor& value() const { return value_; }
diff --git a/paddle/phi/core/distributed/auto_parallel/p_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/p_to_r_reshard_function.cc
index 9478077685c63..f9aaa6f8adf7f 100644
--- a/paddle/phi/core/distributed/auto_parallel/p_to_r_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/p_to_r_reshard_function.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/phi/core/distributed/auto_parallel/p_to_r_reshard_function.h"
 
+#include "glog/logging.h"
+
 #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
@@ -43,6 +45,7 @@ void PToRReshardFunction::Eval(DeviceContext* dev_ctx,
                                const DistTensor& in,
                                const TensorDistAttr& out_dist_attr,
                                DistTensor* out) {
+  VLOG(3) << "Call PToRReshardFunction Eval";
   const auto& in_dist_attr = in.dist_attr();
   const auto& in_process_mesh = in_dist_attr.process_mesh();
   const auto& in_process_ids = in_process_mesh.process_ids();
diff --git a/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
index bd2cb4c58a46c..77569c1ecfbac 100644
--- a/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.h"
 
+#include "glog/logging.h"
+
 #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
@@ -45,6 +47,7 @@ void RToPReshardFunction::Eval(phi::DeviceContext* dev_ctx,
                                const DistTensor& in,
                                const TensorDistAttr& out_dist_attr,
                                DistTensor* out) {
+  VLOG(3) << "Call RToPReshardFunction Eval";
   const auto& out_process_mesh = out_dist_attr.process_mesh();
   int64_t local_rank = GetCurRankCoordInMesh(out_process_mesh)[0];
   IntArray shape(in.dims().Get(), in.dims().size());
diff --git a/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc
index 381f77991f72d..bc6cb393a15b8 100644
--- a/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc
@@ -45,6 +45,7 @@ void RToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
                                const DistTensor& in,
                                const TensorDistAttr& out_dist_attr,
                                DistTensor* out) {
+  VLOG(3) << "Call RToSReshardFunction Eval";
   const auto& out_dims_mapping = out_dist_attr.dims_mapping();
   const auto& out_process_mesh = out_dist_attr.process_mesh();
   const DenseTensor& in_physical_tensor_cur_rank = in.value();
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
index b947c70bb5bc9..831a4c6e0d2af 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
@@ -59,12 +59,14 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
 #define RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, ...)            \
   do {                                                                \
     if (phi::CPUContext::classof(dev_ctx)) {                          \
+      VLOG(4) << "Call `" << #fn_name << "` in Resharding on GPU.";   \
       PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(                           \
           dtype, #fn_name, ([&] {                                     \
             fn_name<data_t>(static_cast<const CPUContext&>(*dev_ctx), \
                             __VA_ARGS__);                             \
           }));                                                        \
     } else if (phi::GPUContext::classof(dev_ctx)) {                   \
+      VLOG(4) << "Call `" << #fn_name << "` in Resharding on CPU.";   \
       PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(                           \
           dtype, #fn_name, ([&] {                                     \
             fn_name<data_t>(static_cast<const GPUContext&>(*dev_ctx), \
@@ -80,6 +82,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
 #define RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, ...)                \
   do {                                                                    \
     if (phi::CPUContext::classof(dev_ctx)) {                              \
+      VLOG(4) << "Call `" << #fn_name << "` in Resharding on CPU.";       \
       PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(                               \
           dtype, #fn_name, ([&] {                                         \
             fn_name<data_t>(static_cast<const CPUContext&>(*dev_ctx),     \
@@ -108,8 +111,12 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
 #define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...)          \
   do {                                                                \
     if (phi::CPUContext::classof(dev_ctx)) {                          \
+      VLOG(4) << "Call `" << #fn_name                                 \
+              << "`without DType in Resharding on CPU.";              \
       fn_name(static_cast<const CPUContext&>(*dev_ctx), __VA_ARGS__); \
     } else if (phi::GPUContext::classof(dev_ctx)) {                   \
+      VLOG(4) << "Call `" << #fn_name                                 \
+              << "`without DType in Resharding on GPU.";              \
       fn_name(static_cast<const GPUContext&>(*dev_ctx), __VA_ARGS__); \
     } else {                                                          \
       PADDLE_THROW(phi::errors::Unimplemented(                        \
@@ -121,6 +128,8 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
 #define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...)              \
   do {                                                                    \
     if (phi::CPUContext::classof(dev_ctx)) {                              \
+      VLOG(4) << "Call `" << #fn_name                                     \
+              << "`without DType in Resharding on CPU.";                  \
       fn_name(static_cast<const CPUContext&>(*dev_ctx), __VA_ARGS__);     \
     } else {                                                              \
       PADDLE_THROW(phi::errors::Unimplemented(                            \
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc
index efa5035c495ed..db8a26088ae45 100644
--- a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc
@@ -56,6 +56,7 @@ void SToRReshardFunction::Eval(DeviceContext* dev_ctx,
                                const DistTensor& in,
                                const TensorDistAttr& out_dist_attr,
                                DistTensor* out) {
+  VLOG(3) << "Call SToRReshardFunction Eval";
   const auto& in_dist_attr = in.dist_attr();
   const auto& in_dims_mapping = in_dist_attr.dims_mapping();
   const auto& in_process_mesh = in_dist_attr.process_mesh();
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
index 45ec290973446..29aa1256e0193 100644
--- a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h"
 
+#include "glog/logging.h"
+
 #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
@@ -46,6 +48,7 @@ void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
                                const DistTensor& in,
                                const TensorDistAttr& out_dist_attr,
                                DistTensor* out) {
+  VLOG(3) << "Call SToSReshardFunction Eval";
   const auto& in_process_mesh = in.dist_attr().process_mesh();
   const auto& in_process_ids = in_process_mesh.process_ids();
   auto dtype = in.dtype();
diff --git a/paddle/phi/core/distributed/store/tcp_store.cc b/paddle/phi/core/distributed/store/tcp_store.cc
index 1c18a8b12caf1..9650d051f98fb 100644
--- a/paddle/phi/core/distributed/store/tcp_store.cc
+++ b/paddle/phi/core/distributed/store/tcp_store.cc
@@ -33,7 +33,7 @@ constexpr int INFTIME = 10000;  // 10 seconds
 std::unique_ptr<MasterDaemon> MasterDaemon::start(SocketType socket,
                                                   int nranks,
                                                   int timeout) {
-  VLOG(4) << ("begin to run start");
+  VLOG(8) << ("begin to run start");
   return std::make_unique<MasterDaemon>(socket, nranks, timeout);
 }
 
@@ -44,7 +44,7 @@ MasterDaemon::MasterDaemon(SocketType socket, int nranks, int timeout)
 }
 
 MasterDaemon::~MasterDaemon() {  // NOLINT
-  VLOG(4) << ("begin to destruct MasterDaemon");
+  VLOG(8) << ("begin to destruct MasterDaemon");
   StopByControlFd();
   _background_thread.join();
   tcputils::close_socket(_listen_socket);
@@ -70,7 +70,7 @@ void MasterDaemon::_do_add(SocketType socket) {
   std::string new_value_str = std::to_string(new_value);
   _store[key] =
       std::vector<uint8_t>(new_value_str.begin(), new_value_str.end());
-  VLOG(4) << "TCPStore: new value (" << new_value << ") for key (" << key
+  VLOG(8) << "TCPStore: new value (" << new_value << ") for key (" << key
           << ") " << GetSockName(socket);
   tcputils::send_value<int64_t>(socket, new_value);
   _notify_waiting_sockets(key);
@@ -78,7 +78,7 @@ void MasterDaemon::_do_add(SocketType socket) {
 
 void MasterDaemon::_do_set(SocketType socket) {
   std::string key = tcputils::receive_string(socket);
-  VLOG(4) << "MasterDaemon::_do_set key(" << key << ") " << GetSockName(socket);
+  VLOG(8) << "MasterDaemon::_do_set key(" << key << ") " << GetSockName(socket);
 
   auto value = tcputils::receive_vector<uint8_t>(socket);
   _store[key] = value;
@@ -89,7 +89,7 @@ void MasterDaemon::_notify_waiting_sockets(const std::string& key) {
   if (_waiting_sockets.find(key) != _waiting_sockets.end()) {
     for (auto waiting_socket : _waiting_sockets.at(key)) {
       auto reply = ReplyType::STOP_WAIT;
-      VLOG(3) << "TCPStore: notify the socket: " << GetSockName(waiting_socket)
+      VLOG(7) << "TCPStore: notify the socket: " << GetSockName(waiting_socket)
               << " that key: " << key << " is ready.";
       tcputils::send_value<ReplyType>(waiting_socket, reply);
     }
@@ -99,7 +99,7 @@ void MasterDaemon::_notify_waiting_sockets(const std::string& key) {
 
 void MasterDaemon::_do_get(SocketType socket) {
   std::string key = tcputils::receive_string(socket);
-  VLOG(4) << "MasterDaemon::_do_get key(" << key << ") " << GetSockName(socket);
+  VLOG(8) << "MasterDaemon::_do_get key(" << key << ") " << GetSockName(socket);
 
   auto iter = _store.find(key);
   PADDLE_ENFORCE_NE(
@@ -125,7 +125,7 @@ void MasterDaemon::CloseControlFd() {
   }
 }
 void MasterDaemon::StopByControlFd() {
-  VLOG(4) << ("begin to run StopByControlFd");
+  VLOG(8) << ("begin to run StopByControlFd");
   if (_control_fd[1] != -1) {
     PADDLE_ENFORCE_NE(
         ::write(_control_fd[1], "\0", 1),
@@ -149,7 +149,7 @@ void MasterDaemon::StopByControlFd() { SetEvent(ghStopEvent_); }
 
 void MasterDaemon::_do_wait(SocketType socket) {
   std::string key = tcputils::receive_string(socket);
-  VLOG(4) << "MasterDaemon::_do_wait key(" << key << ") "
+  VLOG(8) << "MasterDaemon::_do_wait key(" << key << ") "
           << GetSockName(socket);
 
   auto iter = _store.find(key);
@@ -158,7 +158,7 @@ void MasterDaemon::_do_wait(SocketType socket) {
     _waiting_sockets[key].emplace_back(socket);
   } else {
     auto reply = ReplyType::STOP_WAIT;
-    VLOG(3) << "TCPStore: wait reply (" << static_cast<int>(reply)
+    VLOG(7) << "TCPStore: wait reply (" << static_cast<int>(reply)
             << ") for key (" << key << ").";
     tcputils::send_value<ReplyType>(socket, reply);
   }
@@ -179,9 +179,9 @@ void MasterDaemon::ProcessCommands(std::vector<struct pollfd>* p_fds) {
         continue;
       }
 
-      VLOG(4) << "Plan to receive command from " << GetSockName(fds[i].fd);
+      VLOG(8) << "Plan to receive command from " << GetSockName(fds[i].fd);
       Command command = tcputils::receive_value<Command>(fds[i].fd);
-      VLOG(3) << "TCPStore: recv command: " << static_cast<int>(command) << ".";
+      VLOG(7) << "TCPStore: recv command: " << static_cast<int>(command) << ".";
 
       switch (command) {
         case Command::ADD:
@@ -197,7 +197,7 @@ void MasterDaemon::ProcessCommands(std::vector<struct pollfd>* p_fds) {
           _do_wait(fds[i].fd);
           break;
         default:
-          VLOG(4) << "Unknown command: " << static_cast<int>(command)
+          VLOG(8) << "Unknown command: " << static_cast<int>(command)
                   << " from addr info:" << GetSockName(fds[i].fd);
       }
     } catch (const std::exception& ex) {
@@ -352,7 +352,7 @@ TCPStore::TCPStore(std::string host,
   PADDLE_ENFORCE_GT(
       timeout, 0, phi::errors::InvalidArgument("timeout must >= %d", timeout));
 
-  VLOG(3) << "input timeout" << timeout << ", member timeout:" << _timeout;
+  VLOG(7) << "input timeout" << timeout << ", member timeout:" << _timeout;
   if (_is_master) {
     _server = detail::TCPServer::create(port, this->_num_workers, timeout);
   }
@@ -368,12 +368,12 @@ void TCPStore::waitWorkers() {
   add(_init_key, 1);
 
   if (_is_master) {
-    VLOG(3) << paddle::string::Sprintf("_timeout:%d", _timeout);
+    VLOG(7) << paddle::string::Sprintf("_timeout:%d", _timeout);
     auto begin = std::chrono::steady_clock::now();
     do {
       auto value = get(_init_key);
       int completed = std::stoi(std::string(value.begin(), value.end()));
-      VLOG(3) << completed << " worker ready, total " << _num_workers
+      VLOG(7) << completed << " worker ready, total " << _num_workers
               << ", _timeout:" << _timeout;
       if (completed >= _num_workers) {
         break;
@@ -397,18 +397,18 @@ void TCPStore::waitWorkers() {
       }
     } while (true);
   }
-  VLOG(3) << "TCPStore initialized.";
+  VLOG(7) << "TCPStore initialized.";
 }
 
 int64_t TCPStore::add(const std::string& key, int64_t value) {
-  VLOG(3) << "TCPStore add.";
+  VLOG(7) << "TCPStore add.";
   _client->send_command_for_key(Command::ADD, _key_prefix + key);
   _client->send_value<std::int64_t>(value);
   return _client->receive_value<std::int64_t>();
 }
 
 void TCPStore::set(const std::string& key, const std::vector<uint8_t>& value) {
-  VLOG(3) << "TCPStore set.";
+  VLOG(7) << "TCPStore set.";
   _client->send_command_for_key(Command::SET, _key_prefix + key);
   _client->send_vector<uint8_t>(value);
 }
@@ -416,13 +416,13 @@ void TCPStore::set(const std::string& key, const std::vector<uint8_t>& value) {
 std::vector<uint8_t> TCPStore::get(const std::string& key) {
   wait(key);
   _client->send_command_for_key(Command::GET, _key_prefix + key);
-  VLOG(3) << "TCPStore get.";
+  VLOG(7) << "TCPStore get.";
   return _client->receive_vector<uint8_t>();
 }
 
 void TCPStore::wait(const std::string& key) {
   ReplyType reply;
-  VLOG(3) << "TCPStore wait.";
+  VLOG(7) << "TCPStore wait.";
   _client->send_command_for_key(Command::WAIT, _key_prefix + key);
   reply = _client->receive_value<ReplyType>();
   PADDLE_ENFORCE_EQ(
@@ -431,7 +431,7 @@ void TCPStore::wait(const std::string& key) {
       phi::errors::InvalidArgument("Stop_waiting response is expected"));
 }
 
-TCPStore::~TCPStore() { VLOG(3) << "TCPStore destructure"; }
+TCPStore::~TCPStore() { VLOG(7) << "TCPStore destructure"; }
 
 }  // namespace distributed
 }  // namespace phi
diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt
index 873f9f057e9ab..48df714387854 100644
--- a/test/auto_parallel/CMakeLists.txt
+++ b/test/auto_parallel/CMakeLists.txt
@@ -109,7 +109,11 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_semi_auto_parallel_basic MODULES
                   test_semi_auto_parallel_basic)
   set_tests_properties(test_semi_auto_parallel_basic
-                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
+  py_test_modules(test_semi_auto_parallel_single_strategy MODULES
+                  test_semi_auto_parallel_single_strategy)
+  set_tests_properties(test_semi_auto_parallel_single_strategy
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120)
   py_test_modules(test_gpt_with_newir MODULES test_gpt_with_newir)
   set_tests_properties(test_gpt_with_newir
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
diff --git a/test/auto_parallel/semi_auto_parallel_for_matmul.py b/test/auto_parallel/semi_auto_parallel_for_matmul.py
index 9deed0b90cd71..bba31234ed80b 100644
--- a/test/auto_parallel/semi_auto_parallel_for_matmul.py
+++ b/test/auto_parallel/semi_auto_parallel_for_matmul.py
@@ -23,36 +23,45 @@
 class TestMatmulApiForSemiAutoParallel:
     def __init__(self):
         self._dtype = os.getenv("dtype")
-        self._seeds = eval(os.getenv("seeds"))
         self._backend = os.getenv("backend")
         self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
 
+        paddle.seed(2023)
+        np.random.seed(2023)
+
+    def check_tensor_eq(self, a, b):
+        np1 = a.numpy()
+        np2 = b.numpy()
+        np.testing.assert_allclose(np1, np2, rtol=1e-05, verbose=True)
+
     def test_body(
         self, x_shape, y_shape, x_specs, y_specs, trans_x=False, trans_y=False
     ):
-        x = paddle.randn(x_shape, self._dtype)
-        y = paddle.randn(y_shape, self._dtype)
+        x_np = np.random.random(size=x_shape).astype(self._dtype)
+        y_np = np.random.random(size=y_shape).astype(self._dtype)
+        x = paddle.to_tensor(x_np)
+        y = paddle.to_tensor(y_np)
         x.stop_gradient = False
         y.stop_gradient = False
 
         x_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=x_specs)
         y_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=y_specs)
 
-        dist_x = dist.shard_tensor(x, dist_attr=x_dist_attr)
-        dist_y = dist.shard_tensor(y, dist_attr=y_dist_attr)
+        dist_x = dist.shard_tensor(x_np, dist_attr=x_dist_attr)
+        dist_y = dist.shard_tensor(y_np, dist_attr=y_dist_attr)
         dist_x.stop_gradient = False
         dist_y.stop_gradient = False
 
+        out = paddle.matmul(x, y, transpose_x=trans_x, transpose_y=trans_y)
         dist_out = paddle.matmul(
             dist_x, dist_y, transpose_x=trans_x, transpose_y=trans_y
         )
-        # verify global shape
-        out_shape = [64, 48]
-        np.testing.assert_equal(dist_out.shape, out_shape, verbose=True)
+        self.check_tensor_eq(out, dist_out)
 
+        out.backward()
         dist_out.backward()
-        np.testing.assert_equal(dist_x.grad.shape, x_shape, verbose=True)
-        np.testing.assert_equal(dist_y.grad.shape, y_shape, verbose=True)
+        self.check_tensor_eq(x.grad, dist_x.grad)
+        self.check_tensor_eq(y.grad, dist_y.grad)
 
         return dist_out, dist_x.grad, dist_y.grad
 
diff --git a/test/auto_parallel/semi_auto_parallel_for_replicated_spmd.py b/test/auto_parallel/semi_auto_parallel_for_replicated_spmd.py
new file mode 100644
index 0000000000000..87a171091c961
--- /dev/null
+++ b/test/auto_parallel/semi_auto_parallel_for_replicated_spmd.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+
+import paddle
+import paddle.distributed as dist
+import paddle.nn.functional as F
+
+
+class TestReplicatedSPmdApiForSemiAutoParallel:
+    def __init__(self):
+        self._dtype = os.getenv("dtype")
+        self._backend = os.getenv("backend")
+        self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+
+        paddle.seed(2023)
+        np.random.seed(2023)
+
+    def check_tensor_eq(self, a, b):
+        np1 = a.numpy()
+        np2 = b.numpy()
+        np.testing.assert_allclose(np1, np2, rtol=1e-05, verbose=True)
+
+    def create_local_and_dist_tensor_pair(self, np_array, sharding_specs):
+        local_t = paddle.to_tensor(np_array, dtype=np_array.dtype)
+
+        dist_attr = dist.DistAttr(
+            mesh=self._mesh, sharding_specs=sharding_specs
+        )
+        dist_t = dist.shard_tensor(np_array, dist_attr=dist_attr)
+
+        local_t.stop_gradient = False
+        dist_t.stop_gradient = False
+
+        return local_t, dist_t
+
+    # input: phi::Tensor
+    # output: phi::Tensor
+    def test_relu(self):
+        x = np.random.random(size=[4, 4]).astype(self._dtype)
+        local_in, dist_in = self.create_local_and_dist_tensor_pair(
+            x, ['x', None]
+        )
+        local_out = F.relu(local_in)
+        dist_out = F.relu(dist_in)
+        np.testing.assert_equal(
+            dist_out.dist_attr.dims_mapping, [-1, -1], verbose=True
+        )
+        self.check_tensor_eq(local_out, dist_out)
+
+        # test backward
+        local_out.backward()
+        dist_out.backward()
+        np.testing.assert_equal(dist_in.grad._local_shape, [2, 4], verbose=True)
+        np.testing.assert_equal(
+            dist_in.grad.dist_attr.dims_mapping, [0, -1], verbose=True
+        )
+        self.check_tensor_eq(local_in.grad, dist_in.grad)
+
+    def test_mse_loss(self):
+        x = np.random.random(size=[4, 4]).astype(self._dtype)
+        y = np.random.random(size=[4]).astype(self._dtype)
+        local_in, dist_in = self.create_local_and_dist_tensor_pair(
+            x, ['x', None]
+        )
+        local_label, dist_label = self.create_local_and_dist_tensor_pair(
+            y, [None]
+        )
+
+        mes_loss = paddle.nn.loss.MSELoss()
+        local_out = mes_loss(local_in, local_label)
+        dist_out = mes_loss(dist_in, dist_label)
+        self.check_tensor_eq(local_out, dist_out)
+
+        # test backward
+        local_out.backward()
+        dist_out.backward()
+        np.testing.assert_equal(dist_in.grad._local_shape, [2, 4], verbose=True)
+        np.testing.assert_equal(
+            dist_in.grad.dist_attr.dims_mapping, [0, -1], verbose=True
+        )
+        self.check_tensor_eq(local_in.grad, dist_in.grad)
+
+    def run_test_case(self):
+        if self._backend == "cpu":
+            paddle.set_device("cpu")
+        elif self._backend == "gpu":
+            paddle.set_device("gpu:" + str(dist.get_rank()))
+        else:
+            raise ValueError("Only support cpu or gpu backend.")
+
+        self.test_relu()
+        self.test_mse_loss()
+
+
+if __name__ == '__main__':
+    TestReplicatedSPmdApiForSemiAutoParallel().run_test_case()
diff --git a/test/auto_parallel/semi_auto_parallel_simple_net.py b/test/auto_parallel/semi_auto_parallel_simple_net.py
new file mode 100644
index 0000000000000..1e0b1a92859fc
--- /dev/null
+++ b/test/auto_parallel/semi_auto_parallel_simple_net.py
@@ -0,0 +1,210 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+
+import paddle
+import paddle.distributed as dist
+from paddle import nn
+
+BATCH_SIZE = 16
+BATCH_NUM = 4
+IMAGE_SIZE = 784
+CLASS_NUM = 10
+
+
+# TODO(chenweihang): update to MLP Layer later
+class DemoNet(nn.Layer):
+    def __init__(self, np_w0, np_w1):
+        super().__init__()
+        self.w0 = self.create_parameter(
+            shape=[IMAGE_SIZE, IMAGE_SIZE],
+            attr=paddle.framework.ParamAttr(
+                name="demo_weight_1",
+                initializer=paddle.nn.initializer.Assign(np_w0),
+            ),
+        )
+        self.w1 = self.create_parameter(
+            shape=[IMAGE_SIZE, CLASS_NUM],
+            attr=paddle.framework.ParamAttr(
+                name="nemo_weight_2",
+                initializer=paddle.nn.initializer.Assign(np_w1),
+            ),
+        )
+
+    def forward(self, x):
+        y = paddle.matmul(x, self.w0)
+        z = paddle.matmul(y, self.w1)
+        return z
+
+
+class DPDemoNet(nn.Layer):
+    def __init__(self, np_w0, np_w1, mesh):
+        super().__init__()
+        self.replicate_dist_attr = dist.DistAttr(
+            mesh=mesh, sharding_specs=[None, None]
+        )
+        self.shard_axis0_dist_attr = dist.DistAttr(
+            mesh=mesh, sharding_specs=['x', None]
+        )
+        self.w0 = dist.shard_tensor(
+            self.create_parameter(
+                shape=[IMAGE_SIZE, IMAGE_SIZE],
+                attr=paddle.framework.ParamAttr(
+                    name="dp_demo_weight_1",
+                    initializer=paddle.nn.initializer.Assign(np_w0),
+                ),
+            ),
+            dist_attr=self.replicate_dist_attr,
+        )
+        self.w1 = dist.shard_tensor(
+            self.create_parameter(
+                shape=[IMAGE_SIZE, CLASS_NUM],
+                attr=paddle.framework.ParamAttr(
+                    name="dp_nemo_weight_2",
+                    initializer=paddle.nn.initializer.Assign(np_w1),
+                ),
+            ),
+            dist_attr=self.replicate_dist_attr,
+        )
+
+    def forward(self, x):
+        y = paddle.matmul(
+            dist.shard_tensor(x, dist_attr=self.shard_axis0_dist_attr),
+            self.w0,
+        )
+        z = paddle.matmul(y, self.w1)
+        return z
+
+
+class MPDemoNet(nn.Layer):
+    def __init__(self, np_w0, np_w1, mesh):
+        super().__init__()
+        self.replicate_dist_attr = dist.DistAttr(
+            mesh=mesh, sharding_specs=[None, None]
+        )
+        self.shard_axis0_dist_attr = dist.DistAttr(
+            mesh=mesh, sharding_specs=['x', None]
+        )
+        self.shard_axis1_dist_attr = dist.DistAttr(
+            mesh=mesh, sharding_specs=['x', None]
+        )
+        self.w0 = dist.shard_tensor(
+            self.create_parameter(
+                shape=[IMAGE_SIZE, IMAGE_SIZE],
+                attr=paddle.framework.ParamAttr(
+                    name="mp_demo_weight_1",
+                    initializer=paddle.nn.initializer.Assign(np_w0),
+                ),
+            ),
+            dist_attr=self.shard_axis1_dist_attr,
+        )
+        self.w1 = dist.shard_tensor(
+            self.create_parameter(
+                shape=[IMAGE_SIZE, CLASS_NUM],
+                attr=paddle.framework.ParamAttr(
+                    name="mp_nemo_weight_2",
+                    initializer=paddle.nn.initializer.Assign(np_w1),
+                ),
+            ),
+            dist_attr=self.shard_axis0_dist_attr,
+        )
+
+    def forward(self, x):
+        y = paddle.matmul(
+            dist.shard_tensor(x, dist_attr=self.replicate_dist_attr), self.w0
+        )
+        z = paddle.matmul(y, self.w1)
+        return z
+
+
+class TestSimpleNetForSemiAutoParallel:
+    def __init__(self):
+        self._dtype = os.getenv("dtype")
+        self._backend = os.getenv("backend")
+        self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+
+        paddle.set_device(self._backend)
+
+        self.init_input_data()
+
+        self.init_single_card_net_result()
+
+    def init_input_data(self):
+        paddle.seed(2023)
+        np.random.seed(2023)
+
+        self.image = np.random.random([BATCH_SIZE, IMAGE_SIZE]).astype(
+            'float32'
+        )
+        self.label = np.random.random([BATCH_SIZE, CLASS_NUM]).astype('float32')
+        self.w0 = np.random.random([IMAGE_SIZE, IMAGE_SIZE]).astype('float32')
+        self.w1 = np.random.random([IMAGE_SIZE, CLASS_NUM]).astype('float32')
+
+    # TODO(chenweihang): optimizer cannot run auto-parallel now
+    def run_dynamic(self, layer, parallel=False):
+        # create loss
+        loss_fn = nn.MSELoss()
+        # run forward and backward
+        image = paddle.to_tensor(self.image)
+        out = layer(image)
+        label = (
+            dist.shard_tensor(
+                self.label,
+                dist_attr=dist.DistAttr(
+                    mesh=self._mesh, sharding_specs=[None, None]
+                ),
+            )
+            if parallel is True
+            else paddle.to_tensor(self.label)
+        )
+        loss = loss_fn(out, label)
+        loss.backward()
+        return loss, layer.w0.grad, layer.w1.grad
+
+    def init_single_card_net_result(self):
+        self.base_loss, self.base_w0_grad, self.base_w1_grad = self.run_dynamic(
+            DemoNet(self.w0, self.w1)
+        )
+
+    def check_tensor_eq(self, a, b):
+        np1 = a.numpy()
+        np2 = b.numpy()
+        np.testing.assert_allclose(np1, np2, rtol=1e-05, verbose=True)
+
+    def test_dp_demo_net(self):
+        self.dp_loss, self.dp_w0_grad, self.dp_w1_grad = self.run_dynamic(
+            DPDemoNet(self.w0, self.w1, self._mesh), parallel=True
+        )
+        self.check_tensor_eq(self.dp_loss, self.base_loss)
+        self.check_tensor_eq(self.dp_w0_grad, self.base_w0_grad)
+        self.check_tensor_eq(self.dp_w1_grad, self.base_w1_grad)
+
+    def test_mp_demo_net(self):
+        self.mp_loss, self.mp_w0_grad, self.mp_w1_grad = self.run_dynamic(
+            MPDemoNet(self.w0, self.w1, self._mesh), parallel=True
+        )
+        self.check_tensor_eq(self.mp_loss, self.base_loss)
+        self.check_tensor_eq(self.mp_w0_grad, self.base_w0_grad)
+        self.check_tensor_eq(self.mp_w1_grad, self.base_w1_grad)
+
+    def run_test_case(self):
+        self.test_dp_demo_net()
+        self.test_mp_demo_net()
+
+
+if __name__ == '__main__':
+    TestSimpleNetForSemiAutoParallel().run_test_case()
diff --git a/test/auto_parallel/test_api_dist_branch.py b/test/auto_parallel/test_api_dist_branch.py
index c9a56f17b2f0c..970a6199e07d3 100644
--- a/test/auto_parallel/test_api_dist_branch.py
+++ b/test/auto_parallel/test_api_dist_branch.py
@@ -15,13 +15,51 @@
 import unittest
 
 import numpy as np
-from test_dist_tensor import TestDistTensorForDygraphAPI
 
 import paddle
+import paddle.distributed as dist
 
 
 # For API generation which have different type of DistTensor Input and Output
-class TestAPIForDistBranch(TestDistTensorForDygraphAPI):
+class TestDygraphAPIForDistTensorBranch(unittest.TestCase):
+    def check_tensor_eq(self, a, b):
+        np1 = a.numpy()
+        np2 = b.numpy()
+        np.testing.assert_allclose(np1, np2, rtol=1e-05)
+
+    def create_local_and_dist_tensor_pair(self, np_array):
+        if np_array.dtype == np.float32:
+            local_t = paddle.to_tensor(np_array, dtype='float32')
+        elif np_array.dtype == np.float16:
+            local_t = paddle.to_tensor(np_array, dtype='float16')
+        elif np_array.dtype == np.int32:
+            local_t = paddle.to_tensor(np_array, dtype='int32')
+        elif np_array.dtype == np.bool_:
+            local_t = paddle.to_tensor(np_array, dtype='bool')
+
+        mesh = dist.ProcessMesh([0], dim_names=["x"])
+        dist_attr = dist.DistAttr(
+            mesh=mesh, sharding_specs=[None] * np_array.ndim
+        )
+        dist_t = dist.shard_tensor(np_array, dist_attr=dist_attr)
+
+        local_t.stop_gradient = False
+        dist_t.stop_gradient = False
+
+        return local_t, dist_t
+
+    def create_local_and_dist_tensor_list_pair(self, np_array_list):
+        assert isinstance(
+            np_array_list, list
+        ), "input should be list of np_array!"
+        local_t_list = []
+        dist_t_list = []
+        for np_array in np_array_list:
+            local_t, dist_t = self.create_local_and_dist_tensor_pair(np_array)
+            local_t_list.append(local_t)
+            dist_t_list.append(dist_t)
+        return local_t_list, dist_t_list
+
     # input: std::vector<phi::Tensor>
     # output: phi::Tensor
     def test_concat_for_dist_tensor(self):
@@ -55,8 +93,8 @@ def test_broadcast_tensors_for_dist_tensor(self):
         self.check_tensor_eq(local_out1, dist_out1)
         self.check_tensor_eq(local_out2, dist_out2)
 
-        local_out = local_out1 + local_out2
-        dist_out = dist_out1 + dist_out2
+        local_out = paddle.concat([local_out1, local_out2])
+        dist_out = paddle.concat([dist_out1, dist_out2])
 
         local_out.backward()
         dist_out.backward()
@@ -73,8 +111,8 @@ def test_unbind_for_dist_tensor(self):
         self.check_tensor_eq(local_out1, dist_out1)
         self.check_tensor_eq(local_out2, dist_out2)
 
-        local_out = local_out1 + local_out2
-        dist_out = dist_out1 + dist_out2
+        local_out = paddle.concat([local_out1, local_out2])
+        dist_out = paddle.concat([dist_out1, dist_out2])
 
         local_out.backward()
         dist_out.backward()
@@ -90,9 +128,13 @@ def test_expand_as_for_dist_tensor(self):
         local_out = paddle.expand_as(local_in1, local_in2)
         dist_out = paddle.expand_as(dist_in1, dist_in2)
         self.check_tensor_eq(local_out, dist_out)
-        local_out.backward()
-        dist_out.backward()
-        self.check_tensor_eq(local_in1.grad, dist_in1.grad)
+
+        # TODO(chenweihang): expand_as is a special case, the forward contains
+        # optional input, but backward not, open this case after dist support
+        # optional input
+        # local_out.backward()
+        # dist_out.backward()
+        # self.check_tensor_eq(local_in1.grad, dist_in1.grad)
 
     # input: paddle::optional<phi::Tensor>
     # output: phi::Tensor
diff --git a/test/auto_parallel/test_dist_tensor.py b/test/auto_parallel/test_dist_tensor.py
index f5fd7e335d598..b631d5ecba6e8 100644
--- a/test/auto_parallel/test_dist_tensor.py
+++ b/test/auto_parallel/test_dist_tensor.py
@@ -18,7 +18,6 @@
 
 import paddle
 import paddle.distributed as dist
-import paddle.nn.functional as F
 
 
 class TestDistTensor(unittest.TestCase):
@@ -120,74 +119,5 @@ def test_static_mode(self):
         paddle.disable_static()
 
 
-class TestDistTensorForDygraphAPI(unittest.TestCase):
-    def check_tensor_eq(self, a, b):
-        np1 = a.numpy()
-        np2 = b.numpy()
-        np.testing.assert_allclose(np1, np2, rtol=1e-05)
-
-    def create_local_and_dist_tensor_pair(self, np_array):
-        if np_array.dtype == np.float32:
-            local_t = paddle.to_tensor(np_array, dtype='float32')
-        elif np_array.dtype == np.float16:
-            local_t = paddle.to_tensor(np_array, dtype='float16')
-        elif np_array.dtype == np.int32:
-            local_t = paddle.to_tensor(np_array, dtype='int32')
-        elif np_array.dtype == np.bool_:
-            local_t = paddle.to_tensor(np_array, dtype='bool')
-
-        mesh = dist.ProcessMesh([0], dim_names=["x"])
-        dist_attr = dist.DistAttr(
-            mesh=mesh, sharding_specs=[None] * np_array.ndim
-        )
-        dist_t = dist.shard_tensor(np_array, dist_attr=dist_attr)
-
-        local_t.stop_gradient = False
-        dist_t.stop_gradient = False
-
-        return local_t, dist_t
-
-    def create_local_and_dist_tensor_list_pair(self, np_array_list):
-        assert isinstance(
-            np_array_list, list
-        ), "input should be list of np_array!"
-        local_t_list = []
-        dist_t_list = []
-        for np_array in np_array_list:
-            local_t, dist_t = self.create_local_and_dist_tensor_pair(np_array)
-            local_t_list.append(local_t)
-            dist_t_list.append(dist_t)
-        return local_t_list, dist_t_list
-
-    # input: phi::Tensor
-    # output: phi::Tensor
-    def test_relu_api_for_dist_tensor(self):
-        x = np.random.random(size=[4, 4]).astype("float32")
-        local_in, dist_in = self.create_local_and_dist_tensor_pair(x)
-        local_out = F.relu(local_in)
-        dist_out = F.relu(dist_in)
-        self.check_tensor_eq(local_out, dist_out)
-
-        # test backward
-        local_out.backward()
-        dist_out.backward()
-        self.check_tensor_eq(local_in.grad, dist_in.grad)
-
-    def test_matmul_api_for_dist_tensor(self):
-        x = np.random.random(size=[4, 4]).astype("float32")
-        y = np.random.random(size=[4, 4]).astype("float32")
-        local_x, dist_x = self.create_local_and_dist_tensor_pair(x)
-        local_y, dist_y = self.create_local_and_dist_tensor_pair(y)
-        local_out = paddle.matmul(local_x, local_y)
-        dist_out = paddle.matmul(dist_x, dist_y)
-        self.check_tensor_eq(local_out, dist_out)
-
-        # test backward
-        local_out.backward()
-        dist_out.backward()
-        self.check_tensor_eq(local_x.grad, dist_x.grad)
-        self.check_tensor_eq(local_y.grad, dist_y.grad)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/auto_parallel/test_semi_auto_parallel_basic.py b/test/auto_parallel/test_semi_auto_parallel_basic.py
index a1ec1b18e9b33..3fe98e4d08744 100644
--- a/test/auto_parallel/test_semi_auto_parallel_basic.py
+++ b/test/auto_parallel/test_semi_auto_parallel_basic.py
@@ -17,13 +17,13 @@
 import collective.test_communication_api_base as test_base
 
 
-class TestSemiAutoParallelMatmul(test_base.CommunicationTestDistBase):
+class TestSemiAutoParallelBasic(test_base.CommunicationTestDistBase):
     def setUp(self):
-        super().setUp(num_of_devices=2, timeout=120)
-        self._default_envs = {
-            "dtype": "float32",
-            "seeds": str(self._seeds),
-        }
+        super().setUp(
+            num_of_devices=2,
+            timeout=120,
+        )
+        self._default_envs = {"dtype": "float32"}
         self._changeable_envs = {"backend": ["cpu", "gpu"]}
 
     def test_matmul_api(self):
@@ -36,6 +36,16 @@ def test_matmul_api(self):
                 user_defined_envs=envs,
             )
 
+    def test_several_replicated_spmd_api(self):
+        envs_list = test_base.gen_product_envs_list(
+            self._default_envs, self._changeable_envs
+        )
+        for envs in envs_list:
+            self.run_test_case(
+                "semi_auto_parallel_for_replicated_spmd.py",
+                user_defined_envs=envs,
+            )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/auto_parallel/test_semi_auto_parallel_single_strategy.py b/test/auto_parallel/test_semi_auto_parallel_single_strategy.py
new file mode 100644
index 0000000000000..5c30f8b5954be
--- /dev/null
+++ b/test/auto_parallel/test_semi_auto_parallel_single_strategy.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import collective.test_communication_api_base as test_base
+
+
+class TestSemiAutoParallelSingleStrategy(test_base.CommunicationTestDistBase):
+    def setUp(self):
+        super().setUp(num_of_devices=2, timeout=120)
+        self._default_envs = {
+            "dtype": "float32",
+        }
+        self._changeable_envs = {"backend": ["cpu", "gpu"]}
+
+    def test_simple_net_single_strategy(self):
+        envs_list = test_base.gen_product_envs_list(
+            self._default_envs, self._changeable_envs
+        )
+        for envs in envs_list:
+            self.run_test_case(
+                "semi_auto_parallel_simple_net.py",
+                user_defined_envs=envs,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 75998168790ed3763b5f01cd5b7fc8e5ee747183 Mon Sep 17 00:00:00 2001
From: Zhan Rongrui <46243324+zrr1999@users.noreply.github.com>
Date: Fri, 22 Sep 2023 09:48:59 +0800
Subject: [PATCH 046/115] [PIR] No.28 Migrate paddle.gather_nd into pir
 (#57562)

* support gather_nd in pir

* update test

* add false

* fix bug
---
 python/paddle/tensor/manipulation.py  |  2 +-
 test/legacy_test/test_gather_nd_op.py | 85 ++++++++++++++++++---------
 2 files changed, 58 insertions(+), 29 deletions(-)

diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 62de7b884275b..2e366150d3632 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3798,7 +3798,7 @@ def gather_nd(x, index, name=None):
             output = paddle.gather_nd(x, index) #[[3, 4]]
 
     """
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.gather_nd(x, index)
     else:
         check_variable_and_dtype(
diff --git a/test/legacy_test/test_gather_nd_op.py b/test/legacy_test/test_gather_nd_op.py
index 193a9856c9e67..dd1d996715eef 100644
--- a/test/legacy_test/test_gather_nd_op.py
+++ b/test/legacy_test/test_gather_nd_op.py
@@ -53,10 +53,10 @@ def config_dtype(self):
         self.dtype = np.float64
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
 
 
 class TestGatherNdOpWithEmptyIndexFP16(TestGatherNdOpWithEmptyIndex):
@@ -75,11 +75,13 @@ def config_dtype(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_new_ir=True)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X'], 'Out', check_prim=True)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', check_prim=True, check_new_ir=False
+        )
 
 
 class TestGatherNdOpWithIndex1(OpTest):
@@ -112,10 +114,10 @@ def config_dtype(self):
         self.dtype = np.float64
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
 
 
 class TestGatherNdOpWithIndex1_ZeroDim(TestGatherNdOpWithIndex1):
@@ -161,11 +163,13 @@ def config_dtype(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_new_ir=True)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X'], 'Out', check_prim=True)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', check_prim=True, check_new_ir=False
+        )
 
 
 class TestGatherNdOpWithLowIndex(OpTest):
@@ -198,10 +202,10 @@ def config_dtype(self):
         self.dtype = np.float64
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
 
 
 class TestGatherNdOpWithLowIndexFP16(TestGatherNdOpWithLowIndex):
@@ -220,12 +224,17 @@ def config_dtype(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_new_ir=True)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['X'], 'Out', check_prim=True, numeric_grad_delta=0.5
+            place,
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_new_ir=False,
+            numeric_grad_delta=0.5,
         )
 
 
@@ -264,10 +273,16 @@ def config_dtype(self):
         self.dtype = np.float64
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, numeric_grad_delta=0.05)
+        self.check_grad(
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_new_ir=False,
+            numeric_grad_delta=0.05,
+        )
 
 
 class TestGatherNdOpIndex1FP16(TestGatherNdOpIndex1):
@@ -286,12 +301,17 @@ def config_dtype(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_new_ir=True)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['X'], 'Out', check_prim=True, numeric_grad_delta=0.5
+            place,
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_new_ir=False,
+            numeric_grad_delta=0.5,
         )
 
 
@@ -322,10 +342,10 @@ def config_dtype(self):
         self.dtype = np.float64
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
 
 
 class TestGatherNdOpWithSameIndexAsXFP16(TestGatherNdOpWithSameIndexAsX):
@@ -344,12 +364,17 @@ def config_dtype(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_new_ir=True)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['X'], 'Out', check_prim=True, numeric_grad_delta=0.5
+            place,
+            ['X'],
+            'Out',
+            check_prim=True,
+            check_new_ir=False,
+            numeric_grad_delta=0.5,
         )
 
 
@@ -382,10 +407,10 @@ def config_dtype(self):
         self.dtype = np.float64
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
 
 
 class TestGatherNdOpWithHighRankSameFP16(TestGatherNdOpWithHighRankSame):
@@ -404,11 +429,13 @@ def config_dtype(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_new_ir=True)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X'], 'Out', check_prim=True)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', check_prim=True, check_new_ir=False
+        )
 
 
 class TestGatherNdOpWithHighRankDiff(OpTest):
@@ -441,10 +468,10 @@ def config_dtype(self):
         self.dtype = np.float64
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
 
 
 class TestGatherNdOpWithHighRankDiffFP16(TestGatherNdOpWithHighRankDiff):
@@ -463,11 +490,13 @@ def config_dtype(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_new_ir=True)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X'], 'Out', check_prim=True)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', check_prim=True, check_new_ir=False
+        )
 
 
 # Test Python API

From 92f924958017218d08f20561b519a9cf1be52c64 Mon Sep 17 00:00:00 2001
From: 6clc <chaoliu.lc@foxmail.com>
Date: Fri, 22 Sep 2023 09:51:47 +0800
Subject: [PATCH 047/115] cinn(py-dsl): add ir context used in python dsl
 (#57515)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

拆分新特性：CINN Python DSL
单测和e2e测试在主开发PR上，详细见主开发PR描述 #56393

修改了IRCompare，支持对比IR的结构是否相同
封装了一些需要上下文信息才能构建的IR。 封装为IRContext类，开放出C++接口给Python层调用。包括下面的IR
---
 paddle/cinn/ir/lowered_func.cc             |  10 +
 paddle/cinn/ir/lowered_func.h              |  13 +-
 paddle/cinn/ir/module.cc                   |   2 +
 paddle/cinn/ir/module.h                    |   1 +
 paddle/cinn/ir/tensor.cc                   |  17 ++
 paddle/cinn/ir/tensor.h                    |   7 +
 paddle/cinn/ir/utils/ir_compare.cc         |  86 ++++---
 paddle/cinn/ir/utils/ir_compare.h          |  12 +-
 paddle/cinn/pybind/CMakeLists.txt          |   4 +-
 paddle/cinn/pybind/common.cc               |   3 +-
 paddle/cinn/pybind/ir/ir.cc                |  98 ++++++++
 paddle/cinn/pybind/ir/ir.h                 |  35 +++
 paddle/cinn/pybind/{ir.cc => ir/ir_api.cc} | 139 ++++++++++-
 paddle/cinn/pybind/ir/ir_context.cc        | 134 +++++++++++
 paddle/cinn/pybind/ir/ir_context.h         | 256 +++++++++++++++++++++
 paddle/cinn/runtime/cinn_runtime.h         |   3 +-
 16 files changed, 770 insertions(+), 50 deletions(-)
 mode change 100755 => 100644 paddle/cinn/ir/lowered_func.h
 create mode 100644 paddle/cinn/pybind/ir/ir.cc
 create mode 100644 paddle/cinn/pybind/ir/ir.h
 rename paddle/cinn/pybind/{ir.cc => ir/ir_api.cc} (85%)
 create mode 100644 paddle/cinn/pybind/ir/ir_context.cc
 create mode 100644 paddle/cinn/pybind/ir/ir_context.h
 mode change 100755 => 100644 paddle/cinn/runtime/cinn_runtime.h

diff --git a/paddle/cinn/ir/lowered_func.cc b/paddle/cinn/ir/lowered_func.cc
index ec5f4b2e64ce6..410ac068df85d 100644
--- a/paddle/cinn/ir/lowered_func.cc
+++ b/paddle/cinn/ir/lowered_func.cc
@@ -64,6 +64,16 @@ LoweredFunc _LoweredFunc_::Make(const std::string& name,
   return LoweredFunc(n);
 }
 
+LoweredFunc _LoweredFunc_::Make(const std::string& name,
+                                const std::vector<Argument>& args,
+                                const Expr& body) {
+  auto* n = make_shared<_LoweredFunc_>();
+  n->name = name;
+  n->args = args;
+  n->body = body;
+  return LoweredFunc(n);
+}
+
 void _LoweredFunc_::CheckValid() const {
   // check there is at least one output
   int out_count = 0;
diff --git a/paddle/cinn/ir/lowered_func.h b/paddle/cinn/ir/lowered_func.h
old mode 100755
new mode 100644
index 03ffacad817bd..b305f84506fe4
--- a/paddle/cinn/ir/lowered_func.h
+++ b/paddle/cinn/ir/lowered_func.h
@@ -30,8 +30,10 @@ class _LoweredFunc_;
  * the function signature of generated code.
  */
 struct Argument {
-  //! Input or output.
-  enum class IO { kInput = 0, kOutput = 1 };
+  //! kInput: arg is input
+  //! kOutput: arg is output
+  //! kUnknown: arg maybe input or output
+  enum class IO { kInput = 0, kOutput = 1, kUnknown = 2 };
 
   IO io{IO::kInput};
 
@@ -164,6 +166,13 @@ struct _LoweredFunc_ : ExprNode<_LoweredFunc_> {
                           const Expr& body,
                           const std::vector<ir::Buffer>& temp_bufs);
 
+  // A simple version of the make function method,
+  // regardless of the argument buffer information and IO information of
+  // Argument, after building the function to optimize the buffer through pass
+  static LoweredFunc Make(const std::string& name,
+                          const std::vector<Argument>& args,
+                          const Expr& body);
+
   bool is_gpu_host() const { return cuda_axis_info.valid(); }
 
   void Verify() const override {}
diff --git a/paddle/cinn/ir/module.cc b/paddle/cinn/ir/module.cc
index d52ee148b8bc8..fa791dcdbcd62 100644
--- a/paddle/cinn/ir/module.cc
+++ b/paddle/cinn/ir/module.cc
@@ -54,6 +54,8 @@ void Module::Builder::Clear() {
   module_->submodules.clear();
 }
 
+Target::Arch Module::Builder::GetTargetArch() { return module_->target.arch; }
+
 Module Module::Builder::Build() {
   if (module_->functions.empty()) {
     VLOG(1) << "Module has no functions";
diff --git a/paddle/cinn/ir/module.h b/paddle/cinn/ir/module.h
index 9d2b361083071..a057c4862cc0e 100644
--- a/paddle/cinn/ir/module.h
+++ b/paddle/cinn/ir/module.h
@@ -45,6 +45,7 @@ class Module : public ir::IrNodeRef {
     void AddFunctionWithoutOptim(const ir::LoweredFunc& func);
     void AddBuffer(ir::Buffer buffer);
     void Clear();
+    Target::Arch GetTargetArch();
 
     Module Build();
 
diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc
index 8ad8b9878d4bc..ca7147db69249 100644
--- a/paddle/cinn/ir/tensor.cc
+++ b/paddle/cinn/ir/tensor.cc
@@ -53,6 +53,23 @@ Tensor _Tensor_::Make(const std::string &name,
 
   return Tensor(n);
 }
+Tensor _Tensor_::Make(const std::string &name,
+                      Type dtype,
+                      const std::vector<Expr> &shape,
+                      const std::vector<Expr> &domain,
+                      const std::vector<Var> &reduce_axis) {
+  CHECK(!name.empty()) << "Cannot set empty Tensor name in Tensor::Make";
+  auto n = make_shared<_Tensor_>();
+  n->name = name;
+  n->shape = shape;
+  n->domain = domain;
+  n->reduce_axis = reduce_axis;
+  n->operation = PlaceholderOp::Make(n->name, n->shape, Float(32));
+  n->set_type(dtype);
+  n->InitAxis();
+
+  return Tensor(n);
+}
 
 size_t Tensor::ndims() const { return operator->()->shape.size(); }
 
diff --git a/paddle/cinn/ir/tensor.h b/paddle/cinn/ir/tensor.h
index 5c252d35faceb..56995559dba94 100644
--- a/paddle/cinn/ir/tensor.h
+++ b/paddle/cinn/ir/tensor.h
@@ -149,6 +149,13 @@ class _Tensor_ : public ExprNode<_Tensor_> {
                      FunctionRef fn,
                      const std::vector<Var>& reduce_axis = {});
 
+  // Manual tensor construction, no FunctionRef information
+  static Tensor Make(const std::string& name,
+                     Type dtype,
+                     const std::vector<Expr>& shape,
+                     const std::vector<Expr>& domain,
+                     const std::vector<Var>& reduce_axis = {});
+
   void Verify() const override;
 
   bool IsReduceInited(poly::StageMap stages) const;
diff --git a/paddle/cinn/ir/utils/ir_compare.cc b/paddle/cinn/ir/utils/ir_compare.cc
index 87324be608048..fbe7a65c43efc 100644
--- a/paddle/cinn/ir/utils/ir_compare.cc
+++ b/paddle/cinn/ir/utils/ir_compare.cc
@@ -29,6 +29,10 @@ bool IrEqualVisitor::Compare(const Expr& lhs, const Expr& rhs) {
     return true;
   }
 
+  if (only_compare_structure_ && !lhs.defined() && !rhs.defined()) {
+    return true;
+  }
+
   if (!lhs.defined() || !rhs.defined()) {  // someone invalid
     return false;
     VLOG(5) << "Not equal on Expr, someone not defined";
@@ -46,10 +50,9 @@ bool IrEqualVisitor::Compare(const Expr& lhs, const Expr& rhs) {
   return equal;
 }
 
-bool IrEqualVisitor::Compare(const std::string& lhs,
-                             const std::string& rhs,
-                             bool allow_name_suffix_diff) {
-  // if allow_name_suffix_diff=true then just compare the name prefix before the
+bool IrEqualVisitor::Compare(const std::string& lhs, const std::string& rhs) {
+  // if allow_name_suffix_diff_=true then just compare the name prefix before
+  // the
   // "_[0-9]+"
   auto common_len = 0;
   for (; common_len < lhs.size() && common_len < rhs.size(); ++common_len) {
@@ -67,7 +70,7 @@ bool IrEqualVisitor::Compare(const std::string& lhs,
     equal = true;
   } else {
     equal = false;
-    if (allow_name_suffix_diff) {
+    if (allow_name_suffix_diff_) {
       equal = is_endswith_index(lhs) && is_endswith_index(rhs);
     }
   }
@@ -181,17 +184,26 @@ bool IrEqualVisitor::Visit(const Block* lhs, const Expr* other) {
 
 bool IrEqualVisitor::Visit(const Call* lhs, const Expr* other) {
   auto* rhs = other->As<Call>();
-  return lhs->name == rhs->name && Compare(lhs->read_args, rhs->read_args) &&
-         Compare(lhs->write_args, rhs->write_args) &&
-         Compare(lhs->attrs, rhs->attrs) && lhs->call_type == rhs->call_type;
+  bool flag = Compare(lhs->read_args, rhs->read_args) &&
+              Compare(lhs->write_args, rhs->write_args) &&
+              Compare(lhs->attrs, rhs->attrs) &&
+              lhs->call_type == rhs->call_type;
+  if (only_compare_structure_) {
+    return flag;
+  }
+  return lhs->name == rhs->name && flag;
   // TODO(CtfGo): Compare `func` field
 }
 
 bool IrEqualVisitor::Visit(const _Var_* lhs, const Expr* other) {
   auto* rhs = other->As<_Var_>();
-  return lhs->name == rhs->name &&
-         Compare(lhs->lower_bound, rhs->lower_bound) &&
-         Compare(lhs->upper_bound, rhs->upper_bound) && lhs->tag == rhs->tag;
+  bool flag = Compare(lhs->lower_bound, rhs->lower_bound) &&
+              Compare(lhs->upper_bound, rhs->upper_bound) &&
+              lhs->tag == rhs->tag;
+  if (only_compare_structure_) {
+    return flag;
+  }
+  return lhs->name == rhs->name && flag;
 }
 
 bool IrEqualVisitor::Visit(const Load* lhs, const Expr* other) {
@@ -221,19 +233,25 @@ bool IrEqualVisitor::Visit(const Free* lhs, const Expr* other) {
 
 bool IrEqualVisitor::Visit(const _Buffer_* lhs, const Expr* other) {
   auto* rhs = other->As<_Buffer_>();
-  return Compare(lhs->shape, rhs->shape) &&
-         Compare(lhs->strides, rhs->strides) && lhs->name == rhs->name &&
-         lhs->scope == rhs->scope &&
-         Compare(lhs->elem_offset, rhs->elem_offset) &&
-         lhs->offset_factor == rhs->offset_factor &&
-         lhs->target == rhs->target &&
-         lhs->data_alignment == rhs->data_alignment &&
-         lhs->memory_type == rhs->memory_type && lhs->dtype == rhs->dtype;
+  bool flag =
+      Compare(lhs->shape, rhs->shape) && Compare(lhs->strides, rhs->strides) &&
+      lhs->scope == rhs->scope && Compare(lhs->elem_offset, rhs->elem_offset) &&
+      lhs->offset_factor == rhs->offset_factor && lhs->target == rhs->target &&
+      lhs->data_alignment == rhs->data_alignment &&
+      lhs->memory_type == rhs->memory_type && lhs->dtype == rhs->dtype;
+  if (only_compare_structure_) {
+    return flag;
+  }
+  return flag && lhs->name == rhs->name;
 }
 
 bool IrEqualVisitor::Visit(const _Tensor_* lhs, const Expr* other) {
   auto* rhs = other->As<_Tensor_>();
-  return lhs->name == rhs->name && Compare(lhs->shape, rhs->shape);
+  bool flag = Compare(lhs->shape, rhs->shape);
+  if (only_compare_structure_) {
+    return flag;
+  }
+  return flag && Compare(lhs->name, rhs->name);
 }
 
 bool IrEqualVisitor::Visit(const _LoweredFunc_* lhs, const Expr* other) {
@@ -282,10 +300,15 @@ bool IrEqualVisitor::Visit(const _LoweredFunc_* lhs, const Expr* other) {
 
 bool IrEqualVisitor::Visit(const _Module_* lhs, const Expr* other) {
   auto* rhs = other->As<_Module_>();
-  return lhs->name == rhs->name && lhs->target == rhs->target &&
-         Compare(lhs->buffers, rhs->buffers) &&
-         Compare(lhs->functions, rhs->functions) &&
-         Compare(lhs->submodules, rhs->submodules);
+  bool flag = Compare(lhs->buffers, rhs->buffers) &&
+              Compare(lhs->functions, rhs->functions) &&
+              Compare(lhs->submodules, rhs->submodules);
+
+  if (only_compare_structure_) {
+    return flag;
+  }
+
+  return flag && lhs->name == rhs->name;
 }
 
 bool IrEqualVisitor::Visit(const Let* lhs, const Expr* other) {
@@ -347,11 +370,16 @@ bool IrEqualVisitor::Visit(const _BufferRange_* lhs, const Expr* other) {
 
 bool IrEqualVisitor::Visit(const ScheduleBlock* lhs, const Expr* other) {
   auto* rhs = other->As<ScheduleBlock>();
-  return Compare(lhs->name, rhs->name, allow_name_suffix_diff_) &&
-         Compare(lhs->iter_vars, rhs->iter_vars) &&
-         Compare(lhs->read_buffers, rhs->read_buffers) &&
-         Compare(lhs->write_buffers, rhs->write_buffers) &&
-         Compare(lhs->attrs, rhs->attrs) && Compare(lhs->body, rhs->body);
+  bool flag = Compare(lhs->iter_vars, rhs->iter_vars) &&
+              Compare(lhs->read_buffers, rhs->read_buffers) &&
+              Compare(lhs->write_buffers, rhs->write_buffers) &&
+              Compare(lhs->body, rhs->body);
+
+  if (only_compare_structure_) {
+    return flag;
+  }
+  return flag && Compare(lhs->attrs, rhs->attrs) &&
+         Compare(lhs->name, rhs->name);
 }
 
 bool IrEqualVisitor::Visit(const ScheduleBlockRealize* lhs, const Expr* other) {
diff --git a/paddle/cinn/ir/utils/ir_compare.h b/paddle/cinn/ir/utils/ir_compare.h
index d41e6db0441a7..03ec82c246750 100644
--- a/paddle/cinn/ir/utils/ir_compare.h
+++ b/paddle/cinn/ir/utils/ir_compare.h
@@ -26,15 +26,15 @@ namespace ir_utils {
 // fields of each node through dfs visitor
 class IrEqualVisitor : public IRVisitorRequireReImpl<bool, const Expr*> {
  public:
-  explicit IrEqualVisitor(bool allow_name_suffix_diff = false)
-      : allow_name_suffix_diff_(allow_name_suffix_diff) {}
+  explicit IrEqualVisitor(bool allow_name_suffix_diff = false,
+                          bool only_compare_structure = false)
+      : allow_name_suffix_diff_(allow_name_suffix_diff),
+        only_compare_structure_(only_compare_structure) {}
   // Return true if they are euqal, otherwise false;
   bool Compare(const Expr& lhs, const Expr& rhs);
 
  private:
-  bool Compare(const std::string& lhs,
-               const std::string& rhs,
-               bool allow_name_suffix_diff = false);
+  bool Compare(const std::string& lhs, const std::string& rhs);
   bool Compare(const std::map<std::string, attr_t>& lhs,
                const std::map<std::string, attr_t>& rhs);
   template <typename T>
@@ -46,6 +46,8 @@ class IrEqualVisitor : public IRVisitorRequireReImpl<bool, const Expr*> {
 
   // whether allowing name suffix ends with "_[0-9]+" different
   bool allow_name_suffix_diff_ = false;
+  // not compare name field of Expr
+  bool only_compare_structure_ = false;
 };
 
 bool IRCompare(const Expr& lhs,
diff --git a/paddle/cinn/pybind/CMakeLists.txt b/paddle/cinn/pybind/CMakeLists.txt
index bf6e3d095377f..c00a64614f643 100755
--- a/paddle/cinn/pybind/CMakeLists.txt
+++ b/paddle/cinn/pybind/CMakeLists.txt
@@ -2,7 +2,9 @@ set(srcs
     runtime.cc
     common.cc
     lang.cc
-    ir.cc
+    ir/ir.cc
+    ir/ir_api.cc
+    ir/ir_context.cc
     poly.cc
     backends.cc
     bind.cc
diff --git a/paddle/cinn/pybind/common.cc b/paddle/cinn/pybind/common.cc
index 170ebfc6d6916..bdb4b46c848ef 100644
--- a/paddle/cinn/pybind/common.cc
+++ b/paddle/cinn/pybind/common.cc
@@ -94,6 +94,7 @@ void BindTarget(py::module *m) {
 void BindType(py::module *m) {
   py::class_<Type> type(*m, "Type");
   type.def(py::init<>())
+      .def(py::init<Type &>())
       .def(py::init<Type::type_t, int, int, Type::specific_type_t>());
 #define DEFINE_TYPE_METHOD(__name) (type = type.def(#__name, &Type::__name))
   DEFINE_TYPE_METHOD(is_primitive);
@@ -140,7 +141,7 @@ void BindType(py::module *m) {
       .export_values();
 
   py::enum_<Type::specific_type_t> specific_type_t(type, "specific_type_t");
-  specific_type_t.value("None", Type::specific_type_t::None)
+  specific_type_t.value("UNK", Type::specific_type_t::None)
       .value("FP16", Type::specific_type_t::FP16)
       .value("BF16", Type::specific_type_t::BF16)
       .export_values();
diff --git a/paddle/cinn/pybind/ir/ir.cc b/paddle/cinn/pybind/ir/ir.cc
new file mode 100644
index 0000000000000..f569bd2c973be
--- /dev/null
+++ b/paddle/cinn/pybind/ir/ir.cc
@@ -0,0 +1,98 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/pybind/ir/ir.h"
+#include "paddle/cinn/pybind/ir/ir_context.h"
+namespace cinn {
+namespace pybind {
+void TensorStore(Expr tensor, Expr value, const std::vector<Expr>& indices) {
+  // TODO(6clc): Check the compatibility of data types for tensor and value
+  IRContext find_sch_block =
+      IRBuilder::CurrentIRBuilder()
+          .data_->FindContext<ScheduleBlockContextNode>();
+  if (!find_sch_block.data_.defined()) {
+    IRContext sch_block(new ScheduleBlockContextNode());
+    sch_block.data_->EnterWithContext();
+    LinkToParentContext(ir::Store::Make(tensor, value, indices));
+    sch_block.data_->ExitWithContext();
+    return;
+  }
+  LinkToParentContext(ir::Store::Make(tensor, value, indices));
+}
+std::vector<Expr> AxisMap(const std::string& kinds,
+                          const std::vector<Expr>& iter_expression) {
+  std::vector<Expr> rets;
+  CHECK_EQ(kinds.size(), iter_expression.size());
+  int n = iter_expression.size();
+  rets.reserve(n);
+  for (int i = 0; i < n; i++) {
+    char c = kinds.c_str()[i];
+
+    // TODO(6clc): set bound of IterVar
+
+    Var iter_var = ir::_Var_::Make("iter_tmp", common::Int(32));
+    if (c == 'S') {
+      iter_var->is_reduce_axis = false;
+    } else if (c == 'R') {
+      iter_var->is_reduce_axis = true;
+    } else {
+      LOG(FATAL)
+          << "kind of axis setting error, must be R(Reduce) or S(Spatial)";
+    }
+    rets.push_back(SetScheduleBlockIterVar(iter_var, iter_expression[i]));
+  }
+  return rets;
+}
+Var SetScheduleBlockIterVar(Var iter_var, Expr expr) {
+  IRContext cur_context =
+      IRBuilder::CurrentIRBuilder()
+          .data_->GetLastContext<ScheduleBlockContextNode>();
+  ScheduleBlockContextNode* cur_context_node =
+      cur_context.As<ScheduleBlockContextNode>();
+  cur_context_node->iter_vars.push_back(iter_var);
+  cur_context_node->iter_values.push_back(expr);
+  return iter_var.operator Expr();
+}
+
+Expr Arg(const std::string& name, Var var) {
+  IRContext ctx =
+      IRBuilder::CurrentIRBuilder().data_->FindContext<LowerFuncContextNode>();
+  var->name = name;
+  ctx.As<LowerFuncContextNode>()->args.emplace_back(var,
+                                                    ir::Argument::IO::kUnknown);
+  return var.operator Expr();
+}
+
+Expr Arg(const std::string& name, ir::Buffer buffer) {
+  IRContext ctx =
+      IRBuilder::CurrentIRBuilder().data_->FindContext<LowerFuncContextNode>();
+  buffer->name = "_" + name;
+  // TODO(6clc): Unify cinn compilation and runtime Type,
+  //  and add a Handle type to Var
+  ctx.As<LowerFuncContextNode>()->args.emplace_back(buffer,
+                                                    ir::Argument::IO::kUnknown);
+  return buffer.operator Expr();
+}
+
+IRContext Sequential(Expr min, Expr extent) {
+  ForContextNode* for_ctx_node = new ForContextNode();
+  for_ctx_node->min = min;
+  for_ctx_node->extent = extent;
+  for_ctx_node->loop_var = ir::_Var_::Make("v", common::Int(32));
+  return IRContext(for_ctx_node);
+}
+
+}  // namespace pybind
+
+}  // namespace cinn
diff --git a/paddle/cinn/pybind/ir/ir.h b/paddle/cinn/pybind/ir/ir.h
new file mode 100644
index 0000000000000..9a4e2e2263f0e
--- /dev/null
+++ b/paddle/cinn/pybind/ir/ir.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/pybind/ir/ir_context.h"
+namespace cinn {
+namespace pybind {
+
+template IRContext IRBuilderNode::GetLastContext<ScheduleBlockContextNode>()
+    const;
+Var SetScheduleBlockIterVar(Var iter_var, Expr expr);
+std::vector<Expr> AxisMap(const std::string &kinds,
+                          const std::vector<Expr> &iter_expression);
+void TensorStore(Expr tensor, Expr value, const std::vector<Expr> &indices);
+Expr Arg(const std::string &name, Var var);
+Expr Arg(const std::string &name, ir::Buffer buffer);
+IRContext Sequential(Expr min, Expr extent);
+}  // namespace pybind
+}  // namespace cinn
diff --git a/paddle/cinn/pybind/ir.cc b/paddle/cinn/pybind/ir/ir_api.cc
similarity index 85%
rename from paddle/cinn/pybind/ir.cc
rename to paddle/cinn/pybind/ir/ir_api.cc
index b03b7181509d8..66c0e2306d8cc 100644
--- a/paddle/cinn/pybind/ir.cc
+++ b/paddle/cinn/pybind/ir/ir_api.cc
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/cinn/ir/ir.h"
-
 #include <llvm/Support/FormatVariadic.h>
 #include <pybind11/functional.h>
 #include <pybind11/operators.h>
@@ -22,21 +20,29 @@
 #include <string>
 #include <type_traits>
 
+#include "paddle/cinn/common/shared.h"
+#include "paddle/cinn/ir/ir.h"
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/ir/lowered_func.h"
 #include "paddle/cinn/ir/op/ir_operators.h"
 #include "paddle/cinn/ir/operation.h"
 #include "paddle/cinn/ir/registry.h"
+#include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "paddle/cinn/ir/tensor.h"
+#include "paddle/cinn/ir/utils/ir_compare.h"
 #include "paddle/cinn/ir/utils/ir_printer.h"
 #include "paddle/cinn/ir/utils/ir_visitor.h"
 #include "paddle/cinn/lang/packed_func.h"
 #include "paddle/cinn/poly/stage.h"
 #include "paddle/cinn/pybind/bind.h"
 #include "paddle/cinn/pybind/bind_utils.h"
+#include "paddle/cinn/pybind/ir/ir.h"
+#include "paddle/cinn/pybind/ir/ir_context.h"
 
 namespace py = pybind11;
 
+PYBIND11_DECLARE_HOLDER_TYPE(T, cinn::common::Shared<T>);
+
 namespace cinn::pybind {
 using ir::IrNode;
 using ir::IrNodeRef;
@@ -62,7 +68,8 @@ void BindLoweredFunc(py::module *m) {
 
   py::enum_<Argument::IO> io(argument, "IO");
   io.value("kInput", Argument::IO::kInput)
-      .value("kOutput", Argument::IO::kOutput);
+      .value("kOutput", Argument::IO::kOutput)
+      .value("kUnknown", Argument::IO::kUnknown);
 
   argument
       .def(py::init<const ir::Buffer &, Argument::IO>(),
@@ -93,10 +100,12 @@ void BindLoweredFunc(py::module *m) {
            [](const ir::LoweredFunc &self) -> std::string {
              return utils::GetStreamCnt(Expr(self));
            })
-      .def("__repr__", [](const ir::LoweredFunc &self) -> std::string {
-        return llvm::formatv(
-            "<LoweredFunc {0}>", self.get(), self->name.c_str());
-      });
+      .def("__repr__",
+           [](const ir::LoweredFunc &self) -> std::string {
+             return llvm::formatv(
+                 "<LoweredFunc {0}>", self.get(), self->name.c_str());
+           })
+      .def("body", [](const ir::LoweredFunc &self) { return self->body; });
 }
 
 void BindNode(py::module *m) {
@@ -258,6 +267,13 @@ void BindNode(py::module *m) {
 
 // empty visitor
 void BindIrVisitor(py::module *m) {
+  py::class_<ir::ir_utils::IrEqualVisitor> ir_compare(*m, "IrCompare");
+  ir_compare.def(py::init<bool, bool>())
+      .def("compare",
+           [](ir::ir_utils::IrEqualVisitor &self,
+              const cinn::ir::Expr &lhs,
+              const cinn::ir::Expr &rhs) { return self.Compare(lhs, rhs); });
+
   py::class_<ir::IRVisitor> ir_visitor(*m, "IRVisitor");
   ir_visitor.def(py::init<>())
       .def("visit", py::overload_cast<const ir::Expr *>(&ir::IRVisitor::Visit));
@@ -466,6 +482,7 @@ void BindIrIr(py::module *m) {
       .def(py::init<Expr, Expr, const std::string &>())
       .def(py::init<int, const std::string &>())
       .def(py::init<Expr, const std::string &>())
+      .def("rename", [](Var &self, std::string &name) { self->name = name; })
       .def("get_mutable",
            py::overload_cast<>(&Var::get),
            py::return_value_policy::reference)
@@ -537,6 +554,31 @@ void BindIrIr(py::module *m) {
       .def_readwrite("buffers", &ir::_Module_::buffers)
       .def_readwrite("functions", &ir::_Module_::functions)
       .def_readwrite("submodules", &ir::_Module_::submodules);
+
+  DefineExprNode<ir::_Buffer_>(m, "_Buffer_");
+  py::class_<ir::_Buffer_, ir::ExprNode<ir::_Buffer_>> _buffer_(*m, "_Buffer_");
+  _buffer_
+      .def_static(
+          "make",
+          py::overload_cast<const std::string &, Type>(&ir::_Buffer_::Make))
+      .def_static(
+          "make",
+          py::overload_cast<const std::string &, const std::vector<Expr> &>(
+              &ir::_Buffer_::Make));
+  py::class_<ir::Buffer> buffer(*m, "Buffer");
+  buffer.def(py::init<>());
+
+  py::class_<ir::ModuleExpr> module_expr(*m, "ModuleExpr");
+  module_expr.def(py::init<const std::vector<Expr> &>());
+
+  DefineExprNode<ir::IfThenElse>(m, "IfThenElse");
+  py::class_<ir::IfThenElse> if_then_else(*m, "IfThenElse");
+  if_then_else.def_static(
+      "make",
+      py::overload_cast<Expr, Expr, Expr>(&ir::IfThenElse::Make),
+      py::arg("condition"),
+      py::arg("true_case"),
+      py::arg("false_case") = ir::Expr());
 }
 
 void BindOperation(py::module *m) {
@@ -586,9 +628,24 @@ void BindIrTensor(py::module *m) {
            [](ir::Tensor &self, Expr a, Expr b, Expr c) {
              return self(a, b, c);
            })
-      .def("__call__", [](ir::Tensor &self, Expr a, Expr b, Expr c, Expr d) {
-        return self(a, b, c, d);
-      });
+      .def("__call__",
+           [](ir::Tensor &self, Expr a, Expr b, Expr c, Expr d) {
+             return self(a, b, c, d);
+           })
+      .def("__getitem__", [](ir::Tensor &self, Expr a) { return self(a); })
+      .def("__getitem__",
+           [](ir::Tensor &self, Expr a, Expr b) { return self(a, b); })
+      .def("__getitem__",
+           [](ir::Tensor &self, Expr a, Expr b, Expr c) {
+             return self(a, b, c);
+           })
+      .def("__getitem__",
+           [](ir::Tensor &self, Expr a, Expr b, Expr c, Expr d) {
+             return self(a, b, c, d);
+           })
+      .def("__getitem__",
+           [](ir::Tensor &self, std::vector<Expr> idx) { return self(idx); })
+      .def("Expr", [](ir::Tensor &self) { return self.operator Expr(); });
 
   DefineExprNode<ir::_Tensor_>(m, "_Tensor_");
   py::class_<ir::_Tensor_, ir::ExprNode<ir::_Tensor_>> _tensor_(*m, "_Tensor_");
@@ -600,7 +657,18 @@ void BindIrTensor(py::module *m) {
       .def("domain_with_reduce_axis", &ir::_Tensor_::domain_without_reduce_axis)
       .def("domain_without_reduce_axis",
            &ir::_Tensor_::domain_without_reduce_axis)
-      .def_static("make", &ir::_Tensor_::Make)
+      .def_static(
+          "make",
+          py::overload_cast<const std::string &,
+                            Type,
+                            const std::vector<Expr> &,
+                            const std::vector<Expr> &,
+                            const std::vector<Var> &>(&ir::_Tensor_::Make),
+          py::arg("name"),
+          py::arg("dtype"),
+          py::arg("shape"),
+          py::arg("domain"),
+          py::arg("reduce_axis") = std::vector<Var>({}))
       .def("is_tuple", &ir::_Tensor_::is_tuple)
       .def("is_tuple_get", &ir::_Tensor_::is_tuple_get)
       .def("tuple_get", &ir::_Tensor_::TupleGet)
@@ -741,6 +809,54 @@ void BindRegistry(py::module *m) {
       });
 #endif
 }
+
+void BindIrContext(py::module *m) {
+  using ir::Expr;
+  using ir::IrNode;
+  using ir::IrNodeRef;
+  using ir::Var;
+  using py::arg;
+
+  py::class_<IRContext> ir_ctx(*m, "IRContext");
+  ir_ctx.def(py::init<>())
+      .def(py::init<IRContextNode *>())
+      .def("EnterWithContext",
+           [](IRContext &self) { self.data_->EnterWithContext(); })
+      .def("ExitWithContext",
+           [](IRContext &self) { self.data_->ExitWithContext(); })
+      .def("get_for_loop_var",
+           [](IRContext &self) {
+             return self.data_->safe_as<ForContextNode>()->loop_var;
+           })
+      .def_static("MakeLowerFunctionContext",
+                  [](std::string &name) {
+                    return IRContext(new LowerFuncContextNode(name));
+                  })
+      .def_static("MakeScheduleBlockContext",
+                  [](std::string &name) {
+                    return IRContext(new ScheduleBlockContextNode(name));
+                  })
+      .def_static("MakeIfContext",
+                  [](Expr expr) { return IRContext(new IfContextNode(expr)); })
+      .def_static("MakeElseContext",
+                  []() { return IRContext(new ElseContextNode()); })
+      .def_static("MakeThenContext",
+                  []() { return IRContext(new ThenContextNode()); });
+
+  py::class_<IRBuilder> ir_builder(*m, "IRBuilder");
+  ir_builder.def(py::init<>())
+      .def("EnterWithContext", &IRBuilder::EnterWithContext)
+      .def("ExitWithContext", &IRBuilder::ExitWithContext)
+      .def("get_result", [](IRBuilder &self) {
+        return self.data_->GetResult().as_lowered_func_ref();
+      });
+
+  m->def("AxisMap", &AxisMap);
+  m->def("TensorStore", &TensorStore);
+  m->def("Arg", py::overload_cast<const std::string &, Var>(&Arg));
+  m->def("Arg", py::overload_cast<const std::string &, ir::Buffer>(&Arg));
+  m->def("Sequential", py::overload_cast<Expr, Expr>(&Sequential));
+}
 }  // namespace
 
 void BindIr(py::module *m) {
@@ -750,6 +866,7 @@ void BindIr(py::module *m) {
   BindIrVisitor(m);
   BindIrIr(m);
   BindIrTensor(m);
+  BindIrContext(m);
   BindPackedFunc(m);
   BindRegistry(m);
 }
diff --git a/paddle/cinn/pybind/ir/ir_context.cc b/paddle/cinn/pybind/ir/ir_context.cc
new file mode 100644
index 0000000000000..8af89d974222f
--- /dev/null
+++ b/paddle/cinn/pybind/ir/ir_context.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/pybind/ir/ir_context.h"
+#include "paddle/cinn/ir/ir.h"
+
+namespace cinn {
+namespace pybind {
+void IRContextNode::EnterWithContext() {
+  IRBuilder::CurrentIRBuilder().data_->contexts.emplace_back(this);
+}
+void IRContextNode::ExitWithContext() {
+  IRBuilder::CurrentIRBuilder().data_->contexts.pop_back();
+}
+
+void ScheduleBlockContextNode::ExitWithContext() {
+  IRContextNode::ExitWithContext();
+  ir::Expr schedule_block = ir::ScheduleBlock::Make(
+      iter_vars, read_buffers, write_buffers, name, ir::Block::Make(exprs));
+
+  ir::Expr schedule_block_realize =
+      ir::ScheduleBlockRealize::Make(iter_values, schedule_block);
+  LinkToParentContext(schedule_block_realize);
+}
+
+void ForContextNode::ExitWithContext() {
+  IRContextNode::ExitWithContext();
+  LinkToParentContext(ir::For::Make(loop_var,
+                                    min,
+                                    extent,
+                                    ir::ForType::Serial,
+                                    ir::DeviceAPI::UNK,
+                                    ir::Block::Make(exprs)));
+}
+
+void LowerFuncContextNode::ExitWithContext() {
+  IRContextNode::ExitWithContext();
+  // TODO(6clc): implement Private Fields for intrinstic function, like
+  // allreduce
+  Expr body = ir::ScheduleBlockRealize::Make(
+      {}, ir::ScheduleBlock::Make({}, {}, {}, "root", ir::Block::Make(exprs)));
+  ir::LoweredFunc lower_func =
+      ir::_LoweredFunc_::Make(name, args, ir::Block::Make({body}));
+  IRBuilder ir_builder = IRBuilder::CurrentIRBuilder();
+  ir_builder.data_->result = lower_func.operator Expr();
+}
+
+void IfContextNode::ExitWithContext() {
+  IRContextNode::ExitWithContext();
+  if (!exprs.empty()) {
+    LOG(FATAL) << "Expr not be either in ThenBlock or ElseBlock in if";
+  }
+  if (!true_case.defined()) {
+    LOG(FATAL) << "Expr not be defined in ThenBlock";
+  }
+  LinkToParentContext(ir::IfThenElse::Make(condition, true_case, false_case));
+}
+
+void ThenContextNode::ExitWithContext() {
+  IRContextNode::ExitWithContext();
+  IRContext for_ctx =
+      IRBuilder::CurrentIRBuilder().data_->GetLastContext<IfContextNode>();
+  for_ctx.data_->safe_as<IfContextNode>()->true_case = ir::Block::Make(exprs);
+}
+
+void ElseContextNode::ExitWithContext() {
+  IRContextNode::ExitWithContext();
+  IRContext for_ctx =
+      IRBuilder::CurrentIRBuilder().data_->GetLastContext<IfContextNode>();
+  for_ctx.data_->safe_as<IfContextNode>()->false_case = ir::Block::Make(exprs);
+}
+
+Expr IRBuilderNode::GetResult() const {
+  CHECK(result.defined()) << "No result generated in IRBuilder";
+  return result;
+}
+
+void IRBuilderNode::Reset() {
+  contexts.clear();
+  result.Reset();
+}
+
+IRBuilder::IRBuilder() {
+  common::Shared<IRBuilderNode> n(new IRBuilderNode());
+  n->Reset();
+  data_ = n;
+}
+
+void IRBuilder::EnterWithContext() {
+  CHECK(data_->contexts.empty())
+      << "There are still Contexts in IRBuilder that has not been fully "
+         "converted. Please build a new IR with the new IRbuilder";
+  data_->result.Reset();
+  std::vector<IRBuilder>* st = IRBuilderStack();
+  st->push_back(*this);
+}
+
+void IRBuilder::ExitWithContext() {
+  std::vector<IRBuilder>* st = IRBuilderStack();
+  CHECK(!st->empty());
+  st->pop_back();
+}
+IRBuilder IRBuilder::CurrentIRBuilder() {
+  std::vector<IRBuilder>* st = IRBuilderStack();
+  CHECK(!st->empty()) << "No IRBuilder Found";
+  return st->back();
+}
+std::vector<IRBuilder>* IRBuilderStack() {
+  thread_local std::vector<IRBuilder> stack;
+  return &stack;
+}
+void LinkToParentContext(ir::Expr expr) {
+  IRBuilder ir_builder = IRBuilder::CurrentIRBuilder();
+  if (ir_builder.data_->contexts.empty()) {
+    ir_builder.data_->result = expr;
+  } else {
+    IRContext ir_context = ir_builder.data_->contexts.back();
+    ir_context.add_expr(expr);
+  }
+}
+
+}  // namespace pybind
+}  // namespace cinn
diff --git a/paddle/cinn/pybind/ir/ir_context.h b/paddle/cinn/pybind/ir/ir_context.h
new file mode 100644
index 0000000000000..c96c423bb071e
--- /dev/null
+++ b/paddle/cinn/pybind/ir/ir_context.h
@@ -0,0 +1,256 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include <vector>
+#include "paddle/cinn/common/object.h"
+#include "paddle/cinn/common/shared.h"
+#include "paddle/cinn/common/type.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/lowered_func.h"
+#include "paddle/cinn/utils/error.h"
+
+namespace cinn {
+namespace pybind {
+
+/**
+ * A base context that represents the CINN IR that need context information
+ */
+class IRContextNode : public common::Object {
+ public:
+  std::vector<ir::Expr> exprs;
+
+ public:
+  // Corresponds to the __enter__ method in python's context manager
+  virtual void EnterWithContext();
+  // Corresponds to the __exit__ method in python's context manager
+  virtual void ExitWithContext();
+  const char* type_info() const override { return __type_info__; }
+
+ public:
+  static constexpr char* __type_info__ = "IRContextNode";
+};
+
+/**
+ * The lifecycle of RAII resource management for IRContextNode
+ * is determined at the Python.
+ */
+class IRContext {
+ public:
+  IRContext() = default;
+  IRContext(const IRContext& other) = default;
+  explicit IRContext(IRContextNode* x) : data_(x) {}
+
+  const IRContextNode* get() const { return data_.get(); }
+  const IRContextNode* operator->() const { return data_.get(); }
+
+  void add_expr(Expr expr) { data_->exprs.push_back(expr); }
+
+ public:
+  common::Shared<IRContextNode> data_;
+
+ public:
+  template <typename TIRContextNode>
+  const TIRContextNode* As() const {
+    static_assert(std::is_base_of<IRContextNode, TIRContextNode>());
+    CHECK(data_.get()) << "IrContext holds null";
+    auto* ctx_node = data_.get()->safe_as<TIRContextNode>();
+    if (!ctx_node) {
+      std::stringstream err_msg;
+      err_msg << "TypeConvertError: convert " << data_.get()->type_info()
+              << " to " << TIRContextNode::__type_info__;
+
+      CINN_THROW(err_msg.str());
+    }
+    return ctx_node;
+  }
+  template <typename TIRContextNode>
+  TIRContextNode* As() {
+    CHECK(data_.get()) << "IrContext holds null";
+    auto* ctx_node = data_.get()->safe_as<TIRContextNode>();
+    if (!ctx_node) {
+      LOG(FATAL) << "TypeConvertError: convert " << data_.get()->type_info()
+                 << " to " << TIRContextNode::__type_info__;
+    }
+    return ctx_node;
+  }
+};
+
+class ScheduleBlockContextNode : public IRContextNode {
+ public:
+  std::vector<Var> iter_vars;
+  // BufferRange(s) which is read in this schedule block, it is used to
+  // analyze, not a real computation expression. Must be AST DFS order.
+  std::vector<Expr> read_buffers;
+  // BufferRange(s) which is written in this schedule block, it is used to
+  // analyze, not a real computation expression. Must be AST DFS order.
+  std::vector<Expr> write_buffers;
+  // Additional attributes about this schedulable block,
+  // which take some auxiliary hints for future transformations.
+  std::map<std::string, ir::attr_t> attrs;
+  // values of the iter_vars
+  std::vector<Expr> iter_values;
+  std::string name;
+
+ public:
+  ScheduleBlockContextNode() = default;
+  explicit ScheduleBlockContextNode(std::string name) : name(name) {}
+  void ExitWithContext() final;
+  const char* type_info() const override { return __type_info__; }
+
+ public:
+  static constexpr const char* __type_info__ = "ScheduleBlockContextNode";
+};
+
+class ScheduleBlockContext : public IRContext {
+ public:
+  explicit ScheduleBlockContext(ScheduleBlockContextNode* x) : IRContext(x) {}
+};
+
+class ForContextNode : public IRContextNode {
+ public:
+  //! The loop variable.
+  Var loop_var;
+  //! The minimum value of the iteration.
+  Expr min;
+  //! The extent of the iteration.
+  Expr extent;
+
+ public:
+  void ExitWithContext() final;
+  const char* type_info() const override { return __type_info__; }
+
+ public:
+  static constexpr const char* __type_info__ = "ForContextNode";
+};
+
+class LowerFuncContextNode : public IRContextNode {
+ public:
+  //! The name of this function.
+  std::string name;
+  //! The Arguments used in the body of the function.
+  std::vector<ir::Argument> args;
+
+ public:
+  LowerFuncContextNode() = default;
+  explicit LowerFuncContextNode(std::string name) : name(name) {}
+  void ExitWithContext() final;
+  const char* type_info() const override { return __type_info__; }
+
+ public:
+  static constexpr const char* __type_info__ = "LowerFuncContextNode";
+};
+
+class IfContextNode : public IRContextNode {
+ public:
+  Expr condition;
+  Expr true_case;
+  Expr false_case;
+
+ public:
+  IfContextNode() = default;
+  explicit IfContextNode(Expr condition)
+      : condition(condition), true_case(Expr()), false_case(Expr()) {}
+  const char* type_info() const override { return __type_info__; }
+
+  void ExitWithContext() final;
+
+ public:
+  static constexpr const char* __type_info__ = "IfContextNode";
+};
+
+class ThenContextNode : public IRContextNode {
+ public:
+  ThenContextNode() = default;
+  const char* type_info() const override { return __type_info__; }
+
+  void ExitWithContext() final;
+
+ public:
+  static constexpr const char* __type_info__ = "ThenContextNode";
+};
+
+class ElseContextNode : public IRContextNode {
+ public:
+  ElseContextNode() = default;
+  const char* type_info() const override { return __type_info__; }
+  void ExitWithContext() final;
+
+ public:
+  static constexpr const char* __type_info__ = "ElseContextNode";
+};
+
+/**
+ * A stack used to store current IRContext
+ */
+class IRBuilderNode : public common::Object {
+ public:
+  std::vector<IRContext> contexts;
+  Expr result;
+  const char* type_info() const override { return __type_info__; }
+  Expr GetResult() const;
+  void Reset();
+
+  template <typename TIRContextNode>
+  IRContext GetLastContext() const;
+
+  template <typename TIRContextNode>
+  IRContext FindContext() const;
+
+ public:
+  static constexpr const char* __type_info__ = "IRBuilderNode";
+};
+
+/**
+ * The lifecycle of RAII resource management for IRBuilderNode
+ * is determined at the Python.
+ */
+class IRBuilder {
+ public:
+  IRBuilder();
+  void EnterWithContext();
+  void ExitWithContext();
+  static IRBuilder CurrentIRBuilder();
+
+ public:
+  common::Shared<IRBuilderNode> data_;
+};
+
+std::vector<IRBuilder>* IRBuilderStack();
+void LinkToParentContext(ir::Expr);
+
+template <typename TIRContextNode>
+IRContext IRBuilderNode::GetLastContext() const {
+  if (!(contexts.back().As<TIRContextNode>())) {
+    LOG(FATAL) << "TypeError: The last context is not "
+               << TIRContextNode::__type_info__;
+  }
+  return contexts.back();
+}
+
+template <typename TIRContextNode>
+IRContext IRBuilderNode::FindContext() const {
+  for (auto it = contexts.rbegin(); it != contexts.rend(); ++it) {
+    if (const TIRContextNode* p = it->As<TIRContextNode>()) {
+      return *it;
+    }
+  }
+  return IRContext();
+}
+
+}  // namespace pybind
+
+}  // namespace cinn
diff --git a/paddle/cinn/runtime/cinn_runtime.h b/paddle/cinn/runtime/cinn_runtime.h
old mode 100755
new mode 100644
index 39ed8cbe5ee09..17b5a400fd122
--- a/paddle/cinn/runtime/cinn_runtime.h
+++ b/paddle/cinn/runtime/cinn_runtime.h
@@ -128,7 +128,8 @@ typedef enum cinn_device_kind_t {
   cinn_unk_device = -1,    // Undefined device.
   cinn_x86_device = 0,     // X86 device
   cinn_opencl_device = 1,  // OpenCL device
-  cinn_arm_device = 2      // ARM device
+  cinn_arm_device = 2,     // ARM device
+  cinn_nvgpu_device = 3    // NVIDIA GPU device
 } cinn_device_kind_t;
 
 //! Help to tell where the buffer locates.

From 7e2e984961d3f61a379cadad0b0cdcbbb423c4be Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Fri, 22 Sep 2023 10:01:07 +0800
Subject: [PATCH 048/115] remove reduce_all attribute for reduce_sum (#57506)

* remove reduce_all attribute for reduce_sum

* update infermeta

* refine trt converter
---
 paddle/fluid/inference/tensorrt/convert/reduce_op.cc | 4 ++++
 paddle/phi/infermeta/unary.cc                        | 3 ++-
 python/paddle/tensor/math.py                         | 3 ++-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc
index 16221851ae045..70ace9510a389 100644
--- a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc
@@ -42,6 +42,10 @@ class ReduceOpConverter : public OpConverter {
         PADDLE_GET_CONST(std::vector<int32_t>, op_desc.GetAttr("dim"));
     bool reduce_all = PADDLE_GET_CONST(bool, op_desc.GetAttr("reduce_all"));
 
+    if (dim.size() == 0) {
+      reduce_all = true;
+    }
+
     nvinfer1::IReduceLayer* layer = nullptr;
     if (reduce_all) {
       uint32_t reduce_dim = 0;
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index e0df80157013e..6eaff66c58389 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -3219,7 +3219,8 @@ DDim ReduceInferDim(const MetaTensor& x,
       break;
     }
   }
-  reduce_all = reduce_all || full_dim;
+  bool empty_dim = axis.size() == 0;
+  reduce_all = reduce_all || full_dim || empty_dim;
 
   std::vector<int64_t> out_dim_vector;
   for (int i = 0; i < x_rank; ++i) {
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 56c553bce797e..ce359b732e2c0 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -1526,7 +1526,8 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None):
         return _C_ops.sum(x, axis, dtype, keepdim)
     else:
         reduce_all, axis = _get_reduce_axis_with_tensor(axis, x)
-        attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}
+
+        attrs = {'dim': axis, 'keep_dim': keepdim}
 
         if dtype_flag:
             attrs.update({'in_dtype': x.dtype, 'out_dtype': dtype})

From f092028ae19756143e780140bbb77c2938c96c3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?=
 <83450930+Liyulingyue@users.noreply.github.com>
Date: Fri, 22 Sep 2023 10:14:38 +0800
Subject: [PATCH 049/115] [CodeStyle][task 1] enable Ruff UP032 rule with .
 except `python/paddle/base` (#57409)

* update up032

* update up032

* Update api_gen.py

* Update api_gen.py

* Update sampcd_processor_utils.py
---
 paddle/phi/api/yaml/generator/api_gen.py      |   4 +-
 .../api/yaml/generator/backward_api_gen.py    |   4 +-
 paddle/phi/api/yaml/generator/dist_api_gen.py |   8 +-
 .../phi/api/yaml/generator/dist_bw_api_gen.py |   8 +-
 .../phi/api/yaml/generator/sparse_api_gen.py  |   4 +-
 .../api/yaml/generator/sparse_bw_api_gen.py   |   4 +-
 .../phi/api/yaml/generator/strings_api_gen.py |   4 +-
 .../generate_kernels.py                       |  14 +-
 pyproject.toml                                | 324 ------------------
 python/paddle/amp/accuracy_compare.py         |  20 +-
 python/paddle/audio/backends/init_backend.py  |   4 +-
 python/paddle/audio/backends/wave_backend.py  |   4 +-
 python/paddle/batch.py                        |   2 +-
 python/paddle/dataset/common.py               |   4 +-
 python/paddle/device/cuda/__init__.py         |   4 +-
 .../distributed/auto_parallel/interface.py    |  16 +-
 .../auto_parallel/static/completion.py        |   8 +-
 .../auto_parallel/static/converter.py         |  16 +-
 .../auto_parallel/static/cost/base_cost.py    |  12 +-
 .../auto_parallel/static/cost/tensor_cost.py  |   8 +-
 .../auto_parallel/static/cost_model.py        |   8 +-
 .../auto_parallel/static/dist_context.py      |   6 +-
 .../auto_parallel/static/dist_op.py           |   4 +-
 .../auto_parallel/static/dist_tensor.py       |   8 +-
 .../auto_parallel/static/engine.py            |  20 +-
 .../distributed/auto_parallel/static/graph.py |   4 +-
 .../dist_check_finite_and_unscale.py          |   4 +-
 .../static/operators/dist_default.py          |  28 +-
 .../static/operators/dist_embedding.py        |  16 +-
 .../static/operators/dist_matmul.py           | 142 +++-----
 .../static/operators/dist_pnorm.py            |  24 +-
 .../static/operators/dist_reduce_sum_p.py     |  12 +-
 .../static/operators/dist_reshape.py          |  36 +-
 .../operators/dist_update_loss_scaling.py     |   4 +-
 .../auto_parallel/static/partitioner.py       |  12 +-
 .../auto_parallel/static/reshard.py           |  23 +-
 .../auto_parallel/static/tuner/algorithms.py  |   4 +-
 .../static/tuner/optimization_tuner.py        |   4 +-
 .../auto_parallel/static/tuner/recorder.py    |   4 +-
 .../static/tuner/rule_based_tuner.py          |  12 +-
 .../static/tuner/tunable_variable.py          |   8 +-
 .../distributed/auto_parallel/static/utils.py |  62 ++--
 .../distributed/auto_parallel/strategy.py     |   4 +-
 python/paddle/distributed/cloud_utils.py      |  22 +-
 .../paddle/distributed/communication/group.py |  12 +-
 .../communication/stream/gather.py            |   4 +-
 .../fleet/base/orthogonal_strategy.py         |   8 +-
 .../distributed/fleet/base/role_maker.py      |   8 +-
 .../distributed/fleet/base/strategy_group.py  |   4 +-
 .../distributed/fleet/base/util_factory.py    |   8 +-
 .../paddle/distributed/fleet/cloud_utils.py   |  14 +-
 .../distributed/fleet/elastic/manager.py      |  28 +-
 python/paddle/distributed/fleet/launch.py     |   4 +-
 .../paddle/distributed/fleet/launch_utils.py  |  60 +---
 .../distributed/fleet/layers/mpu/mp_layers.py |  12 +-
 .../distributed/fleet/layers/mpu/mp_ops.py    |  28 +-
 .../dygraph_sharding_optimizer.py             |  12 +-
 .../meta_optimizers/meta_optimizer_base.py    |   8 +-
 .../meta_optimizers/sharding/fp16_helper.py   |   4 +-
 .../fleet/meta_optimizers/sharding/utils.py   |  36 +-
 .../sharding/weight_decay_helper.py           |   2 +-
 .../meta_optimizers/sharding_optimizer.py     |  12 +-
 .../parallel_layers/pp_layers.py              |  16 +-
 .../fleet/meta_parallel/pipeline_parallel.py  |   4 +-
 .../fleet/recompute/recompute_hybrid.py       |   4 +-
 .../fleet/runtime/parameter_server_runtime.py |   8 +-
 .../distributed/fleet/runtime/the_one_ps.py   |   8 +-
 python/paddle/distributed/fleet/utils/fs.py   |   8 +-
 .../fleet/utils/hybrid_parallel_inference.py  |  10 +-
 .../fleet/utils/pp_parallel_adaptor.py        |   6 +-
 .../fleet/utils/sequence_parallel_utils.py    |  12 +-
 .../fleet/utils/tensor_parallel_utils.py      |   8 +-
 .../distributed/launch/context/device.py      |   4 +-
 .../launch/controllers/collective.py          |   4 +-
 python/paddle/distributed/launch/job/pod.py   |   4 +-
 .../distributed/launch/plugins/__init__.py    |   4 +-
 python/paddle/distributed/parallel.py         |   8 +-
 .../distributed/passes/auto_parallel_amp.py   |  12 +-
 ...uto_parallel_data_parallel_optimization.py |  14 +-
 .../distributed/passes/auto_parallel_fp16.py  |  16 +-
 .../passes/auto_parallel_grad_clip.py         |   4 +-
 .../passes/auto_parallel_pipeline.py          |   2 +-
 .../passes/auto_parallel_sharding.py          |  13 +-
 .../passes/pipeline_scheduler_pass.py         |   4 +-
 python/paddle/distributed/ps/coordinator.py   |   8 +-
 python/paddle/distributed/ps/the_one_ps.py    |   6 +-
 python/paddle/distributed/ps/utils/public.py  |  12 +-
 .../paddle/distributed/utils/launch_utils.py  |  16 +-
 python/paddle/distributed/utils/nccl_utils.py |   4 +-
 python/paddle/fft.py                          |  40 +--
 python/paddle/framework/io.py                 |  24 +-
 python/paddle/framework/random.py             |   8 +-
 python/paddle/hapi/callbacks.py               |   4 +-
 python/paddle/hapi/dynamic_flops.py           |   4 +-
 python/paddle/hapi/hub.py                     |  16 +-
 python/paddle/hapi/model_summary.py           |   6 +-
 python/paddle/hapi/progressbar.py             |   4 +-
 python/paddle/incubate/asp/asp.py             |   6 +-
 .../incubate/asp/supported_layer_list.py      |   6 +-
 python/paddle/incubate/asp/utils.py           |   8 +-
 .../incubate/distributed/fleet/fleet_util.py  |  28 +-
 .../distributed_strategy.py                   |   4 +-
 .../fleet/parameter_server/ir/trainer_pass.py |  12 +-
 .../incubate/distributed/fleet/utils.py       |   8 +-
 .../incubate/nn/layer/fused_dropout_nd.py     |   4 +-
 .../incubate/nn/layer/fused_transformer.py    |  36 +-
 .../incubate/optimizer/functional/bfgs.py     |   4 +-
 .../incubate/optimizer/functional/lbfgs.py    |   4 +-
 .../incubate/optimizer/gradient_merge.py      |   4 +-
 python/paddle/incubate/optimizer/pipeline.py  |  32 +-
 python/paddle/incubate/optimizer/recompute.py |  20 +-
 python/paddle/incubate/passes/ir.py           |  19 +-
 python/paddle/io/dataloader/batch_sampler.py  |  16 +-
 python/paddle/io/dataloader/collate.py        |   2 +-
 .../paddle/io/dataloader/dataloader_iter.py   |  15 +-
 python/paddle/io/dataloader/flat.py           |   4 +-
 python/paddle/io/dataloader/sampler.py        |   4 +-
 python/paddle/io/dataloader/worker.py         |   4 +-
 python/paddle/io/reader.py                    |   4 +-
 python/paddle/jit/api.py                      |   4 +-
 .../paddle/jit/dy2static/base_transformer.py  |   8 +-
 .../jit/dy2static/basic_api_transformer.py    |   4 +-
 .../paddle/jit/dy2static/convert_operators.py |   2 +-
 .../jit/dy2static/decorator_transformer.py    |   4 +-
 python/paddle/jit/dy2static/error.py          |   8 +-
 python/paddle/jit/dy2static/function_spec.py  |   8 +-
 .../jit/dy2static/logical_transformer.py      |   8 +-
 python/paddle/jit/dy2static/origin_info.py    |   8 +-
 python/paddle/jit/dy2static/utils.py          |  28 +-
 .../jit/dy2static/variable_trans_func.py      |  12 +-
 python/paddle/metric/metrics.py               |  16 +-
 python/paddle/nn/functional/activation.py     |  10 +-
 python/paddle/nn/functional/common.py         |  30 +-
 python/paddle/nn/functional/conv.py           | 104 ++----
 python/paddle/nn/functional/extension.py      |   2 +-
 python/paddle/nn/functional/input.py          |   4 +-
 python/paddle/nn/functional/loss.py           |  56 ++-
 python/paddle/nn/functional/norm.py           |  10 +-
 python/paddle/nn/functional/pooling.py        |  12 +-
 python/paddle/nn/functional/vision.py         |  22 +-
 python/paddle/nn/initializer/initializer.py   |   4 +-
 python/paddle/nn/layer/activation.py          |  12 +-
 python/paddle/nn/layer/common.py              |  28 +-
 python/paddle/nn/layer/container.py           |   4 +-
 python/paddle/nn/layer/layers.py              |  14 +-
 python/paddle/nn/layer/loss.py                |   4 +-
 python/paddle/nn/layer/norm.py                |  22 +-
 python/paddle/nn/layer/pooling.py             |  12 +-
 python/paddle/nn/layer/rnn.py                 |  10 +-
 python/paddle/nn/layer/transformer.py         |  46 +--
 python/paddle/nn/layer/vision.py              |   6 +-
 python/paddle/nn/utils/spectral_norm_hook.py  |   4 +-
 python/paddle/nn/utils/weight_norm_hook.py    |   2 +-
 python/paddle/onnx/export.py                  |   4 +-
 python/paddle/optimizer/lr.py                 |  16 +-
 python/paddle/optimizer/optimizer.py          |  20 +-
 python/paddle/profiler/profiler.py            |  20 +-
 python/paddle/profiler/profiler_statistic.py  |   4 +-
 python/paddle/profiler/timer.py               |   5 +-
 python/paddle/signal.py                       |  16 +-
 python/paddle/sparse/creation.py              |   8 +-
 python/paddle/sparse/nn/functional/conv.py    |  28 +-
 python/paddle/sparse/unary.py                 |   4 +-
 python/paddle/static/amp/bf16/amp_utils.py    |  16 +-
 python/paddle/static/amp/fp16_utils.py        |   4 +-
 python/paddle/static/io.py                    |  20 +-
 python/paddle/static/nn/common.py             |  36 +-
 python/paddle/static/nn/control_flow.py       |  14 +-
 .../static/quantization/quantization_pass.py  |   4 +-
 python/paddle/tensor/linalg.py                |  30 +-
 python/paddle/tensor/manipulation.py          |  42 +--
 python/paddle/tensor/math.py                  |  12 +-
 python/paddle/tensor/to_string.py             |  12 +-
 .../utils/cpp_extension/cpp_extension.py      |   8 +-
 .../utils/cpp_extension/extension_utils.py    |  28 +-
 python/paddle/utils/deprecated.py             |   6 +-
 python/paddle/utils/dlpack.py                 |   4 +-
 python/paddle/utils/download.py               |  30 +-
 python/paddle/utils/install_check.py          |  16 +-
 python/paddle/utils/layers_utils.py           |   2 +-
 python/paddle/vision/datasets/cifar.py        |   4 +-
 python/paddle/vision/datasets/flowers.py      |   4 +-
 python/paddle/vision/datasets/mnist.py        |   4 +-
 python/paddle/vision/datasets/voc2012.py      |   4 +-
 python/paddle/vision/image.py                 |   8 +-
 python/paddle/vision/models/densenet.py       |   4 +-
 python/paddle/vision/models/mobilenetv3.py    |   4 +-
 python/paddle/vision/models/squeezenet.py     |   4 +-
 .../vision/transforms/functional_tensor.py    |   4 +-
 python/paddle/vision/transforms/transforms.py |  12 +-
 test/book/test_word2vec_book.py               |   4 +-
 test/cinn/op_mappers/op_mapper_test.py        |   8 +-
 test/cinn/passes/pass_test.py                 |   8 +-
 test/cinn/test_paddle_model_convertor.py      |   4 +-
 .../fleet/parallel_dygraph_se_resnext.py      |   4 +-
 .../fleet/test_parallel_dygraph_pp_adaptor.py |   8 +-
 .../test_multi_precision_fp16_train.py        |  12 +-
 .../api/full_ILSVRC2012_val_preprocess.py     |   4 +-
 .../cpp_extension/test_cpp_extension_setup.py |   4 +-
 .../test_mixed_extension_setup.py             |  16 +-
 test/cpp_extension/utils.py                   |   8 +-
 test/custom_kernel/test_custom_kernel_dot.py  |   8 +-
 test/custom_kernel/test_custom_kernel_load.py |   8 +-
 test/custom_op/test_context_pool.py           |   4 +-
 test/custom_op/test_custom_attrs_jit.py       |   4 +-
 test/custom_op/test_custom_cast_op_jit.py     |   4 +-
 test/custom_op/test_custom_concat.py          |   4 +-
 test/custom_op/test_custom_relu_op_jit.py     |  12 +-
 test/custom_op/test_custom_relu_op_setup.py   |   8 +-
 .../test_custom_relu_op_xpu_setup.py          |   4 +-
 test/custom_op/test_custom_simple_slice.py    |   4 +-
 test/custom_op/test_custom_tensor_operator.py |   4 +-
 test/custom_op/utils.py                       |   8 +-
 .../test_collective_process_group_xccl.py     |   4 +-
 test/custom_runtime/test_custom_cpu_plugin.py |   4 +-
 .../test_custom_cpu_profiler_plugin.py        |   4 +-
 .../test_custom_cpu_to_static.py              |   4 +-
 test/custom_runtime/test_custom_op_setup.py   |  24 +-
 .../distributed_passes/dist_pass_test_base.py |   4 +-
 test/dygraph_to_static/test_break_continue.py |   4 +-
 test/dygraph_to_static/test_build_strategy.py |  12 +-
 test/dygraph_to_static/test_cache_program.py  |   4 +-
 test/dygraph_to_static/test_cast.py           |  12 +-
 test/dygraph_to_static/test_container.py      |   4 +-
 test/dygraph_to_static/test_convert_call.py   |   4 +-
 test/dygraph_to_static/test_dict.py           |   8 +-
 test/dygraph_to_static/test_error.py          |  12 +-
 test/dygraph_to_static/test_fetch_feed.py     |   4 +-
 test/dygraph_to_static/test_lac.py            |   4 +-
 test/dygraph_to_static/test_layer_hook.py     |   4 +-
 test/dygraph_to_static/test_list.py           |   4 +-
 test/dygraph_to_static/test_logical.py        |   8 +-
 test/dygraph_to_static/test_lstm.py           |  12 +-
 test/dygraph_to_static/test_mnist.py          |   4 +-
 test/dygraph_to_static/test_mnist_amp.py      |   4 +-
 .../dygraph_to_static/test_mnist_pure_fp16.py |   4 +-
 test/dygraph_to_static/test_mobile_net.py     |   8 +-
 test/dygraph_to_static/test_pylayer.py        |   4 +-
 test/dygraph_to_static/test_resnet.py         |  24 +-
 test/dygraph_to_static/test_resnet_amp.py     |   8 +-
 .../test_resnet_pure_fp16.py                  |   8 +-
 test/dygraph_to_static/test_resnet_v2.py      |  24 +-
 test/dygraph_to_static/test_se_resnet.py      |   8 +-
 test/dygraph_to_static/test_seq2seq.py        |   8 +-
 test/dygraph_to_static/yolov3.py              |   4 +-
 test/fft/spectral_op_np.py                    |   4 +-
 .../test_trt_convert_multiclass_nms.py        |   4 +-
 .../test_trt_convert_multiclass_nms3.py       |   4 +-
 test/ir/inference/test_trt_pool3d_op.py       |  12 +-
 test/ir/inference/test_trt_pool_op.py         |   4 +-
 test/legacy_test/auto_parallel_autoconvert.py |  16 +-
 test/legacy_test/benchmark.py                 |   8 +-
 test/legacy_test/dist_fleet_ctr.py            |   8 +-
 test/legacy_test/dist_fleet_ctr_ps_gpu.py     |   4 +-
 test/legacy_test/dist_fleet_simnet_bow.py     |   4 +-
 .../dist_fleet_sparse_embedding_ctr.py        |   6 +-
 .../legacy_test/dist_fleet_sync_batch_norm.py |   8 +-
 test/legacy_test/dist_se_resnext.py           |   4 +-
 test/legacy_test/fleet_meta_optimizer_base.py |   4 +-
 test/legacy_test/gradient_checker.py          |   8 +-
 test/legacy_test/test_chunk_eval_op.py        |   6 +-
 test/legacy_test/test_detach.py               |   4 +-
 test/legacy_test/test_dist_base.py            |  13 +-
 test/legacy_test/test_dist_fleet_base.py      |   8 +-
 .../test_eager_deletion_delete_vars.py        |  10 +-
 test/legacy_test/test_fused_dropout_add_op.py |   4 +-
 .../legacy_test/test_generate_proposals_op.py |   4 +-
 test/legacy_test/test_generator_dataloader.py |   6 +-
 test/legacy_test/test_imperative_resnet.py    |   4 +-
 .../legacy_test/test_imperative_se_resnext.py |   4 +-
 test/legacy_test/test_inplace.py              |  28 +-
 test/legacy_test/test_layers.py               |   8 +-
 test/legacy_test/test_lstm_cudnn_op.py        |   2 +-
 test/legacy_test/test_multi_dot_op.py         |   4 +-
 ...cess_dataloader_iterable_dataset_static.py |   8 +-
 .../test_multiprocess_dataloader_static.py    |   8 +-
 test/legacy_test/test_ops_nms.py              |   4 +-
 test/legacy_test/test_pylayer_op.py           |   4 +-
 test/legacy_test/test_run.py                  |  10 +-
 test/legacy_test/test_sample_logits_op.py     |  12 +-
 test/legacy_test/test_signal.py               |  39 +--
 test/legacy_test/test_static_save_load.py     |   8 +-
 test/legacy_test/test_sync_batch_norm_op.py   |   8 +-
 test/legacy_test/test_translated_layer.py     |   8 +-
 test/legacy_test/test_tril_triu_op.py         |  12 +-
 test/legacy_test/test_variable.py             |  32 +-
 .../test_view_op_reuse_allocation.py          |   4 +-
 ...st_onnx_format_quantization_mobilenetv1.py |   4 +-
 test/ps/static_gpubox_trainer.py              |   4 +-
 ...t2_int8_image_classification_comparison.py |  14 +-
 test/quantization/quant2_int8_lstm_model.py   |  12 +-
 ...nt_int8_image_classification_comparison.py |  12 +-
 test/quantization/test_imperative_ptq.py      |   4 +-
 test/quantization/test_imperative_qat_amp.py  |   4 +-
 ...t_post_training_quantization_lstm_model.py |  18 +-
 .../test_post_training_quantization_mnist.py  |  22 +-
 ..._post_training_quantization_mobilenetv1.py |   4 +-
 .../test_post_training_quantization_while.py  |  12 +-
 .../test_quant_post_quant_aware.py            |  12 +-
 .../test_weight_quantization_mobilenetv1.py   |   4 +-
 test/rnn/rnn_numpy.py                         |   6 +-
 test/tokenizer/bert_tokenizer.py              |   6 +-
 test/tokenizer/tokenizer_utils.py             |   8 +-
 test/xpu/test_generate_proposals_v2_op_xpu.py |   4 +-
 test/xpu/test_tril_triu_op_xpu.py             |   8 +-
 tools/analysisPyXml.py                        |  12 +-
 tools/check_op_benchmark_result.py            |   2 +-
 tools/check_op_desc.py                        |  46 +--
 tools/count_api_without_core_ops.py           |   8 +-
 tools/coverage/gcda_clean.py                  |   6 +-
 tools/coverage/python_coverage.py             |  12 +-
 tools/externalError/spider.py                 |   5 +-
 tools/get_single_test_cov.py                  |  12 +-
 tools/parse_kernel_info.py                    |   2 +-
 tools/print_signatures.py                     |   8 +-
 tools/sampcd_processor_utils.py               |  12 +-
 316 files changed, 1038 insertions(+), 2770 deletions(-)

diff --git a/paddle/phi/api/yaml/generator/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py
index 0c47c23276822..fcfcd17922759 100644
--- a/paddle/phi/api/yaml/generator/api_gen.py
+++ b/paddle/phi/api/yaml/generator/api_gen.py
@@ -305,9 +305,7 @@ def gene_output(
                         )
         else:
             raise ValueError(
-                "{} : Output error: the output should not be empty.".format(
-                    self.api
-                )
+                f"{self.api} : Output error: the output should not be empty."
             )
 
         return kernel_output, output_names, output_create
diff --git a/paddle/phi/api/yaml/generator/backward_api_gen.py b/paddle/phi/api/yaml/generator/backward_api_gen.py
index 9347552dbb134..541a653f3473b 100644
--- a/paddle/phi/api/yaml/generator/backward_api_gen.py
+++ b/paddle/phi/api/yaml/generator/backward_api_gen.py
@@ -237,9 +237,7 @@ def gene_output(
 
         else:
             raise ValueError(
-                "{} : Output error: the output should not be empty.".format(
-                    self.api
-                )
+                f"{self.api} : Output error: the output should not be empty."
             )
 
         return kernel_output, output_names, output_create
diff --git a/paddle/phi/api/yaml/generator/dist_api_gen.py b/paddle/phi/api/yaml/generator/dist_api_gen.py
index c9885dec64c97..00189d880e67f 100644
--- a/paddle/phi/api/yaml/generator/dist_api_gen.py
+++ b/paddle/phi/api/yaml/generator/dist_api_gen.py
@@ -595,9 +595,7 @@ def generate_output_creation_code(self) -> str:
                         )
         else:
             raise ValueError(
-                "{} : Output error: the output should not be empty.".format(
-                    self.api
-                )
+                f"{self.api} : Output error: the output should not be empty."
             )
 
         return output_creation_code
@@ -1073,9 +1071,7 @@ def generate_reshard_partial_out_to_replicated_code(self) -> str:
                         self.vector_output_size_assertion_check()
             else:
                 raise ValueError(
-                    "{} : Output error: the output should not be empty.".format(
-                        self.api
-                    )
+                    f"{self.api} : Output error: the output should not be empty."
                 )
         else:
             reshard_p2r_code = (
diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py
index e5c89a2ad54e4..b29e186f06d38 100644
--- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py
+++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py
@@ -173,9 +173,7 @@ def generate_output_creation_code(self) -> str:
                     self.vector_output_size_assertion_check()
         else:
             raise ValueError(
-                "{} : Output error: the output should not be empty.".format(
-                    self.api
-                )
+                f"{self.api} : Output error: the output should not be empty."
             )
 
         return output_creation_code
@@ -249,9 +247,7 @@ def generate_reshard_output_code(self):
                         self.vector_output_size_assertion_check()
             else:
                 raise ValueError(
-                    "{} : Output error: the output should not be empty.".format(
-                        self.api
-                    )
+                    f"{self.api} : Output error: the output should not be empty."
                 )
         else:
             # do nothing
diff --git a/paddle/phi/api/yaml/generator/sparse_api_gen.py b/paddle/phi/api/yaml/generator/sparse_api_gen.py
index 9a017725d6888..172f6703c25bf 100644
--- a/paddle/phi/api/yaml/generator/sparse_api_gen.py
+++ b/paddle/phi/api/yaml/generator/sparse_api_gen.py
@@ -88,9 +88,7 @@ def gene_output(
 
         else:
             raise ValueError(
-                "{} : Output error: the output should not be empty.".format(
-                    self.api
-                )
+                f"{self.api} : Output error: the output should not be empty."
             )
 
         return kernel_output, output_names, output_create
diff --git a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py
index 064cf07d0dbf7..79f335e8c6050 100644
--- a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py
+++ b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py
@@ -98,9 +98,7 @@ def gene_output(
 
         else:
             raise ValueError(
-                "{} : Output error: the output should not be empty.".format(
-                    self.api
-                )
+                f"{self.api} : Output error: the output should not be empty."
             )
 
         return kernel_output, output_names, output_create
diff --git a/paddle/phi/api/yaml/generator/strings_api_gen.py b/paddle/phi/api/yaml/generator/strings_api_gen.py
index 4e66bd5f2fdc6..9948de2b40a53 100644
--- a/paddle/phi/api/yaml/generator/strings_api_gen.py
+++ b/paddle/phi/api/yaml/generator/strings_api_gen.py
@@ -107,9 +107,7 @@ def gene_output(
 
         else:
             raise ValueError(
-                "{} : Output error: the output should not be empty.".format(
-                    self.api
-                )
+                f"{self.api} : Output error: the output should not be empty."
             )
 
         return kernel_output, output_names, output_create
diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
index 26617ec900534..cbe4571c5d010 100644
--- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
+++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py
@@ -445,13 +445,13 @@ def write_decl_impl(
 
 
 def write_main_header(forward_impl, backward_impl):
-    main_header_content = '''
+    main_header_content = f'''
 #pragma once
 
-#ifdef {}
+#ifdef {ENABLE_MACRO}
 
-#include "{}"
-#include "{}"
+#include "{forward_impl}"
+#include "{backward_impl}"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
@@ -528,11 +528,7 @@ def write_main_header(forward_impl, backward_impl):
 #include "./cutlass_backward.h"
 
 #endif
-'''.format(
-        ENABLE_MACRO,
-        forward_impl,
-        backward_impl,
-    )
+'''
 
     path = Path(args.dst_path) / "autogen"
     os.makedirs(path, exist_ok=True)
diff --git a/pyproject.toml b/pyproject.toml
index eca2770cb1b4d..9b247f4a738a9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,17 +122,6 @@ ignore = [
     "PLC0414",
 ]
 
-# UP032
-
-"python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py" = ["UP032"]
-"python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py" = ["UP032"]
-"python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py" = ["UP032"]
-"python/paddle/incubate/optimizer/gradient_merge.py" = ["UP032"]
-"python/paddle/nn/functional/loss.py" = ["UP032"]
-"python/paddle/hapi/dynamic_flops.py" = [ "UP032"]
-"python/paddle/incubate/optimizer/pipeline.py" = ["UP032"]
-
-
 # B017
 "test/auto_parallel/spmd_rules/test_reshape_rule.py" = ["B017"]
 "test/dygraph_to_static/test_assert.py" = ["B017"]
@@ -143,316 +132,3 @@ ignore = [
 "test/legacy_test/test_eigvals_op.py" = ["B017"]
 "test/legacy_test/test_tensordot.py" = ["B017"]
 "test/legacy_test/test_top_k_v2_op.py" = ["B017"]
-
-# UP032
-"paddle/fluid/ir/dialect/op_generator/api_gen.py" = ["UP032"]
-"paddle/fluid/ir/dialect/op_generator/op_gen.py" = ["UP032"]
-"paddle/phi/api/yaml/generator/api_gen.py" = ["UP032"]
-"paddle/phi/api/yaml/generator/backward_api_gen.py" = ["UP032"]
-"paddle/phi/api/yaml/generator/dist_api_gen.py" = ["UP032"]
-"paddle/phi/api/yaml/generator/dist_bw_api_gen.py" = ["UP032"]
-"paddle/phi/api/yaml/generator/sparse_api_gen.py" = ["UP032"]
-"paddle/phi/api/yaml/generator/sparse_bw_api_gen.py" = ["UP032"]
-"paddle/phi/api/yaml/generator/strings_api_gen.py" = ["UP032"]
-"paddle/phi/kernels/fusion/cutlass/memory_efficient_attention/generate_kernels.py" = ["UP032"]
-"parse_build_time.py" = ["UP032"]
-"python/paddle/amp/accuracy_compare.py" = ["UP032"]
-"python/paddle/audio/backends/init_backend.py" = ["UP032"]
-"python/paddle/audio/backends/wave_backend.py" = ["UP032"]
-"python/paddle/batch.py" = ["UP032"]
-"python/paddle/dataset/common.py" = ["UP032"]
-"python/paddle/device/cuda/__init__.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/interface.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/completion.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/converter.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/cost/base_cost.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/cost/tensor_cost.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/cost_model.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/dist_context.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/dist_op.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/dist_tensor.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/engine.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/graph.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/operators/dist_check_finite_and_unscale.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/operators/dist_default.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/operators/dist_pnorm.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/operators/dist_update_loss_scaling.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/partitioner.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/reshard.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/tuner/algorithms.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/tuner/recorder.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/tuner/tunable_variable.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/static/utils.py" = ["UP032"]
-"python/paddle/distributed/auto_parallel/strategy.py" = ["UP032"]
-"python/paddle/distributed/cloud_utils.py" = ["UP032"]
-"python/paddle/distributed/communication/group.py" = ["UP032"]
-"python/paddle/distributed/communication/stream/gather.py" = ["UP032"]
-"python/paddle/distributed/fleet/base/orthogonal_strategy.py" = ["UP032"]
-"python/paddle/distributed/fleet/base/role_maker.py" = ["UP032"]
-"python/paddle/distributed/fleet/base/strategy_group.py" = ["UP032"]
-"python/paddle/distributed/fleet/base/util_factory.py" = ["UP032"]
-"python/paddle/distributed/fleet/cloud_utils.py" = ["UP032"]
-"python/paddle/distributed/fleet/elastic/manager.py" = ["UP032"]
-"python/paddle/distributed/fleet/launch.py" = ["UP032"]
-"python/paddle/distributed/fleet/launch_utils.py" = ["UP032"]
-"python/paddle/distributed/fleet/layers/mpu/mp_layers.py" = ["UP032"]
-"python/paddle/distributed/fleet/layers/mpu/mp_ops.py" = ["UP032"]
-"python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py" = ["UP032"]
-"python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py" = ["UP032"]
-"python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py" = ["UP032"]
-"python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py" = ["UP032"]
-"python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py" = ["UP032"]
-"python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py" = ["UP032"]
-"python/paddle/distributed/fleet/recompute/recompute_hybrid.py" = ["UP032"]
-"python/paddle/distributed/fleet/runtime/parameter_server_runtime.py" = ["UP032"]
-"python/paddle/distributed/fleet/runtime/the_one_ps.py" = ["UP032"]
-"python/paddle/distributed/fleet/utils/fs.py" = ["UP032"]
-"python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py" = ["UP032"]
-"python/paddle/distributed/fleet/utils/sequence_parallel_utils.py" = ["UP032"]
-"python/paddle/distributed/launch/context/device.py" = ["UP032"]
-"python/paddle/distributed/launch/controllers/collective.py" = ["UP032"]
-"python/paddle/distributed/launch/job/pod.py" = ["UP032"]
-"python/paddle/distributed/launch/plugins/__init__.py" = ["UP032"]
-"python/paddle/distributed/parallel.py" = ["UP032"]
-"python/paddle/distributed/passes/auto_parallel_amp.py" = ["UP032"]
-"python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py" = ["UP032"]
-"python/paddle/distributed/passes/auto_parallel_fp16.py" = ["UP032"]
-"python/paddle/distributed/passes/auto_parallel_grad_clip.py" = ["UP032"]
-"python/paddle/distributed/passes/auto_parallel_pipeline.py" = ["UP032"]
-"python/paddle/distributed/passes/auto_parallel_sharding.py" = ["UP032"]
-"python/paddle/distributed/passes/pipeline_scheduler_pass.py" = ["UP032"]
-"python/paddle/distributed/ps/coordinator.py" = ["UP032"]
-"python/paddle/distributed/ps/the_one_ps.py" = ["UP032"]
-"python/paddle/distributed/ps/utils/public.py" = ["UP032"]
-"python/paddle/distributed/utils/launch_utils.py" = ["UP032"]
-"python/paddle/distributed/utils/nccl_utils.py" = ["UP032"]
-"python/paddle/fft.py" = ["UP032"]
-"python/paddle/framework/io.py" = ["UP032"]
-"python/paddle/framework/random.py" = ["UP032"]
-"python/paddle/hapi/callbacks.py" = ["UP032"]
-"python/paddle/hapi/hub.py" = ["UP032"]
-"python/paddle/hapi/model_summary.py" = ["UP032"]
-"python/paddle/hapi/progressbar.py" = ["UP032"]
-"python/paddle/incubate/asp/asp.py" = ["UP032"]
-"python/paddle/incubate/asp/supported_layer_list.py" = ["UP032"]
-"python/paddle/incubate/asp/utils.py" = ["UP032"]
-"python/paddle/incubate/distributed/fleet/fleet_util.py" = ["UP032"]
-"python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/distributed_strategy.py" = ["UP032"]
-"python/paddle/incubate/distributed/fleet/parameter_server/ir/trainer_pass.py" = ["UP032"]
-"python/paddle/incubate/distributed/fleet/utils.py" = ["UP032"]
-"python/paddle/incubate/nn/layer/fused_dropout_nd.py" = ["UP032"]
-"python/paddle/incubate/nn/layer/fused_transformer.py" = ["UP032"]
-"python/paddle/incubate/optimizer/functional/bfgs.py" = ["UP032"]
-"python/paddle/incubate/optimizer/functional/lbfgs.py" = ["UP032"]
-"python/paddle/incubate/optimizer/recompute.py" = ["UP032"]
-"python/paddle/incubate/passes/ir.py" = ["UP032"]
-"python/paddle/io/dataloader/batch_sampler.py" = ["UP032"]
-"python/paddle/io/dataloader/collate.py" = ["UP032"]
-"python/paddle/io/dataloader/dataloader_iter.py" = ["UP032"]
-"python/paddle/io/dataloader/flat.py" = ["UP032"]
-"python/paddle/io/dataloader/sampler.py" = ["UP032"]
-"python/paddle/io/dataloader/worker.py" = ["UP032"]
-"python/paddle/io/reader.py" = ["UP032"]
-"python/paddle/jit/api.py" = ["UP032"]
-"python/paddle/jit/dy2static/base_transformer.py" = ["UP032"]
-"python/paddle/jit/dy2static/basic_api_transformer.py" = ["UP032"]
-"python/paddle/jit/dy2static/convert_operators.py" = ["UP032"]
-"python/paddle/jit/dy2static/decorator_transformer.py" = ["UP032"]
-"python/paddle/jit/dy2static/error.py" = ["UP032"]
-"python/paddle/jit/dy2static/function_spec.py" = ["UP032"]
-"python/paddle/jit/dy2static/logical_transformer.py" = ["UP032"]
-"python/paddle/jit/dy2static/origin_info.py" = ["UP032"]
-"python/paddle/jit/dy2static/utils.py" = ["UP032"]
-"python/paddle/jit/dy2static/variable_trans_func.py" = ["UP032"]
-"python/paddle/metric/metrics.py" = ["UP032"]
-"python/paddle/nn/functional/activation.py" = ["UP032"]
-"python/paddle/nn/functional/common.py" = ["UP032"]
-"python/paddle/nn/functional/conv.py" = ["UP032"]
-"python/paddle/nn/functional/extension.py" = ["UP032"]
-"python/paddle/nn/functional/input.py" = ["UP032"]
-"python/paddle/nn/functional/norm.py" = ["UP032"]
-"python/paddle/nn/functional/pooling.py" = ["UP032"]
-"python/paddle/nn/functional/vision.py" = ["UP032"]
-"python/paddle/nn/initializer/initializer.py" = ["UP032"]
-"python/paddle/nn/layer/activation.py" = ["UP032"]
-"python/paddle/nn/layer/common.py" = ["UP032"]
-"python/paddle/nn/layer/container.py" = ["UP032"]
-"python/paddle/nn/layer/layers.py" = ["UP032"]
-"python/paddle/nn/layer/loss.py" = ["UP032"]
-"python/paddle/nn/layer/norm.py" = ["UP032"]
-"python/paddle/nn/layer/pooling.py" = ["UP032"]
-"python/paddle/nn/layer/rnn.py" = ["UP032"]
-"python/paddle/nn/layer/transformer.py" = ["UP032"]
-"python/paddle/nn/layer/vision.py" = ["UP032"]
-"python/paddle/nn/utils/spectral_norm_hook.py" = ["UP032"]
-"python/paddle/nn/utils/weight_norm_hook.py" = ["UP032"]
-"python/paddle/onnx/export.py" = ["UP032"]
-"python/paddle/optimizer/lr.py" = ["UP032"]
-"python/paddle/optimizer/optimizer.py" = ["UP032"]
-"python/paddle/profiler/profiler.py" = ["UP032"]
-"python/paddle/profiler/profiler_statistic.py" = ["UP032"]
-"python/paddle/profiler/timer.py" = ["UP032"]
-"python/paddle/signal.py" = ["UP032"]
-"python/paddle/sparse/creation.py" = ["UP032"]
-"python/paddle/sparse/nn/functional/conv.py" = ["UP032"]
-"python/paddle/sparse/unary.py" = ["UP032"]
-"python/paddle/static/amp/bf16/amp_utils.py" = ["UP032"]
-"python/paddle/static/amp/fp16_utils.py" = ["UP032"]
-"python/paddle/static/io.py" = ["UP032"]
-"python/paddle/static/nn/common.py" = ["UP032"]
-"python/paddle/static/nn/control_flow.py" = ["UP032"]
-"python/paddle/static/quantization/quantization_pass.py" = ["UP032"]
-"python/paddle/tensor/linalg.py" = ["UP032"]
-"python/paddle/tensor/manipulation.py" = ["UP032"]
-"python/paddle/tensor/math.py" = ["UP032"]
-"python/paddle/tensor/to_string.py" = ["UP032"]
-"python/paddle/utils/cpp_extension/cpp_extension.py" = ["UP032"]
-"python/paddle/utils/cpp_extension/extension_utils.py" = ["UP032"]
-"python/paddle/utils/deprecated.py" = ["UP032"]
-"python/paddle/utils/dlpack.py" = ["UP032"]
-"python/paddle/utils/download.py" = ["UP032"]
-"python/paddle/utils/install_check.py" = ["UP032"]
-"python/paddle/utils/layers_utils.py" = ["UP032"]
-"python/paddle/vision/datasets/cifar.py" = ["UP032"]
-"python/paddle/vision/datasets/flowers.py" = ["UP032"]
-"python/paddle/vision/datasets/mnist.py" = ["UP032"]
-"python/paddle/vision/datasets/voc2012.py" = ["UP032"]
-"python/paddle/vision/image.py" = ["UP032"]
-"python/paddle/vision/models/densenet.py" = ["UP032"]
-"python/paddle/vision/models/mobilenetv3.py" = ["UP032"]
-"python/paddle/vision/models/squeezenet.py" = ["UP032"]
-"python/paddle/vision/transforms/functional_tensor.py" = ["UP032"]
-"python/paddle/vision/transforms/transforms.py" = ["UP032"]
-"test/book/test_word2vec_book.py" = ["UP032"]
-"test/cinn/op_mappers/op_mapper_test.py" = ["UP032"]
-"test/cinn/passes/pass_test.py" = ["UP032"]
-"test/cinn/test_paddle_model_convertor.py" = ["UP032"]
-"test/collective/fleet/parallel_dygraph_se_resnext.py" = ["UP032"]
-"test/collective/fleet/test_parallel_dygraph_pp_adaptor.py" = ["UP032"]
-"test/contrib/test_multi_precision_fp16_train.py" = ["UP032"]
-"test/cpp/inference/api/full_ILSVRC2012_val_preprocess.py" = ["UP032"]
-"test/cpp_extension/test_cpp_extension_setup.py" = ["UP032"]
-"test/cpp_extension/test_mixed_extension_setup.py" = ["UP032"]
-"test/cpp_extension/utils.py" = ["UP032"]
-"test/custom_kernel/test_custom_kernel_dot.py" = ["UP032"]
-"test/custom_op/test_context_pool.py" = ["UP032"]
-"test/custom_op/test_custom_attrs_jit.py" = ["UP032"]
-"test/custom_op/test_custom_cast_op_jit.py" = ["UP032"]
-"test/custom_op/test_custom_concat.py" = ["UP032"]
-"test/custom_op/test_custom_relu_op_jit.py" = ["UP032"]
-"test/custom_op/test_custom_relu_op_setup.py" = ["UP032"]
-"test/custom_op/test_custom_relu_op_xpu_setup.py" = ["UP032"]
-"test/custom_op/test_custom_simple_slice.py" = ["UP032"]
-"test/custom_op/test_custom_tensor_operator.py" = ["UP032"]
-"test/custom_op/utils.py" = ["UP032"]
-"test/custom_runtime/test_collective_process_group_xccl.py" = ["UP032"]
-"test/custom_runtime/test_custom_cpu_plugin.py" = ["UP032"]
-"test/custom_runtime/test_custom_cpu_profiler_plugin.py" = ["UP032"]
-"test/custom_runtime/test_custom_cpu_to_static.py" = ["UP032"]
-"test/custom_runtime/test_custom_op_setup.py" = ["UP032"]
-"test/distributed_passes/dist_pass_test_base.py" = ["UP032"]
-"test/dygraph_to_static/test_break_continue.py" = ["UP032"]
-"test/dygraph_to_static/test_build_strategy.py" = ["UP032"]
-"test/dygraph_to_static/test_cache_program.py" = ["UP032"]
-"test/dygraph_to_static/test_cast.py" = ["UP032"]
-"test/dygraph_to_static/test_container.py" = ["UP032"]
-"test/dygraph_to_static/test_convert_call.py" = ["UP032"]
-"test/dygraph_to_static/test_dict.py" = ["UP032"]
-"test/dygraph_to_static/test_error.py" = ["UP032"]
-"test/dygraph_to_static/test_fetch_feed.py" = ["UP032"]
-"test/dygraph_to_static/test_lac.py" = ["UP032"]
-"test/dygraph_to_static/test_layer_hook.py" = ["UP032"]
-"test/dygraph_to_static/test_list.py" = ["UP032"]
-"test/dygraph_to_static/test_logical.py" = ["UP032"]
-"test/dygraph_to_static/test_lstm.py" = ["UP032"]
-"test/dygraph_to_static/test_mnist.py" = ["UP032"]
-"test/dygraph_to_static/test_mnist_amp.py" = ["UP032"]
-"test/dygraph_to_static/test_mnist_pure_fp16.py" = ["UP032"]
-"test/dygraph_to_static/test_mobile_net.py" = ["UP032"]
-"test/dygraph_to_static/test_resnet.py" = ["UP032"]
-"test/dygraph_to_static/test_resnet_amp.py" = ["UP032"]
-"test/dygraph_to_static/test_resnet_pure_fp16.py" = ["UP032"]
-"test/dygraph_to_static/test_resnet_v2.py" = ["UP032"]
-"test/dygraph_to_static/test_se_resnet.py" = ["UP032"]
-"test/dygraph_to_static/test_seq2seq.py" = ["UP032"]
-"test/dygraph_to_static/yolov3.py" = ["UP032"]
-"test/fft/spectral_op_np.py" = ["UP032"]
-"test/ir/inference/test_trt_convert_multiclass_nms.py" = ["UP032"]
-"test/ir/inference/test_trt_convert_multiclass_nms3.py" = ["UP032"]
-"test/ir/inference/test_trt_pool3d_op.py" = ["UP032"]
-"test/ir/inference/test_trt_pool_op.py" = ["UP032"]
-"test/legacy_test/auto_parallel_autoconvert.py" = ["UP032"]
-"test/legacy_test/benchmark.py" = ["UP032"]
-"test/legacy_test/dist_fleet_ctr.py" = ["UP032"]
-"test/legacy_test/dist_fleet_ctr_ps_gpu.py" = ["UP032"]
-"test/legacy_test/dist_fleet_simnet_bow.py" = ["UP032"]
-"test/legacy_test/dist_fleet_sparse_embedding_ctr.py" = ["UP032"]
-"test/legacy_test/dist_fleet_sync_batch_norm.py" = ["UP032"]
-"test/legacy_test/dist_se_resnext.py" = ["UP032"]
-"test/legacy_test/eager_op_test.py" = ["UP032"]
-"test/legacy_test/fleet_meta_optimizer_base.py" = ["UP032"]
-"test/legacy_test/gradient_checker.py" = ["UP032"]
-"test/legacy_test/test_chunk_eval_op.py" = ["UP032"]
-"test/legacy_test/test_detach.py" = ["UP032"]
-"test/legacy_test/test_dist_base.py" = ["UP032"]
-"test/legacy_test/test_dist_fleet_base.py" = ["UP032"]
-"test/legacy_test/test_eager_deletion_delete_vars.py" = ["UP032"]
-"test/legacy_test/test_fused_dropout_add_op.py" = ["UP032"]
-"test/legacy_test/test_generate_proposals_op.py" = ["UP032"]
-"test/legacy_test/test_generator_dataloader.py" = ["UP032"]
-"test/legacy_test/test_imperative_resnet.py" = ["UP032"]
-"test/legacy_test/test_imperative_se_resnext.py" = ["UP032"]
-"test/legacy_test/test_inplace.py" = ["UP032"]
-"test/legacy_test/test_layers.py" = ["UP032"]
-"test/legacy_test/test_lstm_cudnn_op.py" = ["UP032"]
-"test/legacy_test/test_multi_dot_op.py" = ["UP032"]
-"test/legacy_test/test_multiprocess_dataloader_iterable_dataset_static.py" = ["UP032"]
-"test/legacy_test/test_multiprocess_dataloader_static.py" = ["UP032"]
-"test/legacy_test/test_ops_nms.py" = ["UP032"]
-"test/legacy_test/test_pylayer_op.py" = ["UP032"]
-"test/legacy_test/test_run.py" = ["UP032"]
-"test/legacy_test/test_sample_logits_op.py" = ["UP032"]
-"test/legacy_test/test_signal.py" = ["UP032"]
-"test/legacy_test/test_static_save_load.py" = ["UP032"]
-"test/legacy_test/test_sync_batch_norm_op.py" = ["UP032"]
-"test/legacy_test/test_translated_layer.py" = ["UP032"]
-"test/legacy_test/test_tril_triu_op.py" = ["UP032"]
-"test/legacy_test/test_variable.py" = ["UP032"]
-"test/legacy_test/test_view_op_reuse_allocation.py" = ["UP032"]
-"test/mkldnn/test_onnx_format_quantization_mobilenetv1.py" = ["UP032"]
-"test/ps/static_gpubox_trainer.py" = ["UP032"]
-"test/quantization/quant2_int8_image_classification_comparison.py" = ["UP032"]
-"test/quantization/quant2_int8_lstm_model.py" = ["UP032"]
-"test/quantization/quant_int8_image_classification_comparison.py" = ["UP032"]
-"test/quantization/test_imperative_ptq.py" = ["UP032"]
-"test/quantization/test_imperative_qat_amp.py" = ["UP032"]
-"test/quantization/test_post_training_quantization_lstm_model.py" = ["UP032"]
-"test/quantization/test_post_training_quantization_mnist.py" = ["UP032"]
-"test/quantization/test_post_training_quantization_mobilenetv1.py" = ["UP032"]
-"test/quantization/test_post_training_quantization_while.py" = ["UP032"]
-"test/quantization/test_quant_post_quant_aware.py" = ["UP032"]
-"test/quantization/test_weight_quantization_mobilenetv1.py" = ["UP032"]
-"test/rnn/rnn_numpy.py" = ["UP032"]
-"test/tokenizer/bert_tokenizer.py" = ["UP032"]
-"test/tokenizer/tokenizer_utils.py" = ["UP032"]
-"test/xpu/test_generate_proposals_v2_op_xpu.py" = ["UP032"]
-"test/xpu/test_tril_triu_op_xpu.py" = ["UP032"]
-"tools/analysisPyXml.py" = ["UP032"]
-"tools/check_op_benchmark_result.py" = ["UP032"]
-"tools/check_op_desc.py" = ["UP032"]
-"tools/count_api_without_core_ops.py" = ["UP032"]
-"tools/coverage/gcda_clean.py" = ["UP032"]
-"tools/coverage/python_coverage.py" = ["UP032"]
-"tools/externalError/spider.py" = ["UP032"]
-"tools/get_single_test_cov.py" = ["UP032"]
-"tools/parse_kernel_info.py" = ["UP032"]
-"tools/print_signatures.py" = ["UP032"]
-"tools/sampcd_processor_utils.py" = ["UP032"]
-"test/custom_kernel/test_custom_kernel_load.py" = ["UP032"]
-"python/paddle/distributed/fleet/utils/tensor_parallel_utils.py" = ["UP032"]
diff --git a/python/paddle/amp/accuracy_compare.py b/python/paddle/amp/accuracy_compare.py
index 761f088c99f8f..52a4c4c2ef85d 100644
--- a/python/paddle/amp/accuracy_compare.py
+++ b/python/paddle/amp/accuracy_compare.py
@@ -705,35 +705,23 @@ def compare_accuracy(
     )
 
     for filename in sorted(workerlog_filenames):
-        print(
-            "-- [Step 1/4] Parsing FP32 logs under {}/{}".format(
-                dump_path, filename
-            )
-        )
+        print(f"-- [Step 1/4] Parsing FP32 logs under {dump_path}/{filename}")
         fp32_tensor_info_list, fp32_has_tensor_name = parse_log(
             dump_path, filename, None
         )
         print(
-            "-- [Step 2/4] Parsing FP16 logs under {}/{}".format(
-                another_dump_path, filename
-            )
+            f"-- [Step 2/4] Parsing FP16 logs under {another_dump_path}/{filename}"
         )
         fp16_tensor_info_list, fp16_has_tensor_name = parse_log(
             another_dump_path, filename, None
         )
 
-        print(
-            "-- [Step 3/4] Merge FP32 and FP16 tensor info for {}".format(
-                filename
-            )
-        )
+        print(f"-- [Step 3/4] Merge FP32 and FP16 tensor info for {filename}")
         mp_tensor_info_list = merge_tensor_info_list(
             fp32_tensor_info_list, fp16_tensor_info_list, grad_scale
         )
         print(
-            "-- [Step 4/4] Add worksheet for mixed precision tensor info of {}".format(
-                filename
-            )
+            f"-- [Step 4/4] Add worksheet for mixed precision tensor info of {filename}"
         )
         excel_writer.add_worksheet(
             mp_tensor_info_list,
diff --git a/python/paddle/audio/backends/init_backend.py b/python/paddle/audio/backends/init_backend.py
index e9793bcd9736c..12e3a0d84c9e3 100644
--- a/python/paddle/audio/backends/init_backend.py
+++ b/python/paddle/audio/backends/init_backend.py
@@ -83,9 +83,9 @@ def list_available_backends() -> List[str]:
         version = paddleaudio.__version__
         if not _check_version(version):
             err_msg = (
-                "the version of paddleaudio installed is {},\n"
+                f"the version of paddleaudio installed is {version},\n"
                 "please ensure the paddleaudio >= 1.0.2."
-            ).format(version)
+            )
             raise ImportError(err_msg)
         backends = paddleaudio.backends.list_audio_backends()
     backends.append("wave_backend")
diff --git a/python/paddle/audio/backends/wave_backend.py b/python/paddle/audio/backends/wave_backend.py
index 1dcd48e1917af..262ccafeb304a 100644
--- a/python/paddle/audio/backends/wave_backend.py
+++ b/python/paddle/audio/backends/wave_backend.py
@@ -28,9 +28,9 @@ def _error_message():
     warn_msg = (
         "only PCM16 WAV supportted. \n"
         "if want support more other audio types, please "
-        "manually installed (usually with `pip install {}`). \n "
+        f"manually installed (usually with `pip install {package}`). \n "
         "and use paddle.audio.backends.set_backend('soundfile') to set audio backend"
-    ).format(package)
+    )
     return warn_msg
 
 
diff --git a/python/paddle/batch.py b/python/paddle/batch.py
index 788e413fa96c9..98e5a6a14545a 100644
--- a/python/paddle/batch.py
+++ b/python/paddle/batch.py
@@ -67,7 +67,7 @@ def batch_reader():
     if batch_size <= 0:
         raise ValueError(
             "batch_size should be a positive integer value, "
-            "but got batch_size={}".format(batch_size)
+            f"but got batch_size={batch_size}"
         )
 
     return batch_reader
diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py
index 6b51b27c66712..4695b633ffa0f 100644
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -91,9 +91,7 @@ def download(url, module_name, md5sum, save_name=None):
             retry += 1
         else:
             raise RuntimeError(
-                "Cannot download {} within retry limit {}".format(
-                    url, retry_limit
-                )
+                f"Cannot download {url} within retry limit {retry_limit}"
             )
         sys.stderr.write(
             f"Cache file {filename} not found, downloading {url} \n"
diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py
index ba4b4eedba12a..cb57e674e2017 100644
--- a/python/paddle/device/cuda/__init__.py
+++ b/python/paddle/device/cuda/__init__.py
@@ -476,9 +476,9 @@ def get_device_properties(device=None):
                 )
         else:
             raise ValueError(
-                "The device type {} is not expected. Because paddle.device.cuda."
+                f"The device type {device} is not expected. Because paddle.device.cuda."
                 "get_device_properties only support int, str or paddle.CUDAPlace. "
-                "Please input appropriate device again!".format(device)
+                "Please input appropriate device again!"
             )
     else:
         device_id = -1
diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py
index ba8003dcd86fd..c8ab91a7346f0 100644
--- a/python/paddle/distributed/auto_parallel/interface.py
+++ b/python/paddle/distributed/auto_parallel/interface.py
@@ -70,9 +70,7 @@ def shard_tensor(x, process_mesh=None, shard_spec=None):
     if process_mesh is not None:
         assert isinstance(
             process_mesh, core.ProcessMesh
-        ), "Argument process_mesh {} is not an instance of ProcessMesh".format(
-            process_mesh
-        )
+        ), f"Argument process_mesh {process_mesh} is not an instance of ProcessMesh"
     else:
         process_mesh = get_current_process_mesh()
         assert (
@@ -163,9 +161,7 @@ def shard_op(op, process_mesh=None, in_shard_specs=None, out_shard_specs=None):
     if process_mesh is not None:
         assert isinstance(
             process_mesh, ProcessMesh
-        ), "Argument process_mesh {} is not an instance of ProcessMesh".format(
-            process_mesh
-        )
+        ), f"Argument process_mesh {process_mesh} is not an instance of ProcessMesh"
     else:
         process_mesh = get_current_process_mesh()
         assert (
@@ -176,9 +172,7 @@ def shard_op(op, process_mesh=None, in_shard_specs=None, out_shard_specs=None):
         assert all(
             (isinstance(shard_spec, list) or shard_spec is None)
             for shard_spec in in_shard_specs
-        ), "in_shard_spec {} is not a list of list or None".format(
-            in_shard_specs
-        )
+        ), f"in_shard_spec {in_shard_specs} is not a list of list or None"
         for shard_spec in in_shard_specs:
             if shard_spec is not None:
                 in_dims_mappings.append(
@@ -191,9 +185,7 @@ def shard_op(op, process_mesh=None, in_shard_specs=None, out_shard_specs=None):
         assert all(
             (isinstance(shard_spec, list) or shard_spec is None)
             for shard_spec in out_shard_specs
-        ), "out_shard_spec {} is not a list of list or None".format(
-            out_shard_specs
-        )
+        ), f"out_shard_spec {out_shard_specs} is not a list of list or None"
         for shard_spec in out_shard_specs:
             if shard_spec is not None:
                 out_dims_mappings.append(
diff --git a/python/paddle/distributed/auto_parallel/static/completion.py b/python/paddle/distributed/auto_parallel/static/completion.py
index fc7646fa3b5f3..baf183f55bbae 100644
--- a/python/paddle/distributed/auto_parallel/static/completion.py
+++ b/python/paddle/distributed/auto_parallel/static/completion.py
@@ -1211,9 +1211,7 @@ def _get_op_by_id(ops, id):
                     output_name = grad_op.output_arg_names[0]
                     assert (
                         output_name in grad_var_to_var[appended_grad_times]
-                    ), "sum op's output '{}' has no corresponding var".format(
-                        output_name
-                    )
+                    ), f"sum op's output '{output_name}' has no corresponding var"
                     ref_fwd_var_name = grad_var_to_var[appended_grad_times][
                         output_name
                     ]
@@ -1513,9 +1511,7 @@ def _get_op_by_id(ops, id):
                     output_name = grad_op.output_arg_names[0]
                     assert (
                         output_name in grad_var_to_var
-                    ), "sum op's output '{}' has no corresponding var".format(
-                        output_name
-                    )
+                    ), f"sum op's output '{output_name}' has no corresponding var"
                     ref_fwd_var_name = grad_var_to_var[output_name]
                     ref_fwd_var = vars[ref_fwd_var_name]
                     ref_fwd_dist_attr = (
diff --git a/python/paddle/distributed/auto_parallel/static/converter.py b/python/paddle/distributed/auto_parallel/static/converter.py
index 710dfb43e877b..3c22f14b01a60 100644
--- a/python/paddle/distributed/auto_parallel/static/converter.py
+++ b/python/paddle/distributed/auto_parallel/static/converter.py
@@ -69,7 +69,7 @@ def _check_pre_strategy(self, pre_strategy):
         if not isinstance(pre_strategy, dict):
             raise TypeError(
                 "The type of 'pre_strategy' should be 'dict', "
-                "but got '{}'.".format(str(type(pre_strategy)))
+                f"but got '{str(type(pre_strategy))}'."
             )
         return pre_strategy
 
@@ -82,7 +82,7 @@ def _check_cur_strategy(self, cur_strategy):
         if not isinstance(cur_strategy, dict):
             raise TypeError(
                 "The type of 'cur_strategy' should be 'dict', "
-                "but got '{}'.".format(str(type(cur_strategy)))
+                f"but got '{str(type(cur_strategy))}'."
             )
         return cur_strategy
 
@@ -229,9 +229,7 @@ def convert_with_prefix_match(
                                 + str(err)
                             )
                         self._logger.info(
-                            "tensor [{}] is matched with tensor [{}]".format(
-                                cur_name, pre_name
-                            )
+                            f"tensor [{cur_name}] is matched with tensor [{pre_name}]"
                         )
                         tensor_match_with_pre.append(cur_name)
                         tensor_match_with_cur.append(pre_name)
@@ -309,9 +307,7 @@ def merge_with_dist_attr(tensor_list, dist_attr):
 
         if len(partition_tensor_list) != 1:
             raise ValueError(
-                "Fail to merge tensor with dist_attr '{}'.".format(
-                    str(dist_attr)
-                )
+                f"Fail to merge tensor with dist_attr '{str(dist_attr)}'."
             )
         complete_tensor = partition_tensor_list[0][0]
         return complete_tensor
@@ -336,9 +332,7 @@ def slice_with_dist_attr(tensor, dist_attr):
         )
         if sliced_tensor_index not in range(len(sliced_tensor_list)):
             raise ValueError(
-                "Fail to slice tensor with dist_attr '{}'.".format(
-                    str(dist_attr)
-                )
+                f"Fail to slice tensor with dist_attr '{str(dist_attr)}'."
             )
         sliced_tensor = sliced_tensor_list[sliced_tensor_index]
         return sliced_tensor
diff --git a/python/paddle/distributed/auto_parallel/static/cost/base_cost.py b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py
index f89a03647cfcc..7b87b31865dc0 100644
--- a/python/paddle/distributed/auto_parallel/static/cost/base_cost.py
+++ b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py
@@ -846,9 +846,7 @@ def group_ranks(self):
                 process_group = get_process_group(ring_id)
                 if process_group is None:
                     raise ValueError(
-                        "There not exists process group whose ring_id is {}.".format(
-                            ring_id
-                        )
+                        f"There not exists process group whose ring_id is {ring_id}."
                     )
                 self._group_ranks = process_group.ranks
         return self._group_ranks
@@ -858,9 +856,7 @@ def _check_comm_op_type(cls):
         if cls.OP_TYPE != "COMM":
             if cls.OP_TYPE not in COMM_OP_TYPE:
                 raise TypeError(
-                    "Please Check op type in {}, but got {}.".format(
-                        COMM_OP_TYPE, cls.OP_TYPE
-                    )
+                    f"Please Check op type in {COMM_OP_TYPE}, but got {cls.OP_TYPE}."
                 )
 
 
@@ -931,9 +927,7 @@ def calc_time_by_cost_model(op, cluster=None):
     """Calc op time by cost model and the unit is microsecond."""
     if not isinstance(op, paddle.base.framework.Operator):
         raise TypeError(
-            "OP must be paddle.base.framework.Operator, but got {}.".format(
-                type(op)
-            )
+            f"OP must be paddle.base.framework.Operator, but got {type(op)}."
         )
     if not cluster:
         cluster = get_default_cluster()
diff --git a/python/paddle/distributed/auto_parallel/static/cost/tensor_cost.py b/python/paddle/distributed/auto_parallel/static/cost/tensor_cost.py
index 17d3b0476081a..38f7a007ceaa6 100644
--- a/python/paddle/distributed/auto_parallel/static/cost/tensor_cost.py
+++ b/python/paddle/distributed/auto_parallel/static/cost/tensor_cost.py
@@ -54,9 +54,7 @@ def _check_args(self, tensor, dist_tensor, shape, dtype):
 
             if not isinstance(tensor, Variable):
                 raise TypeError(
-                    "Please check tensor type is Variable, but got {}".format(
-                        type(tensor)
-                    )
+                    f"Please check tensor type is Variable, but got {type(tensor)}"
                 )
 
         elif dist_tensor is not None:
@@ -72,9 +70,7 @@ def _check_args(self, tensor, dist_tensor, shape, dtype):
             assert tensor is None and dist_tensor is None and dtype is not None
             if not isinstance(shape, (list, set)):
                 raise TypeError(
-                    "Please check shape type is list or set, but got {}".format(
-                        type(shape)
-                    )
+                    f"Please check shape type is list or set, but got {type(shape)}"
                 )
 
         elif dtype is not None:
diff --git a/python/paddle/distributed/auto_parallel/static/cost_model.py b/python/paddle/distributed/auto_parallel/static/cost_model.py
index b3e5bce8ef58f..55690e4f3de8f 100644
--- a/python/paddle/distributed/auto_parallel/static/cost_model.py
+++ b/python/paddle/distributed/auto_parallel/static/cost_model.py
@@ -435,9 +435,7 @@ def _merge_node(self, to_merge_node_list, merge_type='linear', nodes=None):
                 node_cost = max(node_cost, node.cost)
             else:
                 raise NotImplementedError(
-                    'This type of merging is not supported:{}'.format(
-                        merge_type
-                    )
+                    f'This type of merging is not supported:{merge_type}'
                 )
         merged_node_id = 'merged_' + str(len(nodes))
         is_bwd = to_merge_node_list[0].is_bwd
@@ -796,9 +794,7 @@ def _simulate_pipeline(self):
                 global_time[stid] = e.e_time
             else:
                 raise NotImplementedError(
-                    'This type of pipe event is not supported yet.{}'.format(
-                        e.name
-                    )
+                    f'This type of pipe event is not supported yet.{e.name}'
                 )
 
         for t in global_time:
diff --git a/python/paddle/distributed/auto_parallel/static/dist_context.py b/python/paddle/distributed/auto_parallel/static/dist_context.py
index 5eabdd312bbb7..58cf994c2b73b 100644
--- a/python/paddle/distributed/auto_parallel/static/dist_context.py
+++ b/python/paddle/distributed/auto_parallel/static/dist_context.py
@@ -1209,9 +1209,9 @@ def parse_forward_blocks(self, program):
         assert self.nblock >= 1
 
     def parse_backward_blocks(self, program):
-        assert 0 in self.forward_indices, "forward block idx are{}".format(
-            self.forward_indices
-        )
+        assert (
+            0 in self.forward_indices
+        ), f"forward block idx are{self.forward_indices}"
         self.backward_to_forward_index_map[0] = 0
 
         for idx, block in enumerate(program.blocks):
diff --git a/python/paddle/distributed/auto_parallel/static/dist_op.py b/python/paddle/distributed/auto_parallel/static/dist_op.py
index 8489d3f3332a6..a728b55697bfa 100644
--- a/python/paddle/distributed/auto_parallel/static/dist_op.py
+++ b/python/paddle/distributed/auto_parallel/static/dist_op.py
@@ -124,8 +124,8 @@ def __str__(self):
             annotated_str = "annotated"
         else:
             annotated_str = "non-annotated"
-        str += ", process_mesh ({}): {}".format(
-            annotated_str, self.dist_attr.process_mesh
+        str += (
+            f", process_mesh ({annotated_str}): {self.dist_attr.process_mesh}"
         )
 
         for arg_name in self.serial_op.desc.input_arg_names():
diff --git a/python/paddle/distributed/auto_parallel/static/dist_tensor.py b/python/paddle/distributed/auto_parallel/static/dist_tensor.py
index d44fa513f1a33..32a4f43434118 100644
--- a/python/paddle/distributed/auto_parallel/static/dist_tensor.py
+++ b/python/paddle/distributed/auto_parallel/static/dist_tensor.py
@@ -399,8 +399,8 @@ def __str__(self):
             annotated_str = "annotated"
         else:
             annotated_str = "non-annotated"
-        str += ", process_mesh ({}): {}".format(
-            annotated_str, self.dist_attr.process_mesh
+        str += (
+            f", process_mesh ({annotated_str}): {self.dist_attr.process_mesh}"
         )
 
         str += f", is_parameter: {self.serial_tensor.is_parameter}"
@@ -409,9 +409,7 @@ def __str__(self):
             annotated_str = "annotated"
         else:
             annotated_str = "non-annotated"
-        str += ", dims_mapping ({}): {} }}".format(
-            annotated_str, self.dist_attr.dims_mapping
-        )
+        str += f", dims_mapping ({annotated_str}): {self.dist_attr.dims_mapping} }}"
 
         # if self.dist_attr.is_annotated("shard_mask"):
         #     annotated_str = "annotated"
diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py
index 0354c6517a1a2..ac45256562440 100644
--- a/python/paddle/distributed/auto_parallel/static/engine.py
+++ b/python/paddle/distributed/auto_parallel/static/engine.py
@@ -158,9 +158,7 @@ def __init__(
         for metric in auto_utils.to_list(metrics):
             if metric and not isinstance(metric, Metric):
                 raise TypeError(
-                    "{} is not sub class of Metric".format(
-                        metric.__class__.__name__
-                    )
+                    f"{metric.__class__.__name__} is not sub class of Metric"
                 )
         self._metrics = auto_utils.to_list(metrics)
 
@@ -331,9 +329,7 @@ def _prepare_data_tensor(self, inputs_spec, labels_spec, inputs, labels):
         if inputs_spec:
             assert isinstance(
                 inputs_spec, list
-            ), "inputs should be list, but received {}".format(
-                type(inputs_spec)
-            )
+            ), f"inputs should be list, but received {type(inputs_spec)}"
             assert isinstance(
                 inputs, list
             ), f"inputs should be list, but received {type(inputs)}"
@@ -346,9 +342,7 @@ def _prepare_data_tensor(self, inputs_spec, labels_spec, inputs, labels):
         if labels_spec:
             assert isinstance(
                 labels_spec, list
-            ), "labels should be list, but received {}".format(
-                type(labels_spec)
-            )
+            ), f"labels should be list, but received {type(labels_spec)}"
             assert isinstance(
                 labels, list
             ), f"labels should be list, but received {type(labels)}"
@@ -457,9 +451,7 @@ def _prepare_feed(self, data, user_feeds, mode):
         if user_feeds is not None:
             assert isinstance(
                 user_feeds, dict
-            ), "user_feeds must be a dict, but receive {}".format(
-                type(user_feeds).__name__
-            )
+            ), f"user_feeds must be a dict, but receive {type(user_feeds).__name__}"
             for name, data in user_feeds.items():
                 feeds[name] = data
         return feeds
@@ -468,9 +460,7 @@ def _prepare_fetch(self, user_fetches, mode):
         if user_fetches is not None:
             assert isinstance(
                 user_fetches, list
-            ), "user_fetches must be a list, but receive {}".format(
-                type(user_fetches).__name__
-            )
+            ), f"user_fetches must be a list, but receive {type(user_fetches).__name__}"
         fetch_names = []
         fetch_indices = []
 
diff --git a/python/paddle/distributed/auto_parallel/static/graph.py b/python/paddle/distributed/auto_parallel/static/graph.py
index d4cace82585b3..5665294487e9c 100644
--- a/python/paddle/distributed/auto_parallel/static/graph.py
+++ b/python/paddle/distributed/auto_parallel/static/graph.py
@@ -84,9 +84,7 @@ def __contains__(self, attr_name):
 
     def __str__(self):
         str = ""
-        str += "(src_id: {}, tgt_id: {}, attrs: {})".format(
-            self.src_id, self.tgt_id, self._attrs
-        )
+        str += f"(src_id: {self.src_id}, tgt_id: {self.tgt_id}, attrs: {self._attrs})"
         return str
 
 
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_check_finite_and_unscale.py b/python/paddle/distributed/auto_parallel/static/operators/dist_check_finite_and_unscale.py
index a531b3bb8c660..1c490345ff1c6 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_check_finite_and_unscale.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_check_finite_and_unscale.py
@@ -85,9 +85,7 @@ def backward(ctx, *args, **kwargs):
         dist_attr = ctx.get_op_dist_attr_for_program(backward_op)
         assert (
             dist_attr is not None
-        ), "backward op [{}] don't have dist attribute !".format(
-            str(backward_op)
-        )
+        ), f"backward op [{str(backward_op)}] don't have dist attribute !"
 
         assert rank_id in dist_attr.process_mesh.process_ids
 
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_default.py b/python/paddle/distributed/auto_parallel/static/operators/dist_default.py
index 93833651dd185..04bca9c95ddbe 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_default.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_default.py
@@ -455,21 +455,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         # replicate op in dist program
         dist_op = main_block.append_op(type='nop')
@@ -575,28 +569,20 @@ def backward(ctx, *args, **kwargs):
         dist_attr = ctx.get_op_dist_attr_for_program(backward_op)
         assert (
             dist_attr is not None
-        ), "backward op [{}] don't have dist attribute !".format(
-            str(backward_op)
-        )
+        ), f"backward op [{str(backward_op)}] don't have dist attribute !"
         rank_id = dist_op_context.rank_id
 
         # check validation of inputs / outputs
         for input_name in backward_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 backward_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in backward_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 backward_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         # replicate op in dist program
         dist_op_desc = main_block.append_op(type='nop').desc
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py
index 4459f9b63f759..42ddfc4b0d4b3 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_embedding.py
@@ -498,17 +498,13 @@ def forward(ctx, *args, **kwargs):
         embedding_op_dist_attr.impl_idx = op_dist_attr.impl_idx
         for input_varname in c_embedding_op.desc.input_arg_names():
             input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname)
-            assert input_dist_attr is not None, "dist_attr is {}".format(
-                op_dist_attr
-            )
+            assert input_dist_attr is not None, f"dist_attr is {op_dist_attr}"
             embedding_op_dist_attr.set_input_dist_attr(
                 input_varname, input_dist_attr
             )
         output_varname = c_embedding_op.desc.output_arg_names()[0]
         output_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
-        assert output_dist_attr is not None, "dist_attr is {}".format(
-            op_dist_attr
-        )
+        assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
         embedding_op_dist_attr.set_output_dist_attr(
             output_varname, output_dist_attr
         )
@@ -528,9 +524,7 @@ def forward(ctx, *args, **kwargs):
             )
         for output_varname in c_allreduce_sum_op.desc.output_arg_names():
             output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
-            assert output_dist_attr is not None, "dist_attr is {}".format(
-                op_dist_attr
-            )
+            assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
             allreduce_op_dist_attr.set_output_dist_attr(
                 output_varname, output_dist_attr
             )
@@ -583,9 +577,7 @@ def backward(ctx, *args, **kwargs):
         dist_attr = ctx.get_op_dist_attr_for_program(backward_op)
         assert (
             dist_attr is not None
-        ), "backward op [{}] don't have dist attribute !".format(
-            str(backward_op)
-        )
+        ), f"backward op [{str(backward_op)}] don't have dist attribute !"
 
         # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
         if rank_id not in dist_attr.process_mesh.process_ids:
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py
index 57265235a5e9d..1386c5e661cc8 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_matmul.py
@@ -353,9 +353,7 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs):
 
     assert not is_parameter_related(
         X_var.name, main_block
-    ), "left operand(X) [{}] of dist matmul should not be parameter".format(
-        X_var.name
-    )
+    ), f"left operand(X) [{X_var.name}] of dist matmul should not be parameter"
 
     X_var_dims_mapping = dist_attr.get_input_dims_mapping(X_var.name)
     Y_var_dim_mapping = dist_attr.get_input_dims_mapping(Y_var.name)
@@ -722,21 +720,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         X_var = main_block._var_recursive(kwargs['X'][0])
         Weight_var = main_block._var_recursive(kwargs['Y'][0])
@@ -814,9 +806,9 @@ def forward(ctx, *args, **kwargs):
                 input_dist_attr = op_dist_attr.get_input_dist_attr(
                     input_varname
                 )
-                assert input_dist_attr is not None, "dist_attr is {}".format(
-                    op_dist_attr
-                )
+                assert (
+                    input_dist_attr is not None
+                ), f"dist_attr is {op_dist_attr}"
                 matmul_op_dist_attr.set_input_dist_attr(
                     input_varname, input_dist_attr
                 )
@@ -831,9 +823,7 @@ def forward(ctx, *args, **kwargs):
         # output
         output_varname = matmul_op.desc.output_arg_names()[0]
         output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
-        assert output_dist_attr is not None, "dist_attr is {}".format(
-            op_dist_attr
-        )
+        assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
         matmul_op_dist_attr.set_output_dist_attr(
             output_varname, output_dist_attr
         )
@@ -1043,21 +1033,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         X_var = main_block._var_recursive(kwargs['X'][0])
         Weight_var = main_block._var_recursive(kwargs['Y'][0])
@@ -1161,17 +1145,13 @@ def forward(ctx, *args, **kwargs):
         matmul_op_dist_attr.impl_idx = op_dist_attr.impl_idx
         for input_varname in matmul_op.desc.input_arg_names():
             input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname)
-            assert input_dist_attr is not None, "dist_attr is {}".format(
-                op_dist_attr
-            )
+            assert input_dist_attr is not None, f"dist_attr is {op_dist_attr}"
             matmul_op_dist_attr.set_input_dist_attr(
                 input_varname, input_dist_attr
             )
         output_varname = matmul_op.desc.output_arg_names()[0]
         output_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
-        assert output_dist_attr is not None, "dist_attr is {}".format(
-            op_dist_attr
-        )
+        assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
         matmul_op_dist_attr.set_output_dist_attr(
             output_varname, output_dist_attr
         )
@@ -1191,9 +1171,7 @@ def forward(ctx, *args, **kwargs):
             )
         for output_varname in c_allreduce_sum_op.desc.output_arg_names():
             output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
-            assert output_dist_attr is not None, "dist_attr is {}".format(
-                op_dist_attr
-            )
+            assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
             allreduce_op_dist_attr.set_output_dist_attr(
                 output_varname, output_dist_attr
             )
@@ -1560,21 +1538,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         X_var = main_block._var_recursive(kwargs['X'][0])
         Weight_var = main_block._var_recursive(kwargs['Y'][0])
@@ -1653,9 +1625,9 @@ def forward(ctx, *args, **kwargs):
                 input_dist_attr = op_dist_attr.get_input_dist_attr(
                     input_varname
                 )
-                assert input_dist_attr is not None, "dist_attr is {}".format(
-                    op_dist_attr
-                )
+                assert (
+                    input_dist_attr is not None
+                ), f"dist_attr is {op_dist_attr}"
                 matmulv2_op_dist_attr.set_input_dist_attr(
                     input_varname, input_dist_attr
                 )
@@ -1669,9 +1641,7 @@ def forward(ctx, *args, **kwargs):
                 )
         for output_varname in matmul_v2_op.desc.output_arg_names():
             output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
-            assert output_dist_attr is not None, "dist_attr is {}".format(
-                op_dist_attr
-            )
+            assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
             matmulv2_op_dist_attr.set_output_dist_attr(
                 output_varname, output_dist_attr
             )
@@ -1881,21 +1851,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         X_var = main_block._var_recursive(kwargs['X'][0])
         Weight_var = main_block._var_recursive(kwargs['Y'][0])
@@ -1998,17 +1962,13 @@ def forward(ctx, *args, **kwargs):
         matmulv2_op_dist_attr.impl_idx = op_dist_attr.impl_idx
         for input_varname in matmul_v2_op.desc.input_arg_names():
             input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname)
-            assert input_dist_attr is not None, "dist_attr is {}".format(
-                op_dist_attr
-            )
+            assert input_dist_attr is not None, f"dist_attr is {op_dist_attr}"
             matmulv2_op_dist_attr.set_input_dist_attr(
                 input_varname, input_dist_attr
             )
         output_varname = matmul_v2_op.desc.output_arg_names()[0]
         output_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
-        assert output_dist_attr is not None, "dist_attr is {}".format(
-            op_dist_attr
-        )
+        assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
         matmulv2_op_dist_attr.set_output_dist_attr(
             output_varname, output_dist_attr
         )
@@ -2028,9 +1988,7 @@ def forward(ctx, *args, **kwargs):
             )
         for output_varname in c_allreduce_sum_op.desc.output_arg_names():
             output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
-            assert output_dist_attr is not None, "dist_attr is {}".format(
-                op_dist_attr
-            )
+            assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
             allreduce_op_dist_attr.set_output_dist_attr(
                 output_varname, output_dist_attr
             )
@@ -2389,21 +2347,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         X_var = main_block._var_recursive(kwargs['X'][0])
         Weight_var = main_block._var_recursive(kwargs['Y'][0])
@@ -2495,9 +2447,9 @@ def forward(ctx, *args, **kwargs):
                 input_dist_attr = op_dist_attr.get_input_dist_attr(
                     input_varname
                 )
-                assert input_dist_attr is not None, "dist_attr is {}".format(
-                    op_dist_attr
-                )
+                assert (
+                    input_dist_attr is not None
+                ), f"dist_attr is {op_dist_attr}"
                 matmulv2_op_dist_attr.set_input_dist_attr(
                     input_varname, input_dist_attr
                 )
@@ -2511,9 +2463,7 @@ def forward(ctx, *args, **kwargs):
                 )
         for output_varname in mul_op.desc.output_arg_names():
             output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
-            assert output_dist_attr is not None, "dist_attr is {}".format(
-                op_dist_attr
-            )
+            assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
             matmulv2_op_dist_attr.set_output_dist_attr(
                 output_varname, output_dist_attr
             )
@@ -2717,21 +2667,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         X_var = main_block._var_recursive(kwargs['X'][0])
         Weight_var = main_block._var_recursive(kwargs['Y'][0])
@@ -2849,17 +2793,13 @@ def forward(ctx, *args, **kwargs):
         matmulv2_op_dist_attr.impl_idx = op_dist_attr.impl_idx
         for input_varname in mul_op.desc.input_arg_names():
             input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname)
-            assert input_dist_attr is not None, "dist_attr is {}".format(
-                op_dist_attr
-            )
+            assert input_dist_attr is not None, f"dist_attr is {op_dist_attr}"
             matmulv2_op_dist_attr.set_input_dist_attr(
                 input_varname, input_dist_attr
             )
         output_varname = mul_op.desc.output_arg_names()[0]
         output_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
-        assert output_dist_attr is not None, "dist_attr is {}".format(
-            op_dist_attr
-        )
+        assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
         matmulv2_op_dist_attr.set_output_dist_attr(
             output_varname, output_dist_attr
         )
@@ -2879,9 +2819,7 @@ def forward(ctx, *args, **kwargs):
             )
         for output_varname in c_allreduce_sum_op.desc.output_arg_names():
             output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
-            assert output_dist_attr is not None, "dist_attr is {}".format(
-                op_dist_attr
-            )
+            assert output_dist_attr is not None, f"dist_attr is {op_dist_attr}"
             allreduce_op_dist_attr.set_output_dist_attr(
                 output_varname, output_dist_attr
             )
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/static/operators/dist_pnorm.py
index 3007285addc70..9f322cb5caf8a 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_pnorm.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_pnorm.py
@@ -166,21 +166,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         if rank_id not in op_dist_attr.process_mesh.process_ids:
             rank_id = _get_corresponding_rank(
@@ -279,21 +273,15 @@ def backward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in backward_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 backward_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in backward_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 backward_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         X_var = main_block._var_recursive(kwargs['X'][0])
         X_grad_var = main_block._var_recursive(kwargs['X@GRAD'][0])
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py b/python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py
index 6cb9721274213..ba74be866c1ee 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_reduce_sum_p.py
@@ -86,21 +86,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         # replicate op in dist program
         dist_op = main_block.append_op(type='nop')
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py
index 267e8437abacc..e89caba2dd68d 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_reshape.py
@@ -246,21 +246,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         X_var = main_block._var_recursive(kwargs['X'][0])
         Out_var = main_block._var_recursive(kwargs['Out'][0])
@@ -508,21 +502,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         X_var = main_block._var_recursive(kwargs['X'][0])
         Out_var = main_block._var_recursive(kwargs['Out'][0])
@@ -763,21 +751,15 @@ def forward(ctx, *args, **kwargs):
 
         # check validation of inputs / outputs
         for input_name in src_op.desc.input_names():
-            assert input_name in kwargs, "input [{}] is not given".format(
-                input_name
-            )
+            assert input_name in kwargs, f"input [{input_name}] is not given"
             assert len(kwargs[input_name]) == len(
                 src_op.desc.input(input_name)
             ), f"number of tensor for input [{input_name}] is not match"
         for output_name in src_op.desc.output_names():
-            assert output_name in kwargs, "input [{}] is not given".format(
-                output_name
-            )
+            assert output_name in kwargs, f"input [{output_name}] is not given"
             assert len(kwargs[output_name]) == len(
                 src_op.desc.output(output_name)
-            ), "number of tensor for input [{}] is not match".format(
-                output_name
-            )
+            ), f"number of tensor for input [{output_name}] is not match"
 
         X_var = main_block._var_recursive(kwargs['X'][0])
         Out_var = main_block._var_recursive(kwargs['Out'][0])
diff --git a/python/paddle/distributed/auto_parallel/static/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/static/operators/dist_update_loss_scaling.py
index 1c39dd6b2fd53..8ff358d14b1db 100644
--- a/python/paddle/distributed/auto_parallel/static/operators/dist_update_loss_scaling.py
+++ b/python/paddle/distributed/auto_parallel/static/operators/dist_update_loss_scaling.py
@@ -74,9 +74,7 @@ def backward(ctx, *args, **kwargs):
         dist_attr = ctx.get_op_dist_attr_for_program(backward_op)
         assert (
             dist_attr is not None
-        ), "backward op [{}] don't have dist attribute !".format(
-            str(backward_op)
-        )
+        ), f"backward op [{str(backward_op)}] don't have dist attribute !"
 
         assert rank_id in dist_attr.process_mesh.process_ids
 
diff --git a/python/paddle/distributed/auto_parallel/static/partitioner.py b/python/paddle/distributed/auto_parallel/static/partitioner.py
index 78094c73de9e0..b00baf32ec0fe 100644
--- a/python/paddle/distributed/auto_parallel/static/partitioner.py
+++ b/python/paddle/distributed/auto_parallel/static/partitioner.py
@@ -140,14 +140,10 @@ def partition_startup_program(
             output_vars = op.desc.output_arg_names()
             assert (
                 len(output_vars) == 1
-            ), "initializer should output only ONE variable, but got [{}]".format(
-                str(op.desc)
-            )
+            ), f"initializer should output only ONE variable, but got [{str(op.desc)}]"
             assert (
                 temp_varname_map[output_vars[0]] in var2shape
-            ), "try to initialize [{}] which is not a persistable var".format(
-                output_vars[0]
-            )
+            ), f"try to initialize [{output_vars[0]}] which is not a persistable var"
             new_op_desc = target_block.desc.append_op()
             new_op_desc.copy_from(op.desc)
             new_op_desc._rename_output(
@@ -393,9 +389,7 @@ def _get_dist_shape(var, dist_attr):
 
     assert len(var_shape) == len(
         mapping
-    ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format(
-        var_shape, mapping
-    )
+    ), f"variable shape [{var_shape}] and dim_mapping [{mapping}] is NOT match !"
     new_shape = []
     for idx in range(len(var_shape)):
         if var_shape[idx] == -1 or mapping[idx] == -1:
diff --git a/python/paddle/distributed/auto_parallel/static/reshard.py b/python/paddle/distributed/auto_parallel/static/reshard.py
index 60dff3401dd84..facfe183c5d9a 100644
--- a/python/paddle/distributed/auto_parallel/static/reshard.py
+++ b/python/paddle/distributed/auto_parallel/static/reshard.py
@@ -1001,28 +1001,25 @@ def __init__(
     ):
         assert isinstance(auto_parallel_main_prog, Program), (
             "The type of auto_parallel_main_prog should be Program, "
-            "but got {}.".format(type(auto_parallel_main_prog))
+            f"but got {type(auto_parallel_main_prog)}."
         )
         if auto_parallel_startup_prog is not None:
             assert isinstance(auto_parallel_main_prog, Program), (
                 "The type of auto_parallel_startup_prog should be Program or None, "
-                "but got {}.".format(type(auto_parallel_startup_prog))
+                f"but got {type(auto_parallel_startup_prog)}."
             )
-        assert isinstance(
-            rank_id, int
-        ), "The type of rank_id should be int, " "but got {}.".format(
-            type(rank_id)
+        assert isinstance(rank_id, int), (
+            "The type of rank_id should be int, " f"but got {type(rank_id)}."
         )
         assert isinstance(dist_context, DistributedContext), (
             "The type of dist_context should be DistributedContext, "
-            "but got {}.".format(type(dist_context))
+            f"but got {type(dist_context)}."
         )
 
         if batch_size is not None:
-            assert isinstance(
-                batch_size, int
-            ), "The type of batch_size should be int, " "but got {}.".format(
-                type(batch_size)
+            assert isinstance(batch_size, int), (
+                "The type of batch_size should be int, "
+                f"but got {type(batch_size)}."
             )
 
         self._auto_parallel_main_prog = auto_parallel_main_prog
@@ -1783,9 +1780,7 @@ def parse_op_desc(
                 break
         assert (
             idx is not None
-        ), "The op for reshard cannot be found in the rank {} program.".format(
-            self.rank_id
-        )
+        ), f"The op for reshard cannot be found in the rank {self.rank_id} program."
 
         matched_op = block.ops[idx]
         source_tensor = get_var_with_recursion(
diff --git a/python/paddle/distributed/auto_parallel/static/tuner/algorithms.py b/python/paddle/distributed/auto_parallel/static/tuner/algorithms.py
index 5eea035ea92fd..37cb3ed501181 100644
--- a/python/paddle/distributed/auto_parallel/static/tuner/algorithms.py
+++ b/python/paddle/distributed/auto_parallel/static/tuner/algorithms.py
@@ -199,9 +199,7 @@ def next_trial(self):
                 new_strategy = copy.deepcopy(self._config.dist_strategy)
                 recompute = new_strategy.recompute
                 recompute.no_recompute_segments.extend(new_no_recompute)
-                name = "trial-recompute-part-segments-idx{}".format(
-                    self._trial_idx
-                )
+                name = f"trial-recompute-part-segments-idx{self._trial_idx}"
                 return Trial(new_strategy, name, self.changed_configs)
         else:
             return Trial(None, None, None, status=TrialStatus.STOPPED)
diff --git a/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py b/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py
index 64eaca28c06ea..6a3365eff018b 100644
--- a/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py
+++ b/python/paddle/distributed/auto_parallel/static/tuner/optimization_tuner.py
@@ -538,9 +538,7 @@ def _evaluate_trial(self, trial):
             )
 
         self._logger.info(
-            "Trial {} evaluation finish with {}.".format(
-                trial.name, parse_results(results)
-            )
+            f"Trial {trial.name} evaluation finish with {parse_results(results)}."
         )
         return results
 
diff --git a/python/paddle/distributed/auto_parallel/static/tuner/recorder.py b/python/paddle/distributed/auto_parallel/static/tuner/recorder.py
index 6faaac8977910..a1ed12187260a 100644
--- a/python/paddle/distributed/auto_parallel/static/tuner/recorder.py
+++ b/python/paddle/distributed/auto_parallel/static/tuner/recorder.py
@@ -70,9 +70,7 @@ class MetricRecords:
     def __init__(self, direction="min"):
         if direction not in {"min", "max"}:
             raise ValueError(
-                "direction should be one of {{min, max}}, but got: {}.".format(
-                    direction
-                )
+                f"direction should be one of {{min, max}}, but got: {direction}."
             )
         self._direction = direction
         self._records = {}
diff --git a/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py b/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py
index ad21e4f00109a..07d98d67226d7 100644
--- a/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py
+++ b/python/paddle/distributed/auto_parallel/static/tuner/rule_based_tuner.py
@@ -1577,9 +1577,7 @@ def _is_grad_var_name(name):
                     output_name = grad_op_next_op.output_arg_names[0]
                     assert (
                         output_name in grad_var_to_var
-                    ), "sum op's output '{}' has no corresponding var".format(
-                        output_name
-                    )
+                    ), f"sum op's output '{output_name}' has no corresponding var"
                     ref_fwd_var_name = grad_var_to_var[output_name]
                     ref_fwd_var = vars[ref_fwd_var_name]
                     ref_fwd_dist_attr = sub_program_dist_context.get_tensor_dist_attr_for_program(
@@ -2098,9 +2096,7 @@ def prepare(self):
         self.layers = self.cluster_operators()
         end = time.time()
         self._logger.info(
-            "Cluster operators to {} layers in {:.2f}s.".format(
-                len(self.layers), end - begin
-            )
+            f"Cluster operators to {len(self.layers)} layers in {end - begin:.2f}s."
         )
 
         # step2: generate sub program of each layer
@@ -2175,9 +2171,7 @@ def prepare(self):
             self.complete_sub_bwd_programs()
             end = time.time()
             self._logger.info(
-                "Complete all sub backward programs in {:.2f}s.".format(
-                    end - begin
-                )
+                f"Complete all sub backward programs in {end - begin:.2f}s."
             )
 
             # step8: complete update sub programs
diff --git a/python/paddle/distributed/auto_parallel/static/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/static/tuner/tunable_variable.py
index 6f46ccb90132a..1aa46f4966157 100644
--- a/python/paddle/distributed/auto_parallel/static/tuner/tunable_variable.py
+++ b/python/paddle/distributed/auto_parallel/static/tuner/tunable_variable.py
@@ -49,9 +49,7 @@ def __init__(self, name, default):
         self.name = name
         if not isinstance(default, (str, int, float, bool)):
             raise ValueError(
-                "Fixed must be an str, int, float or bool, but found {}".format(
-                    default
-                )
+                f"Fixed must be an str, int, float or bool, but found {default}"
             )
         self._default = default
 
@@ -79,9 +77,7 @@ def random(self, seed=None):
         return rng.choice((True, False))
 
     def __repr__(self):
-        return 'Boolean(name: "{}", default: {})'.format(
-            self.name, self.default
-        )
+        return f'Boolean(name: "{self.name}", default: {self.default})'
 
 
 class Choice(TunableVariable):
diff --git a/python/paddle/distributed/auto_parallel/static/utils.py b/python/paddle/distributed/auto_parallel/static/utils.py
index c4b3e01839891..2e41c6de99802 100644
--- a/python/paddle/distributed/auto_parallel/static/utils.py
+++ b/python/paddle/distributed/auto_parallel/static/utils.py
@@ -285,9 +285,9 @@ def _get_comm_group(processes, shape, axis, rank):
 
     # NOTE _linear_idx2coordinate assume processes mesh start with 0 and continuous
     # tricks to support processes mesh when it is not start with 0 or continuous
-    assert rank in processes, "rank [{}] is NOT in processes group {}".format(
-        rank, processes
-    )
+    assert (
+        rank in processes
+    ), f"rank [{rank}] is NOT in processes group {processes}"
     rank_relatvie = processes.index(rank)
     coordinate = _linear_idx2coordinate(shape, rank_relatvie)
     coordinates_in_group = [coordinate[:] for i in range(shape[axis])]
@@ -361,9 +361,7 @@ def _coordinate2linear_idx(mesh_shape, coordinate):
     for i in range(len(mesh_shape)):
         assert (
             coordinate[i] >= 0
-        ), "index in dimension [{}] is least than zero. coordinate: {}".format(
-            i, coordinate
-        )
+        ), f"index in dimension [{i}] is least than zero. coordinate: {coordinate}"
         assert (
             coordinate[i] < mesh_shape[i]
         ), "index beyond extent in dimension [{}]. shape: {}, coordinate: {}".format(
@@ -400,9 +398,7 @@ def _linear_idx2coordinate(mesh_shape, linear_idx):
 
     """
 
-    assert linear_idx >= 0, "linear index [{}] is least than zero".format(
-        linear_idx
-    )
+    assert linear_idx >= 0, f"linear index [{linear_idx}] is least than zero"
     assert linear_idx < np.prod(
         mesh_shape
     ), "linear index beyond the extent of mesh shape. shape: {}, linear index: {}".format(
@@ -450,9 +446,7 @@ def _get_unshard_dist_shape(var, dist_attr):
     mesh = dist_attr.process_mesh.shape
     assert len(var_shape) == len(
         mapping
-    ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format(
-        var_shape, mapping
-    )
+    ), f"variable shape [{var_shape}] and dim_mapping [{mapping}] is NOT match !"
     new_shape = []
     for idx in range(len(var_shape)):
         if var_shape[idx] == -1 or mapping[idx] == -1:
@@ -490,21 +484,19 @@ def _update_addition_info(addition_info):
     elif not isinstance(addition_info, dict):
         raise TypeError(
             "The type of 'addition_info' should be 'dict', "
-            "but got '{}'.".format(str(type(addition_info)))
+            f"but got '{str(type(addition_info))}'."
         )
     else:
         for item, value in addition_info.items():
             if item not in ["epoch", "batch", "batch_size"]:
                 raise ValueError(
                     "The key of 'addition_info' should be one of the "
-                    "['epoch', 'batch', 'batch_size'], but got '{}'.".format(
-                        str(item)
-                    )
+                    f"['epoch', 'batch', 'batch_size'], but got '{str(item)}'."
                 )
             if not isinstance(value, int):
                 raise ValueError(
                     "The value of 'addition_info' should be 'int', "
-                    "but got '{}'.".format(str(type(value)))
+                    f"but got '{str(type(value))}'."
                 )
             add_info[item] = value
         return add_info
@@ -519,7 +511,7 @@ def _check_valid_path(file_path):
             if not isinstance(file, str):
                 raise TypeError(
                     "The type of file path should be 'str', "
-                    "but got '{}'.".format(str(type(file)))
+                    f"but got '{str(type(file))}'."
                 )
             if not os.path.exists(file):
                 raise ValueError(f"The file path '{file}' does not exist.")
@@ -527,7 +519,7 @@ def _check_valid_path(file_path):
     else:
         raise TypeError(
             "The type of file path should be 'list', "
-            "but got '{}'.".format(str(type(file_path)))
+            f"but got '{str(type(file_path))}'."
         )
 
 
@@ -537,19 +529,19 @@ def _check_param_dict(param_dict):
     elif not isinstance(param_dict, dict):
         raise TypeError(
             "The type of 'param_dict' should be 'dict', "
-            "but got '{}'.".format(str(type(param_dict)))
+            f"but got '{str(type(param_dict))}'."
         )
     else:
         for name, value in param_dict.items():
             if not isinstance(name, str):
                 raise TypeError(
                     "The type of key of 'param_dict' should be 'str', "
-                    "but got '{}'.".format(str(type(name)))
+                    f"but got '{str(type(name))}'."
                 )
             if not isinstance(value, paddle.base.LoDTensor):
                 raise TypeError(
                     "The type of value of 'param_dict' should be 'LoDTensor', "
-                    "but got '{}'.".format(str(type(value)))
+                    f"but got '{str(type(value))}'."
                 )
         return param_dict
 
@@ -560,26 +552,26 @@ def _check_dist_attr(dist_attr):
     elif not isinstance(dist_attr, dict):
         raise TypeError(
             "The type of 'dist_attr' should be 'dict', "
-            "but got '{}'.".format(str(type(dist_attr)))
+            f"but got '{str(type(dist_attr))}'."
         )
     else:
         for name, value in dist_attr.items():
             if not isinstance(name, str):
                 raise TypeError(
                     "The type of param name of 'dist_attr' should be 'str', "
-                    "but got '{}'.".format(str(type(name)))
+                    f"but got '{str(type(name))}'."
                 )
             if not isinstance(value, dict):
                 raise TypeError(
                     "The type of distributed attribute should be 'dict', "
-                    "but got '{}'".format(str(type(value)))
+                    f"but got '{str(type(value))}'"
                 )
             attr = ['process_shape', 'process_group', 'dims_mapping']
             if list(value.keys()) != attr:
                 raise ValueError(
                     "The key of distributed attribute should be "
                     "'['process_shape', 'process_group', 'dims_mapping']', "
-                    "but got {}.".format(str(value.keys()))
+                    f"but got {str(value.keys())}."
                 )
         return dist_attr
 
@@ -878,9 +870,7 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr):
         if not isinstance(name, str):
             raise TypeError(
                 "The key of 'dist_param_dict' is parameter's name, "
-                "and its type should be 'str', but got {}.".format(
-                    str(type(name))
-                )
+                f"and its type should be 'str', but got {str(type(name))}."
             )
         if not isinstance(value, list) or not all(
             isinstance(v, np.ndarray) for v in value
@@ -1897,7 +1887,7 @@ def get_lr(optimizer):
     else:
         raise TypeError(
             "'optimizer' must be object of class `paddle.optimizer.Optimizer`"
-            " or `paddle.static.Optimizer`, but got {}.".format(type(optimizer))
+            f" or `paddle.static.Optimizer`, but got {type(optimizer)}."
         )
 
 
@@ -2045,9 +2035,7 @@ def set_recompute_segments(model, losses, strategy, program):
                 segments.append([min_idx, max_idx + 1])
             else:
                 logging.debug(
-                    "Could not recompute op range [{}] - [{}] ".format(
-                        min_idx, max_idx + 1
-                    )
+                    f"Could not recompute op range [{min_idx}] - [{max_idx + 1}] "
                 )
         start_idx += 1
 
@@ -2255,14 +2243,10 @@ def insert_dependencies_for_two_ops(
 
     assert (
         len(prior_op.output_arg_names) >= 1
-    ), "first op of dependency should at least have one output. [{}]".format(
-        str(prior_op)
-    )
+    ), f"first op of dependency should at least have one output. [{str(prior_op)}]"
     assert (
         len(posterior_op.input_arg_names) >= 1
-    ), "second op of dependency should at least have one input. [{}]".format(
-        str(posterior_op)
-    )
+    ), f"second op of dependency should at least have one input. [{str(posterior_op)}]"
     prior_op_mesh = dist_context.get_op_dist_attr_for_program(
         prior_op
     ).process_mesh
diff --git a/python/paddle/distributed/auto_parallel/strategy.py b/python/paddle/distributed/auto_parallel/strategy.py
index 36607f0903bc7..1df4663b4fed5 100644
--- a/python/paddle/distributed/auto_parallel/strategy.py
+++ b/python/paddle/distributed/auto_parallel/strategy.py
@@ -26,9 +26,7 @@ def __init__(self, category, config_dict=None):
                 self._config_dict = config_dict
             else:
                 raise ValueError(
-                    "Expected a dictionary. But received: {}".format(
-                        config_dict
-                    )
+                    f"Expected a dictionary. But received: {config_dict}"
                 )
         # Initialize attributes by the default config
         config = constants.get_category_default_config(self._category)
diff --git a/python/paddle/distributed/cloud_utils.py b/python/paddle/distributed/cloud_utils.py
index 3fd8ce5d16a3a..ff9908c09c96a 100644
--- a/python/paddle/distributed/cloud_utils.py
+++ b/python/paddle/distributed/cloud_utils.py
@@ -47,21 +47,17 @@ def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_devices):
 
     if node_ip != "127.0.0.1" and node_ip != args_node_ip:
         logger.warning(
-            "Please NOTE: When using paddlecloud, node_ip is \
-automatically got from POD_IP. Your input node_ip: {} doesn't equals to \
-node_ip: {} from paddlecloud environment.".format(
-                args_node_ip, node_ip
-            )
+            f"Please NOTE: When using paddlecloud, node_ip is \
+automatically got from POD_IP. Your input node_ip: {args_node_ip} doesn't equals to \
+node_ip: {node_ip} from paddlecloud environment."
         )
 
     if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
         logger.warning(
-            "Please NOTE: When using paddlecloud, cluster_node_ips is \
+            f"Please NOTE: When using paddlecloud, cluster_node_ips is \
 automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
-Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
-paddlecloud environment.".format(
-                args_node_ips, node_ips
-            )
+Your input cluster_node_ips: {args_node_ips} doesn't equals to IPs: {node_ips} from \
+paddlecloud environment."
         )
 
     # DISTRIBUTED_TRAINER_ENDPOINTS: new environment since paddlecloud 1.8.4
@@ -101,10 +97,8 @@ def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_devices):
             )
 
     logger.debug(
-        "parsed from args: node_ips:{} \
-        node_ip:{} node_rank:{} trainer_endpoints:{}".format(
-            node_ips, node_ip, node_rank, trainer_endpoints
-        )
+        f"parsed from args: node_ips:{node_ips} \
+        node_ip:{node_ip} node_rank:{node_rank} trainer_endpoints:{trainer_endpoints}"
     )
 
     cluster, pod = get_cluster(
diff --git a/python/paddle/distributed/communication/group.py b/python/paddle/distributed/communication/group.py
index dfab85404a1d0..d73e3ce90cbd2 100644
--- a/python/paddle/distributed/communication/group.py
+++ b/python/paddle/distributed/communication/group.py
@@ -78,8 +78,8 @@ def get_group_rank(self, rank):
             return -1
 
     def __repr__(self):
-        debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format(
-            self.rank, self.nranks, self.id
+        debug_str = (
+            f"rank: {self.rank}, nranks: {self.nranks}, id: {self.id}, ranks: "
         )
         debug_str += ", ".join(map(str, self.ranks))
         debug_str += "; name: "
@@ -112,9 +112,7 @@ def _warn_cur_rank_not_in_group(group):
     global_rank = dist.get_rank()
     if group and not group.is_member():
         warnings.warn(
-            "Current global rank {} is not in group {}".format(
-                global_rank, group.name
-            )
+            f"Current global rank {global_rank} is not in group {group.name}"
         )
         return True
     return False
@@ -124,9 +122,7 @@ def _get_or_throw_group_rank(global_rank, group):
     group_rank = group.get_group_rank(global_rank)
     assert (
         group_rank >= 0
-    ), "The input rank {} can not be found inside the group {}".format(
-        global_rank, group.name
-    )
+    ), f"The input rank {global_rank} can not be found inside the group {group.name}"
     return group_rank
 
 
diff --git a/python/paddle/distributed/communication/stream/gather.py b/python/paddle/distributed/communication/stream/gather.py
index 4cb8d65c9d56f..a729d0c644537 100644
--- a/python/paddle/distributed/communication/stream/gather.py
+++ b/python/paddle/distributed/communication/stream/gather.py
@@ -36,9 +36,7 @@ def _gather_in_dygraph(
 
     assert (
         len(gather_list) == nranks
-    ), " gather_list length {} and nrankd {} not equal".format(
-        len(gather_list), nranks
-    )
+    ), f" gather_list length {len(gather_list)} and nrankd {nranks} not equal"
 
     task = group.process_group.gather(
         tensor, gather_list, dst_rank_in_group, sync_op, use_calc_stream
diff --git a/python/paddle/distributed/fleet/base/orthogonal_strategy.py b/python/paddle/distributed/fleet/base/orthogonal_strategy.py
index c64260bf25140..aea73054722b2 100644
--- a/python/paddle/distributed/fleet/base/orthogonal_strategy.py
+++ b/python/paddle/distributed/fleet/base/orthogonal_strategy.py
@@ -130,9 +130,7 @@ def rank_in_strategy(self, name):
     def _check_valid_strategy(self):
         assert len(self._list_of_strategy_name) == len(
             set(self._list_of_strategy_name)
-        ), "Defined duplicated strategies: {}".format(
-            self._list_of_strategy_name
-        )
+        ), f"Defined duplicated strategies: {self._list_of_strategy_name}"
         num_of_ranks = functools.reduce(
             lambda x, y: x * y, self._list_of_degree
         )
@@ -145,9 +143,7 @@ def _check_valid_strategy(self):
             for strategy in fused_strategy:
                 assert (
                     strategy in self._list_of_strategy_name
-                ), "Can not fuse strategy {} without defined previous.".format(
-                    strategy
-                )
+                ), f"Can not fuse strategy {strategy} without defined previous."
 
     def _create_fused_group(self):
         for name in self._fused_strategy_dict:
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 130a7a338a898..7b9cf269dcd26 100755
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -58,8 +58,8 @@ def __init__(self):
             "gloo is not initialized, will not communicator with other nodes"
         )
         self._err_type = "gloo initialized error, please check arguments"
-        self._err_world = "argument error, comm_world must in {}".format(
-            self._comm_world
+        self._err_world = (
+            f"argument error, comm_world must in {self._comm_world}"
         )
 
         self._is_initialized = False
@@ -1173,9 +1173,7 @@ def _gloo_init(self):
         else:
             type = "FILE"
         print(
-            "Gloo init with {}: need_init_all: {}, args: {}".format(
-                type, need_init_all, kwargs
-            )
+            f"Gloo init with {type}: need_init_all: {need_init_all}, args: {kwargs}"
         )
 
         self._gloo.init(
diff --git a/python/paddle/distributed/fleet/base/strategy_group.py b/python/paddle/distributed/fleet/base/strategy_group.py
index 634131cf33087..6ae9c13c5e548 100644
--- a/python/paddle/distributed/fleet/base/strategy_group.py
+++ b/python/paddle/distributed/fleet/base/strategy_group.py
@@ -239,6 +239,4 @@ def _create_p2p_group(self):
             and self._send_prev_group
             and self._recv_next_group
             and self._recv_prev_group
-        ), "Error occurs while creating p2p group for rank {}.".format(
-            self._rank
-        )
+        ), f"Error occurs while creating p2p group for rank {self._rank}."
diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py
index 97543d9fc0476..309db09181b25 100755
--- a/python/paddle/distributed/fleet/base/util_factory.py
+++ b/python/paddle/distributed/fleet/base/util_factory.py
@@ -470,9 +470,7 @@ def reader(batch_size, fn, dim):
             v for v in prog.list_vars() if paddle.static.io.is_persistable(v)
         ]
         print(
-            "persistable vars in dump program: {}".format(
-                [v.name for v in saved_params]
-            )
+            f"persistable vars in dump program: {[v.name for v in saved_params]}"
         )
 
         def check_not_expected_ops(prog, not_expected_op_types):
@@ -665,9 +663,7 @@ def check_not_expected_ops(prog, not_expected_op_types):
                 )
             else:
                 print(
-                    "load feed vars from files: {}.".format(
-                        feed_config.feeded_vars_filelist
-                    )
+                    f"load feed vars from files: {feed_config.feeded_vars_filelist}."
                 )
                 feed_vars = [
                     inference_program.global_block().var(
diff --git a/python/paddle/distributed/fleet/cloud_utils.py b/python/paddle/distributed/fleet/cloud_utils.py
index 75df0fae32d1b..fcb2cf745bfd3 100644
--- a/python/paddle/distributed/fleet/cloud_utils.py
+++ b/python/paddle/distributed/fleet/cloud_utils.py
@@ -44,12 +44,10 @@ def get_cloud_cluster(
 
     if args_node_ips != "127.0.0.1" and args_node_ips != ",".join(node_ips):
         logger.warning(
-            "Please NOTE: When using paddlecloud, cluster_node_ips is \
+            f"Please NOTE: When using paddlecloud, cluster_node_ips is \
 automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\
-Your input cluster_node_ips: {} doesn't equals to IPs: {} from \
-paddlecloud environment.".format(
-                args_node_ips, node_ips
-            )
+Your input cluster_node_ips: {args_node_ips} doesn't equals to IPs: {node_ips} from \
+paddlecloud environment."
         )
 
     # DISTRIBUTED_TRAINER_ENDPOINTS: new environment since paddlecloud 1.8.4
@@ -89,10 +87,8 @@ def get_cloud_cluster(
             )
 
     logger.debug(
-        "parsed from args: node_ips:{} \
-        node_ip:{} node_rank:{} trainer_endpoints:{}".format(
-            node_ips, node_ip, node_rank, trainer_endpoints
-        )
+        f"parsed from args: node_ips:{node_ips} \
+        node_ip:{node_ip} node_rank:{node_rank} trainer_endpoints:{trainer_endpoints}"
     )
 
     cluster, pod = get_cluster(
diff --git a/python/paddle/distributed/fleet/elastic/manager.py b/python/paddle/distributed/fleet/elastic/manager.py
index 00151a8dee5f1..6c3810f7aae74 100644
--- a/python/paddle/distributed/fleet/elastic/manager.py
+++ b/python/paddle/distributed/fleet/elastic/manager.py
@@ -229,9 +229,7 @@ def __init__(self, args, etcd_client):
         node_tag = ''.join(
             random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(6)
         )
-        self.host_path = '{}/{}{}'.format(
-            self.node_prefix, node_tag, time.time()
-        )
+        self.host_path = f'{self.node_prefix}/{node_tag}{time.time()}'
         '''
         0 group mode, be aware of healthy status of other workers
         1 decouple mode, check own status only
@@ -280,9 +278,7 @@ def lease_heartbeat():
                         )
                 except Exception as e:
                     logger.error(
-                        "[lease_heartbeat] internal error:{} {}".format(
-                            e, traceback.format_exc()
-                        )
+                        f"[lease_heartbeat] internal error:{e} {traceback.format_exc()}"
                     )
                     break
                 time.sleep(elastic_ttl / 3)
@@ -309,9 +305,7 @@ def endpoints_call_back(event):
             edps = value.decode() if value is not None else ''
             self.dist_endpoints, self.trainers = edps.split('|')
             logger.info(
-                "set DISTRIBUTED_TRAINER_ENDPOINTS {} ".format(
-                    self.dist_endpoints
-                )
+                f"set DISTRIBUTED_TRAINER_ENDPOINTS {self.dist_endpoints} "
             )
             logger.info(f"set PADDLE_TRAINERS {self.trainers} ")
 
@@ -472,9 +466,7 @@ def _update_fault_tolrance(self):
             os.environ['DISTRIBUTED_TRAINER_ENDPOINTS'] = self.dist_endpoints
             os.environ['PADDLE_TRAINERS'] = self.trainers
             logger.info(
-                "update env DISTRIBUTED_TRAINER_ENDPOINTS {} ".format(
-                    self.dist_endpoints
-                )
+                f"update env DISTRIBUTED_TRAINER_ENDPOINTS {self.dist_endpoints} "
             )
             logger.info(f"update env PADDLE_TRAINERS {self.trainers} ")
             return
@@ -502,9 +494,9 @@ def _update_elastic_scale_out(self):
             if curr_host_port not in host_endpoints:
                 host_endpoints.append(curr_host_port)
 
-        os.environ['PADDLE_TRAINER_ID'] = '{}'.format(
-            host_endpoints.index(self.curr_host)
-        )
+        os.environ[
+            'PADDLE_TRAINER_ID'
+        ] = f'{host_endpoints.index(self.curr_host)}'
         hosts = ','.join(
             [host_port.split(":")[0] for host_port in host_endpoints]
         )
@@ -555,9 +547,9 @@ def _update_elastic_scale_in(self):
         )
 
         self.args.ips = hosts
-        os.environ['PADDLE_TRAINER_ID'] = '{}'.format(
-            sorted_endpoints.index(self.curr_host)
-        )
+        os.environ[
+            'PADDLE_TRAINER_ID'
+        ] = f'{sorted_endpoints.index(self.curr_host)}'
         os.environ['PADDLE_TRAINERS'] = hosts
         self.np = len(sorted_endpoints)
         os.environ['PADDLE_TRAINER_ENDPOINTS'] = ','.join(sorted_endpoints)
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 4a334281e90d1..5a6b5665647de 100755
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -274,9 +274,7 @@ def get_cluster_from_args(args, device_mode, devices_per_proc):
     node_rank = node_ips.index(node_ip)
 
     logger.debug(
-        "parsed from args: node_ips:{} node_ip:{} node_rank:{}".format(
-            node_ips, node_ip, node_rank
-        )
+        f"parsed from args: node_ips:{node_ips} node_ip:{node_ip} node_rank:{node_rank}"
     )
 
     free_ports = None
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index 2b48f29c57018..ac51a9b8a08bb 100755
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -583,8 +583,8 @@ def start_local_trainers(
             )
             logger.info(
                 "details about PADDLE_TRAINER_ENDPOINTS can be found in "
-                "{}/endpoints.log, and detail running logs maybe found in "
-                "{}/workerlog.0".format(log_dir, log_dir)
+                f"{log_dir}/endpoints.log, and detail running logs maybe found in "
+                f"{log_dir}/workerlog.0"
             )
         fn = None
         pre_fn = None if os.name == 'nt' else os.setsid
@@ -699,20 +699,16 @@ def get_gpus(gpus):
             for x in gpus.split(','):
                 assert x in cuda_visible_devices_list, (
                     "Can't find "
-                    "your gpus {} in CUDA_VISIBLE_DEVICES[{}].".format(
-                        x, cuda_visible_devices
-                    )
+                    f"your gpus {x} in CUDA_VISIBLE_DEVICES[{cuda_visible_devices}]."
                 )
             res_gpus = [
                 cuda_visible_devices_list.index(x.strip())
                 for x in gpus.split(',')
             ]
             logger.info(
-                "Change selected_gpus into reletive values. --ips:{} "
-                "will change into relative_ips:{} according to your "
-                "CUDA_VISIBLE_DEVICES:{}".format(
-                    gpus, res_gpus, cuda_visible_devices_list
-                )
+                f"Change selected_gpus into reletive values. --ips:{gpus} "
+                f"will change into relative_ips:{res_gpus} according to your "
+                f"CUDA_VISIBLE_DEVICES:{cuda_visible_devices_list}"
             )
 
     return res_gpus
@@ -734,21 +730,16 @@ def get_xpus(xpus):
             for x in xpus.split(','):
                 assert x in xpu_visible_devices_list, (
                     "Can't find "
-                    "your xpus {} in XPU_VISIBLE_DEVICES[{}].".format(
-                        x,
-                        xpu_visible_devices,
-                    )
+                    f"your xpus {x} in XPU_VISIBLE_DEVICES[{xpu_visible_devices}]."
                 )
             res_xpus = [
                 xpu_visible_devices_list.index(x.strip())
                 for x in xpus.split(',')
             ]
             logger.info(
-                "Change selected_xpus into reletive values. --ips:{} "
-                "will change into relative_ips:{} according to your "
-                "XPU_VISIBLE_DEVICES:{}".format(
-                    xpus, res_xpus, xpu_visible_devices_list
-                )
+                f"Change selected_xpus into reletive values. --ips:{xpus} "
+                f"will change into relative_ips:{res_xpus} according to your "
+                f"XPU_VISIBLE_DEVICES:{xpu_visible_devices_list}"
             )
 
     return res_xpus
@@ -826,9 +817,7 @@ def get_device_proc_info(args):
             devices_per_proc = list(range(0, args.nproc_per_node))
     else:
         raise AssertionError(
-            "Can't support device_mode:{}, support only cpu|gpu|xpu now.".format(
-                device_mode
-            )
+            f"Can't support device_mode:{device_mode}, support only cpu|gpu|xpu now."
         )
 
     return (device_mode, devices_per_proc)
@@ -965,10 +954,8 @@ def get_mapped_cluster_from_args_without_rank_mapping(args, device_mode):
     ), "ranks length should be equal to ips length."
 
     logger.debug(
-        "parsed from args: node_ips:{} node_ip:{} "
-        "node_rank:{} node_ranks:{}".format(
-            node_ips, node_ip, node_rank, node_ranks[node_rank]
-        )
+        f"parsed from args: node_ips:{node_ips} node_ip:{node_ip} "
+        f"node_rank:{node_rank} node_ranks:{node_ranks[node_rank]}"
     )
 
     # NOTE: there are different number of global mapped ranks on each node.
@@ -1102,10 +1089,8 @@ def get_mapped_cluster_from_args_with_rank_mapping(args, device_mode):
     ), "ranks length should be equal to ips length."
 
     logger.debug(
-        "parsed from args: node_ips:{} node_ip:{} "
-        "node_rank:{} node_ranks:{}".format(
-            node_ips, node_ip, node_rank, node_ranks[node_rank]
-        )
+        f"parsed from args: node_ips:{node_ips} node_ip:{node_ip} "
+        f"node_rank:{node_rank} node_ranks:{node_ranks[node_rank]}"
     )
 
     # NOTE: there are different number of global mapped ranks on each node.
@@ -1515,20 +1500,14 @@ def start_ps(self):
             for i in range(len(self.server_endpoints_ips)):
                 if ip == self.server_endpoints_ips[i]:
                     server = Trainer()
-                    server.endpoint = "{}:{}".format(
-                        ip,
-                        self.server_endpoints_port[i],
-                    )
+                    server.endpoint = f"{ip}:{self.server_endpoints_port[i]}"
                     server.rank = server_rank
                     server_rank += 1
                     pod.servers.append(server)
             for j in range(len(self.worker_endpoints_ips)):
                 if ip == self.worker_endpoints_ips[j]:
                     worker = Trainer()
-                    worker.endpoint = "{}:{}".format(
-                        ip,
-                        self.worker_endpoints_port[j],
-                    )
+                    worker.endpoint = f"{ip}:{self.worker_endpoints_port[j]}"
                     worker.rank = worker_rank
                     worker.stage = 1
                     worker_rank += 1
@@ -1536,9 +1515,8 @@ def start_ps(self):
             for m in range(len(self.coordinator_endpoints_ips)):
                 if ip == self.coordinator_endpoints_ips[m]:
                     coordinator = Trainer()
-                    coordinator.endpoint = "{}:{}".format(
-                        ip,
-                        self.coordinator_endpoints_port[m],
+                    coordinator.endpoint = (
+                        f"{ip}:{self.coordinator_endpoints_port[m]}"
                     )
                     coordinator.rank = coordinator_rank
                     coordinator.stage = 1
diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py
index c24062c1f392b..67b88cb52ab45 100644
--- a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py
+++ b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py
@@ -387,10 +387,8 @@ def __init__(
 
         self.gather_output = gather_output
         assert out_features % self.world_size == 0, (
-            "Number of column of the weight for linear ({}) must be"
-            " divisible by model parallel size ({})".format(
-                out_features, self.world_size
-            )
+            f"Number of column of the weight for linear ({out_features}) must be"
+            f" divisible by model parallel size ({self.world_size})"
         )
         self.output_size_per_partition = out_features // self.world_size
 
@@ -631,10 +629,8 @@ def __init__(
                 paddle.in_dynamic_mode()
             ), "mp_async_allreduce, mp_skip_c_identity and mp_fused_linear_param_grad_add are only available under dygraph mode"
         assert in_features % self.world_size == 0, (
-            "Number of row of the weight for linear ({}) must be"
-            " divisible by model parallel size ({})".format(
-                in_features, self.world_size
-            )
+            f"Number of row of the weight for linear ({in_features}) must be"
+            f" divisible by model parallel size ({self.world_size})"
         )
 
         self.input_size_per_partition = in_features // self.world_size
diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py
index 27d8a32f0693f..5a726dd5ab141 100644
--- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py
+++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py
@@ -429,10 +429,8 @@ def _c_softmax_with_cross_entropy(
     label_dims = len(list(label.shape))
     if input_dims - 1 != label_dims and input_dims != label_dims:
         raise ValueError(
-            'Expected input_dims - 1 = label_dims or input_dims == label_dims\
-             (got input_dims{}, label_dims{})'.format(
-                input_dims, label_dims
-            )
+            f'Expected input_dims - 1 = label_dims or input_dims == label_dims\
+             (got input_dims{input_dims}, label_dims{label_dims})'
         )
     if input_dims - 1 == label_dims:
         label = paddle.unsqueeze(label, axis=-1)
@@ -842,9 +840,7 @@ def split(
     ]
     assert operation in supported_operations, (
         "The operation for "
-        "paddle.distributed.split must be one of {}.".format(
-            supported_operations
-        )
+        f"paddle.distributed.split must be one of {supported_operations}."
     )
     if in_dynamic_mode():
         raise ValueError(
@@ -872,9 +868,7 @@ def split(
         )
         assert size[0] % num_partitions == 0, (
             "The length of the vocabulary must be divisible by num_partitions "
-            "but received vocabulary={} num_partitions={}".format(
-                size[0], num_partitions
-            )
+            f"but received vocabulary={size[0]} num_partitions={num_partitions}"
         )
 
         per_part_size = size[0] // num_partitions
@@ -893,10 +887,8 @@ def split(
         should_split = False
         if axis == 0:
             assert size[0] % num_partitions == 0, (
-                "Number of rows of the weight for linear ({}) must be"
-                " divisible by num_partitions ({})".format(
-                    size[0], num_partitions
-                )
+                f"Number of rows of the weight for linear ({size[0]}) must be"
+                f" divisible by num_partitions ({num_partitions})"
             )
             per_part_size = size[0] // num_partitions
             linear_size = (per_part_size, size[1])
@@ -905,17 +897,15 @@ def split(
 
         elif axis == 1:
             assert size[1] % num_partitions == 0, (
-                "Number of column of the weight for linear ({}) must be"
-                " divisible by num_partitions ({})".format(
-                    size[1], num_partitions
-                )
+                f"Number of column of the weight for linear ({size[1]}) must be"
+                f" divisible by num_partitions ({num_partitions})"
             )
             per_part_size = size[1] // num_partitions
             linear_size = (size[0], per_part_size)
         else:
             raise ValueError(
                 "The value of axis must be 0 or 1, but the value "
-                "given is {}.".format(axis)
+                f"given is {axis}."
             )
 
         linear_out = _parallel_linear(
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
index 1190a03774129..071e1a07ce027 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py
@@ -206,9 +206,7 @@ def _partition_parameters(self):
             numel = reduce(lambda x, y: x * y, param.shape, 1)
             assert (
                 numel > 0
-            ), "param [{}] should larger than 0, but it is [{}]".format(
-                param.name, numel
-            )
+            ), f"param [{param.name}] should larger than 0, but it is [{numel}]"
             sizes[rank] += numel
 
         return mapping
@@ -341,9 +339,7 @@ def step(self):
                     and param.regularizer is not None
                 ):
                     raise ValueError(
-                        "param {} should not has the regularizer attribute".format(
-                            param.name
-                        )
+                        f"param {param.name} should not has the regularizer attribute"
                     )
                 if param.stop_gradient:
                     continue
@@ -406,9 +402,7 @@ def _set_inner_opt_attr(self, attr_name, value):
         inner_opt_name = '_inner_opt'
         if not isinstance(attr_name, str):
             raise TypeError(
-                "attr_name should be str type, but is {}".format(
-                    type(attr_name)
-                )
+                f"attr_name should be str type, but is {type(attr_name)}"
             )
         while hasattr(inner_opt, attr_name):
             setattr(inner_opt, attr_name, value)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
index 79bcc134656f5..45279379ec3fd 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
@@ -53,16 +53,12 @@ def _can_update(self, optimizer):
 
     def _disable_strategy(self, dist_strategy):
         raise NotImplementedError(
-            "you should implement disable strategy in {}".format(
-                type(self).__name__
-            )
+            f"you should implement disable strategy in {type(self).__name__}"
         )
 
     def _enable_strategy(self, dist_strategy, context=None):
         raise NotImplementedError(
-            "you should implement enable strategy in {}".format(
-                type(self).__name__
-            )
+            f"you should implement enable strategy in {type(self).__name__}"
         )
 
     def apply_gradients(self, params_grads):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py
index 5c8038c221558..0af5824ce3b6f 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py
@@ -103,7 +103,7 @@ def prune_fp16(block, shard, reduced_grads_to_param, ring_ids):
             if param_name not in shard.global_params:
                 raise ValueError(
                     "Output 'X' of cast_op must be a grad of"
-                    "model param, but {} is not a grad".format(output_name)
+                    f"model param, but {output_name} is not a grad"
                 )
             if output_name in reduced_grads_to_param:
                 continue
@@ -131,7 +131,7 @@ def prune_fp16(block, shard, reduced_grads_to_param, ring_ids):
                     if param_name not in shard.global_params:
                         raise ValueError(
                             "Input 'X' of check_finite_and_unscale must"
-                            "be grads, but {} is not a grad".format(input_name)
+                            f"be grads, but {input_name} is not a grad"
                         )
                     if shard.has_param(param_name):
                         reversed_x.append(input_name)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
index 44a584ac6d0b2..f3301cbeb24d5 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
@@ -167,14 +167,14 @@ def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1):
                         _status = dp_grads_status[var_name]
                     if _status == -1:
                         raise ValueError(
-                            "{} is not generated, but you are"
-                            "trying to all-reduce it".format(var_name)
+                            f"{var_name} is not generated, but you are"
+                            "trying to all-reduce it"
                         )
                     if _status == 0:
                         raise ValueError(
                             "There should be a sync_calc op "
-                            "after generate Var: {} and before the"
-                            "c_allreduce_sum op".format(var_name)
+                            f"after generate Var: {var_name} and before the"
+                            "c_allreduce_sum op"
                         )
                     assert _status == 1
                     if var_name in vars_status:
@@ -212,7 +212,7 @@ def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1):
                     if vars_status[input_name] != 3:
                         raise ValueError(
                             "There should be a sync_comm op "
-                            "after allreduce the Var: {}".format(input_name)
+                            f"after allreduce the Var: {input_name}"
                         )
                     raise ValueError(
                         "The reduce output grad [{}] should NOT be be used in Non-root rank.".format(
@@ -224,13 +224,13 @@ def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1):
                         if dp_grads_status[input_name] != 3:
                             raise ValueError(
                                 "There should be a sync_comm op "
-                                "after allreduce the Var: {}".format(input_name)
+                                f"after allreduce the Var: {input_name}"
                             )
                     else:
                         if dp_grads_status[input_name] != 5:
                             raise ValueError(
                                 "The grad in shard should be allreduce and sync"
-                                "twice before usage {}".format(input_name)
+                                f"twice before usage {input_name}"
                             )
 
             for output_name in op.desc.output_arg_names():
@@ -538,9 +538,7 @@ def insert_fused_reduce_ops(
         root_id = get_grad_device(var, shard)
         assert 0 <= root_id < nranks, (
             "root_id should >=0 and < nranks, "
-            "but now nranks={}, the root_id of var={} is {}".format(
-                nranks, var, root_id
-            )
+            f"but now nranks={nranks}, the root_id of var={var} is {root_id}"
         )
         device_to_vars[root_id].append(var)
 
@@ -621,9 +619,7 @@ def insert_reduce_ops(
         root_id = get_grad_device(grad_var, shard)
         assert (
             root_id >= 0
-        ), "root id should be a positive int, but now root id is {}".format(
-            root_id
-        )
+        ), f"root id should be a positive int, but now root id is {root_id}"
         if rank is not None and rank == root_id:
             grad_in_this_device.append(var)
         block._insert_op_without_sync(
@@ -660,9 +656,7 @@ def insert_fused_broadcast_param_ops(
         root_id = shard.device(var)
         assert 0 <= root_id < nranks, (
             "root_id should >=0 and < nranks, "
-            "but now nranks={}, the root_id of var={} is {}".format(
-                nranks, var, root_id
-            )
+            f"but now nranks={nranks}, the root_id of var={var} is {root_id}"
         )
         device_to_vars[root_id].append(var)
 
@@ -731,9 +725,7 @@ def insert_broadcast_param_ops(
         root_id = shard.device(param)
         assert (
             root_id >= 0
-        ), "root id should be a positive int, but now root id is {}".format(
-            root_id
-        )
+        ), f"root id should be a positive int, but now root id is {root_id}"
         if rank is not None and rank == root_id:
             param_in_this_device.append(param)
         block._insert_op_without_sync(
@@ -801,9 +793,7 @@ def fuse_opt_broadcast_param_ops(
 
 
 def get_grad_device(grad_name, shard):
-    assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format(
-        grad_name
-    )
+    assert "@GRAD" in grad_name, f"[{grad_name}] should be a grad variable."
     base_name = None
     # NOTE: mind the traversal order
     possible_suffixes = [
@@ -905,7 +895,7 @@ def insert_scale_loss_grad_ops(block, scale=1.0):
         if is_loss_grad_op(op):
             assert op.type == 'fill_constant', (
                 "loss_grad_op must be fill_constant op, "
-                "but this op is {}".format(op.type)
+                f"but this op is {op.type}"
             )
             assert op.has_attr('value')
             loss_scale = float(op.attr('value'))
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
index 0a841cf243d14..2ff259be18b79 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
@@ -33,7 +33,7 @@ def prune_weight_decay(self, block, shard):
             if OP_ROLE_VAR_KEY not in op.attr_names:
                 raise ValueError(
                     "The Weight Dacay op should hold op_role_var attribute"
-                    "but the {} op does not hold op_role_var".format(op.type)
+                    f"but the {op.type} op does not hold op_role_var"
                 )
             op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
             if not shard.has_param(op_role_var[0]):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
index be6aad3208d09..1ee99b10854b9 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -248,9 +248,7 @@ def _get_hybrid_dp_mode(self):
             self.scale_gradient = gradient_scale_configs['scale_gradient']
         if gm_acc_step > 1:
             logger.info(
-                "Gradient merge in [{}], acc step = [{}]".format(
-                    gm_mode, gm_acc_step
-                )
+                f"Gradient merge in [{gm_mode}], acc step = [{gm_acc_step}]"
             )
 
         optimizer_sharding = False
@@ -865,9 +863,7 @@ def _split_program(self, block):
                             )
                             assert (
                                 input_name not in self._forward_remain_anchors
-                            ), "segment anchor [{}] met twice !".format(
-                                input_name
-                            )
+                            ), f"segment anchor [{input_name}] met twice !"
                             self._backward_remain_anchors.remove(input_name)
                             self._forward_remain_anchors.append(input_name)
                 elif int(op.attr('op_role')) == int(OpRole.Forward):
@@ -1766,9 +1762,7 @@ def create_persistable_gradients_and_insert_merge_ops(
         for grad_name in grad_names:
             assert (
                 get_grad_device(grad_name, shard) == shard.worker_idx
-            ), "try to merge gradient not belong to current shard: [{}]".format(
-                grad_name
-            )
+            ), f"try to merge gradient not belong to current shard: [{grad_name}]"
             persistable_grad_name = grad_name + '@GradiantMerge'
             assert (
                 grad_name not in self._grad2merged_grad
diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
index 8773f9d82ca8b..4222d80a4e374 100755
--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
@@ -120,9 +120,7 @@ def check_sanity():
                     assert part >= 0, f"part[{part}] should be greater than 0"
                     assert (
                         part <= self.num_items
-                    ), "part[{}] should be less than num_items[{}]".format(
-                        part, self.num_items
-                    )
+                    ), f"part[{part}] should be less than num_items[{self.num_items}]"
 
             check_sanity()
 
@@ -391,10 +389,8 @@ def __init__(
             # construct default topology
             if world_size % num_stages != 0:
                 raise ValueError(
-                    "should provide correct num_stages({}) "
-                    "which can be divided by world_size({})".format(
-                        num_stages, world_size
-                    )
+                    f"should provide correct num_stages({num_stages}) "
+                    f"which can be divided by world_size({world_size})"
                 )
             dp_num = world_size // num_stages
             self._topo = fleet.CommunicateTopology(
@@ -754,10 +750,8 @@ def forward(self, input, chunk_id=None):
                 self._num_virtual_pipeline_stages > 1
             ), "chunk_id is only valid when using virtual pipeline stage"
             assert chunk_id < len(self._model_chunks), (
-                "The virtual pipeline only has {} chunks, "
-                "but received chunk_id {}.".format(
-                    len(self._model_chunks), chunk_id
-                )
+                f"The virtual pipeline only has {len(self._model_chunks)} chunks, "
+                f"but received chunk_id {chunk_id}."
             )
             # Get the target model chunk.
             model_chunk = self._model_chunks[chunk_id]
diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
index d90185d36d466..a3e5b406be79e 100755
--- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
+++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py
@@ -250,9 +250,7 @@ def __init__(self, layers, hcg, strategy):
         self._compute_loss = True
 
         logger.info(
-            "Pipeline Info -- num_stages: {}, stage_id: {}".format(
-                self.num_stages, self.stage_id
-            )
+            f"Pipeline Info -- num_stages: {self.num_stages}, stage_id: {self.stage_id}"
         )
 
         if self.use_model_parallel:
diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
index bdbdc797b2b7b..4600b78702b75 100644
--- a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
+++ b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py
@@ -112,9 +112,7 @@ def forward(
             or 'xpu:' in paddle.get_device()
             or cur_device.split(':')[0]
             in paddle.device.get_all_custom_device_type()
-        ), "Recompute with RNG is not support current device: {}.".format(
-            cur_device
-        )
+        ), f"Recompute with RNG is not support current device: {cur_device}."
 
         # TODO support AMP
         tracer = framework._dygraph_tracer()
diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
index ee3bd60b46b9e..865571cfeca6f 100644
--- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
+++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py
@@ -111,9 +111,7 @@ def _in_varnames(var):
             var_path = os.path.join(dirname, origin_varname)
             if not os.path.exists(var_path):
                 raise ValueError(
-                    "SelectedRows var {} can not find at {}".format(
-                        new_var.name, var_path
-                    )
+                    f"SelectedRows var {new_var.name} can not find at {var_path}"
                 )
 
             if os.path.isfile(var_path):
@@ -309,9 +307,7 @@ def _get_executor(self):
             )
             if heter_worker_device_guard not in ["GPU", "XPU", "CPU"]:
                 raise ValueError(
-                    "Heter Worker Not Support Device {}".format(
-                        heter_worker_device_guard
-                    )
+                    f"Heter Worker Not Support Device {heter_worker_device_guard}"
                 )
             if self.role_maker._is_heter_worker():
                 if heter_worker_device_guard == "GPU":
diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py
index 43440ba37fc3a..a14c337a4fad1 100644
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -480,9 +480,7 @@ def to_string(self, indent):
         attrs += f"fetch_var_name: \"{str(self.fetch_var_name)}\" "
         attrs += f"startup_program_id: {str(self.startup_program_id)} "
         attrs += f"main_program_id: {str(self.main_program_id)} "
-        attrs += "tensor_table_class: \"{}\" ".format(
-            str(self.tensor_table_class)
-        )
+        attrs += f"tensor_table_class: \"{str(self.tensor_table_class)}\" "
         attrs += "\n"
         return program_str.format(
             conv_indent(indent), attrs, conv_indent(indent)
@@ -898,9 +896,7 @@ def _get_executor(self):
                 heter_device_type = self.role_maker._heter_device_type().upper()
                 if heter_device_type not in ["GPU", "XPU", "CPU"]:
                     raise ValueError(
-                        "Heter Worker Not Support Device {}".format(
-                            heter_device_type
-                        )
+                        f"Heter Worker Not Support Device {heter_device_type}"
                     )
                 if heter_device_type == "GPU":
                     executor = Executor(
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index 770cef9c551e6..11617981d9d4b 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -171,9 +171,7 @@ def mkdirs(self, fs_path):
                 client.mkdirs("test_mkdirs")
                 client.delete("test_mkdirs")
         """
-        assert not os.path.isfile(fs_path), "{} is already a file".format(
-            fs_path
-        )
+        assert not os.path.isfile(fs_path), f"{fs_path} is already a file"
         os.makedirs(fs_path, exist_ok=True)
 
     def rename(self, fs_src_path, fs_dst_path):
@@ -401,9 +399,7 @@ def handler(*args, **kwargs):
                 except ExecuteError as e:
                     if time.time() - start >= time_out:
                         raise FSTimeOut(
-                            "args:{} timeout:{}".format(
-                                args, time.time() - start
-                            )
+                            f"args:{args} timeout:{time.time() - start}"
                         )
 
                     time.sleep(inter)
diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
index a5723f856e661..9c44fc49fff67 100644
--- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
+++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
@@ -272,9 +272,9 @@ def _init_communication_group(self):
                 dev_ids.append(cur_id)
         num_pp = len(dev_ids)
         num_pp = max(1, num_pp)
-        assert num_pp == self.num_pp, 'num_pp: {}, self.num_pp: {}'.format(
-            num_pp, self.num_pp
-        )
+        assert (
+            num_pp == self.num_pp
+        ), f'num_pp: {num_pp}, self.num_pp: {self.num_pp}'
 
         collective_helper = fleet.meta_optimizers.common.CollectiveHelper(
             self.role_maker, wait_port=False
@@ -533,9 +533,7 @@ def _check_validation(self, block):
             )
 
             device = op.attr(self._op_device_key)
-            assert device, "{} has no {} set.".format(
-                op.type, self._op_device_key
-            )
+            assert device, f"{op.type} has no {self._op_device_key} set."
             if device.split(':')[1] == "all":
                 continue
 
diff --git a/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py b/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py
index 1a8dd92fc8518..ad640a7200d0d 100644
--- a/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py
+++ b/python/paddle/distributed/fleet/utils/pp_parallel_adaptor.py
@@ -544,9 +544,9 @@ def parse_args():
     if args.dst_pp is None:
         args.dst_pp = args.src_pp
 
-    assert args.src_mp == args.dst_mp, "src mp {} dst mp {}".format(
-        args.src_mp, args.dst_mp
-    )
+    assert (
+        args.src_mp == args.dst_mp
+    ), f"src mp {args.src_mp} dst mp {args.dst_mp}"
 
     assert args.method in [
         'peek_model',
diff --git a/python/paddle/distributed/fleet/utils/sequence_parallel_utils.py b/python/paddle/distributed/fleet/utils/sequence_parallel_utils.py
index ae5dec21b4e8f..1b492bb00f560 100644
--- a/python/paddle/distributed/fleet/utils/sequence_parallel_utils.py
+++ b/python/paddle/distributed/fleet/utils/sequence_parallel_utils.py
@@ -258,10 +258,8 @@ def __init__(
 
         self.gather_output = gather_output
         assert out_features % self.world_size == 0, (
-            "Number of column of the weight for linear ({}) must be"
-            " divisible by model parallel size ({})".format(
-                out_features, self.world_size
-            )
+            f"Number of column of the weight for linear ({out_features}) must be"
+            f" divisible by model parallel size ({self.world_size})"
         )
         self.output_size_per_partition = out_features // self.world_size
 
@@ -380,10 +378,8 @@ def __init__(
 
         self.is_mp = self.world_size > 1
         assert in_features % self.world_size == 0, (
-            "Number of row of the weight for linear ({}) must be"
-            " divisible by model parallel size ({})".format(
-                in_features, self.world_size
-            )
+            f"Number of row of the weight for linear ({in_features}) must be"
+            f" divisible by model parallel size ({self.world_size})"
         )
 
         self.input_size_per_partition = in_features // self.world_size
diff --git a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py
index e370042927434..9ca0a7fdfc89f 100644
--- a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py
+++ b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py
@@ -268,9 +268,7 @@ def insert_synchronization(
 
     assert (
         len(unsync_param_names) == 0
-    ), "The following param is unsync by some error: {}".format(
-        unsync_param_names
-    )
+    ), f"The following param is unsync by some error: {unsync_param_names}"
 
 
 def add_extra_synchronization(
@@ -308,9 +306,7 @@ def add_extra_synchronization(
 
     logger.info("Constructing Extra Parameter Synchronization.")
     logger.info(
-        "Tensor Parallel Degree: {}, Synchronization mode: {}".format(
-            tp_degree, sync_mode
-        )
+        f"Tensor Parallel Degree: {tp_degree}, Synchronization mode: {sync_mode}"
     )
 
     # adopt for pipeline opt
diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py
index 5126f687ccb0a..047812ea965e8 100644
--- a/python/paddle/distributed/launch/context/device.py
+++ b/python/paddle/distributed/launch/context/device.py
@@ -132,8 +132,8 @@ def get_custom_devices_count(device_type):
             custom_device_type = os.getenv('PADDLE_XCCL_BACKEND')
             dev._dtype = DeviceType.CUSTOM_DEVICE
             num = get_custom_devices_count(custom_device_type)
-            visible_devices_str = '{}_VISIBLE_DEVICES'.format(
-                custom_device_type.upper()
+            visible_devices_str = (
+                f'{custom_device_type.upper()}_VISIBLE_DEVICES'
             )
             if visible_devices_str in os.environ:
                 visible_devices = os.getenv(visible_devices_str)
diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py
index 35bd244bb2f89..13d8ef403504a 100644
--- a/python/paddle/distributed/launch/controllers/collective.py
+++ b/python/paddle/distributed/launch/controllers/collective.py
@@ -75,9 +75,7 @@ def _build_pod_with_tuner(self):
                     "PADDLE_CURRENT_ENDPOINT": endpoint,
                     "FLAGS_selected_gpus": "0",
                     "PADDLE_AUTO_PARALLEL_STAGE": "tuner",
-                    "PADDLE_GLOBAL_SIZE": "{}".format(
-                        pod_replicas * int(self.ctx.args.nnodes)
-                    ),
+                    "PADDLE_GLOBAL_SIZE": f"{pod_replicas * int(self.ctx.args.nnodes)}",
                     "PADDLE_LOCAL_SIZE": f"{pod_replicas}",
                 }
                 log_file = "tuner.log"
diff --git a/python/paddle/distributed/launch/job/pod.py b/python/paddle/distributed/launch/job/pod.py
index 85cf1fed34be4..30159482081d2 100644
--- a/python/paddle/distributed/launch/job/pod.py
+++ b/python/paddle/distributed/launch/job/pod.py
@@ -46,8 +46,8 @@ def __init__(self):
         super().__init__()
 
     def __str__(self):
-        return "Pod: {}, replicas {}, status {}".format(
-            self.name, self.replicas, self.status
+        return (
+            f"Pod: {self.name}, replicas {self.replicas}, status {self.status}"
         )
 
     def failed_container(self):
diff --git a/python/paddle/distributed/launch/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py
index 23e58b0e65f79..158abb7a5d9b7 100644
--- a/python/paddle/distributed/launch/plugins/__init__.py
+++ b/python/paddle/distributed/launch/plugins/__init__.py
@@ -71,9 +71,7 @@ def test_mode(ctx):
         ctx.logger.info('Paddle Distributed Test begin...')
         if int(ctx.args.nnodes) < 2:
             ctx.args.nnodes = 2
-        ctx.args.training_script = '{}/test.py'.format(
-            os.path.dirname(__file__)
-        )
+        ctx.args.training_script = f'{os.path.dirname(__file__)}/test.py'
 
 
 enabled_plugins = [
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index 843c9eb5d9c0e..3815d0f475fbe 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -700,8 +700,8 @@ def __init__(self):
 
         # imperative only support one gpu or xpu
         if self._device_type != "":
-            FLAGS_selected_custom_devices = 'FLAGS_selected_{}s'.format(
-                self._device_type
+            FLAGS_selected_custom_devices = (
+                f'FLAGS_selected_{self._device_type}s'
             )
             selected_custom_devices = os.getenv(
                 FLAGS_selected_custom_devices, "0"
@@ -1014,8 +1014,8 @@ def train():
         )
 
     if backend == "xccl":
-        FLAGS_selected_custom_devices = 'FLAGS_selected_{}s'.format(
-            parallel_env.device_type
+        FLAGS_selected_custom_devices = (
+            f'FLAGS_selected_{parallel_env.device_type}s'
         )
         _check_var_exists(FLAGS_selected_custom_devices)
     else:
diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py
index 322adfb5da310..53bdca47c48a5 100644
--- a/python/paddle/distributed/passes/auto_parallel_amp.py
+++ b/python/paddle/distributed/passes/auto_parallel_amp.py
@@ -215,9 +215,7 @@ def build_state(self):
                         fwd_op_id = self.grad_op_to_op_map[
                             op.desc.original_id()
                         ]
-                        assert fwd_op_id in self._op_fp16_dict, "{}".format(
-                            str(op)
-                        )
+                        assert fwd_op_id in self._op_fp16_dict, f"{str(op)}"
                         self._op_fp16_dict[
                             op.desc.original_id()
                         ] = self._is_fp16_op(fwd_op_id)
@@ -390,17 +388,13 @@ def _cast_block(self, block):
                     for in_var_name in op.input_arg_names:
                         assert (
                             in_var.dtype == block.var(in_var_name).dtype
-                        ), "{}, {}, {}".format(
-                            in_var, block.var(in_var_name), str(op)
-                        )
+                        ), f"{in_var}, {block.var(in_var_name)}, {str(op)}"
                     out_var.desc.set_dtype(in_var.dtype)
                 elif int(op.attr('op_role')) == 257:
                     pass
                 else:
                     raise ValueError(
-                        "'{}' op is not supported in the complete amp pass.".format(
-                            op.type
-                        )
+                        f"'{op.type}' op is not supported in the complete amp pass."
                     )
             idx += num_cast_ops + 1
         block._sync_with_cpp()
diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
index acc99d562aac4..3cda24f1a0f64 100644
--- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
+++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py
@@ -217,7 +217,7 @@ def _scale_backward_initial_grad(self):
             if is_loss_grad_op(op):
                 assert op.type == 'fill_constant', (
                     "loss_grad_op must be fill_constant op, "
-                    "but this op is {}".format(op.type)
+                    f"but this op is {op.type}"
                 )
                 assert op.has_attr('value')
                 loss_scale = float(op.attr('value'))
@@ -498,9 +498,7 @@ def _update_program(self, grad_groups):
             allreduce_op = block.ops[group.allreduce_op_idx]
             assert (
                 allreduce_op.type == 'c_allreduce_sum'
-            ), "should found c_allreduce_sum op but found {}".format(
-                str(allreduce_op)
-            )
+            ), f"should found c_allreduce_sum op but found {str(allreduce_op)}"
             allreduce_op_dist_attr = (
                 self.dist_context.get_op_dist_attr_for_program(allreduce_op)
             )
@@ -699,9 +697,7 @@ def summary(self, grad_groups=[]):
                 fused_grads
             )
             self._logger.debug(
-                "the following [{}] gradients are not fused: ".format(
-                    len(individual_grads)
-                )
+                f"the following [{len(individual_grads)}] gradients are not fused: "
             )
             self._logger.debug(f"individual gradient {individual_grads}")
 
@@ -764,9 +760,7 @@ def add(self, grad_var, ring_id, i):
             grad_op = self.ops[grad_op_idx]
             assert (
                 grad_var.name in grad_op.output_arg_names
-            ), "grad [{}] should be output of {}".format(
-                grad_var.name, str(grad_op)
-            )
+            ), f"grad [{grad_var.name}] should be output of {str(grad_op)}"
             self.coalesce_op_idx = grad_op_idx
 
     def finalize(self):
diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py
index 8f9927ae37c28..82475251ee516 100644
--- a/python/paddle/distributed/passes/auto_parallel_fp16.py
+++ b/python/paddle/distributed/passes/auto_parallel_fp16.py
@@ -368,9 +368,7 @@ def cast_block(self, block):
                     for in_var_name in op.input_arg_names:
                         assert (
                             in_var.dtype == block.var(in_var_name).dtype
-                        ), "{}, {}, {}".format(
-                            in_var, block.var(in_var_name), str(op)
-                        )
+                        ), f"{in_var}, {block.var(in_var_name)}, {str(op)}"
                     out_var.desc.set_dtype(in_var.dtype)
 
             idx += num_cast_ops + 1
@@ -479,9 +477,7 @@ def _insert_backward_cast_ops(
             out_var = block.var(out_var_name)
             if _keep_fp32_output(op, out_var.name):
                 continue
-            assert out_var.dtype == dst_dtype, "{}, {}".format(
-                str(out_var), dst_dtype
-            )
+            assert out_var.dtype == dst_dtype, f"{str(out_var)}, {dst_dtype}"
 
         for (
             cast_name,
@@ -495,9 +491,7 @@ def _insert_backward_cast_ops(
             if slot_name in op.input_names:
                 assert src_name in op.input(
                     slot_name
-                ), "var: {} not in op's {}. {}".format(
-                    src_name, slot_name, str(op)
-                )
+                ), f"var: {src_name} not in op's {slot_name}. {str(op)}"
                 src_var_dist_attr = grad_op_attr.get_input_dist_attr(src_name)
                 assert src_var_dist_attr is not None
                 op._rename_input(src_name, cast_name)
@@ -770,9 +764,7 @@ def _apply_single_impl(self, main_program, startup_program, context):
 
         else:
             raise NotImplementedError(
-                "target dtype [{}] is for amp o2 not supported yet.".format(
-                    self.target_dtype
-                )
+                f"target dtype [{self.target_dtype}] is for amp o2 not supported yet."
             )
         global __target_dtype__
         __target_dtype__ = __target_dtype
diff --git a/python/paddle/distributed/passes/auto_parallel_grad_clip.py b/python/paddle/distributed/passes/auto_parallel_grad_clip.py
index 3bee97a90dbc4..327b208518ee8 100644
--- a/python/paddle/distributed/passes/auto_parallel_grad_clip.py
+++ b/python/paddle/distributed/passes/auto_parallel_grad_clip.py
@@ -280,9 +280,7 @@ def _partition_parameters(self, params):
                 numel = reduce(lambda x, y: x * y, param.shape, 1)
                 assert (
                     numel > 0
-                ), "param [{}] should larger than 0, but it is [{}]".format(
-                    param.name, numel
-                )
+                ), f"param [{param.name}] should larger than 0, but it is [{numel}]"
                 sizes[rank] += numel
         return mapping
 
diff --git a/python/paddle/distributed/passes/auto_parallel_pipeline.py b/python/paddle/distributed/passes/auto_parallel_pipeline.py
index 9e2a06778854a..1b14560078fad 100644
--- a/python/paddle/distributed/passes/auto_parallel_pipeline.py
+++ b/python/paddle/distributed/passes/auto_parallel_pipeline.py
@@ -85,7 +85,7 @@ def _apply_single_impl(self, main_program, startup_program, context):
         else:
             raise ValueError(
                 "Now only 'F-then-B', '1F1B' and 'stream' are supported."
-                "The given value is {}.".format(self._mode)
+                f"The given value is {self._mode}."
             )
 
     def _insert_sync_ops_for_stream(self):
diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py
index 41a4b357c7080..f7b211fdc4ba4 100644
--- a/python/paddle/distributed/passes/auto_parallel_sharding.py
+++ b/python/paddle/distributed/passes/auto_parallel_sharding.py
@@ -804,10 +804,7 @@ def _fuse_overlap_parameter_comm_stage_two(self, sharding_info):
                 )
             )
             _logger.debug(
-                "Bucket[{}] parameters: {}.".format(
-                    i,
-                    [p.name for p in param_group.vars],
-                )
+                f"Bucket[{i}] parameters: {[p.name for p in param_group.vars]}."
             )
 
             broadcast_var_to_group_map[
@@ -1647,9 +1644,7 @@ def partition_by_greedy_even(params, group_size):
         numel = reduce(lambda x, y: x * y, param.shape, 1)
         assert (
             numel > 0
-        ), "param [{}] should larger than 0, but it is [{}]".format(
-            param.name, numel
-        )
+        ), f"param [{param.name}] should larger than 0, but it is [{numel}]"
         sizes[rank] += numel
 
     return mapping
@@ -1664,9 +1659,7 @@ def partition_parameters(params, group_size, algor="greedy_even"):
     _logger.info("Sharding Parameter Partition:")
     for k, v in rank_to_params.items():
         _logger.info(
-            "Rank:{}, Parameter Size:{} MB.".format(
-                k, sum([get_var_size(var) for var in v])
-            )
+            f"Rank:{k}, Parameter Size:{sum([get_var_size(var) for var in v])} MB."
         )
         _logger.info(f"Params in this rank: {[var.name for var in v]}.")
 
diff --git a/python/paddle/distributed/passes/pipeline_scheduler_pass.py b/python/paddle/distributed/passes/pipeline_scheduler_pass.py
index a473e7b095eaf..7e7c69b04fb43 100644
--- a/python/paddle/distributed/passes/pipeline_scheduler_pass.py
+++ b/python/paddle/distributed/passes/pipeline_scheduler_pass.py
@@ -405,9 +405,7 @@ def apply_pass(main_program, startup_program, pass_name, pass_attr={}):
     assert pass_name in [
         "FThenB",
         "1F1B",
-    ], "pipeline scheduler only support FThenB and 1F1B, but recieve {}".format(
-        pass_name
-    )
+    ], f"pipeline scheduler only support FThenB and 1F1B, but recieve {pass_name}"
 
     if pass_name == "1F1B":
         # TODO(Ruibiao): Move FLAGS_1f1b_backward_forward_overlap and
diff --git a/python/paddle/distributed/ps/coordinator.py b/python/paddle/distributed/ps/coordinator.py
index f433b58ae20dd..6926775241546 100755
--- a/python/paddle/distributed/ps/coordinator.py
+++ b/python/paddle/distributed/ps/coordinator.py
@@ -88,9 +88,7 @@ def select(self):
         self.parse_from_string()
         for client_id in self.clients_info:
             logger.info(
-                "fl-ps > client {} info : {}".format(
-                    client_id, self.clients_info[client_id]
-                )
+                f"fl-ps > client {client_id} info : {self.clients_info[client_id]}"
             )
             # ......... to implement ...... #
             fl_strategy_desc = the_one_ps_pb2.FLStrategy()
@@ -253,9 +251,7 @@ def pull_fl_strategy(self):
             self._client_ptr.pull_fl_strategy()
         )  # block: wait for coordinator's strategy arrived
         logger.info(
-            "fl-ps > fl client recved fl_strategy(str):\n{}".format(
-                fl_strategy_str
-            )
+            f"fl-ps > fl client recved fl_strategy(str):\n{fl_strategy_str}"
         )
         fl_strategy_desc = the_one_ps_pb2.FLStrategy()
         text_format.Parse(
diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py
index 968b15817ed7a..a32451985472a 100755
--- a/python/paddle/distributed/ps/the_one_ps.py
+++ b/python/paddle/distributed/ps/the_one_ps.py
@@ -73,11 +73,7 @@ def check_embedding_dim(accessor_proto, varname, program_id, context):
     for var in main_program.list_vars():
         if var.name == varname:
             embedding_dim = var.shape[1]
-            print(
-                'new var: {}, {}, {}'.format(
-                    var, embedding_dim, accessor_proto.fea_dim
-                )
-            )
+            print(f'new var: {var}, {embedding_dim}, {accessor_proto.fea_dim}')
             break
 
     fea_dim = accessor_proto.fea_dim
diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py
index 865de4c828308..43a1840584be6 100755
--- a/python/paddle/distributed/ps/utils/public.py
+++ b/python/paddle/distributed/ps/utils/public.py
@@ -651,9 +651,7 @@ def get_the_one_send_context(attrs, split_dense_table=False, ep_list=None):
 def find_heter_ops(program, default_device="cpu"):
     if default_device not in DEVICE_LIST:
         raise ValueError(
-            "Given device {} is not in device list {}".format(
-                default_device, DEVICE_LIST
-            )
+            f"Given device {default_device} is not in device list {DEVICE_LIST}"
         )
 
     def _is_heter_op(op, current_heter_device, default_device="cpu"):
@@ -1153,12 +1151,12 @@ def get_communicate_var_info(
     input_var_reshape_name = []
 
     if type == "forward":
-        block_input_var_name = "forward_joint_{}_{}@Heter".format(
-            block_index - 1, block_index
+        block_input_var_name = (
+            f"forward_joint_{block_index - 1}_{block_index}@Heter"
         )
     else:
-        block_input_var_name = "backward_joint_{}_{}@Heter".format(
-            block_index + 1, block_index
+        block_input_var_name = (
+            f"backward_joint_{block_index + 1}_{block_index}@Heter"
         )
 
     entrance_var_list.sort()
diff --git a/python/paddle/distributed/utils/launch_utils.py b/python/paddle/distributed/utils/launch_utils.py
index eb22fde7bc1e7..b06201dc89472 100644
--- a/python/paddle/distributed/utils/launch_utils.py
+++ b/python/paddle/distributed/utils/launch_utils.py
@@ -35,9 +35,7 @@ def get_cluster_from_args(args, selected_gpus):
     node_rank = node_ips.index(node_ip)
 
     logger.debug(
-        "parsed from args:node_ips:{} node_ip:{} node_rank:{}".format(
-            node_ips, node_ip, node_rank
-        )
+        f"parsed from args:node_ips:{node_ips} node_ip:{node_ip} node_rank:{node_rank}"
     )
 
     free_ports = None
@@ -91,11 +89,9 @@ def get_gpus(selected_gpus):
                 for x in selected_gpus.split(',')
             ]
             logger.info(
-                "Change selected_gpus into reletive values. --ips:{} "
-                "will change into relative_ips:{} according to your "
-                "CUDA_VISIBLE_DEVICES:{}".format(
-                    selected_gpus, gpus, cuda_visible_devices_list
-                )
+                f"Change selected_gpus into reletive values. --ips:{selected_gpus} "
+                f"will change into relative_ips:{gpus} according to your "
+                f"CUDA_VISIBLE_DEVICES:{cuda_visible_devices_list}"
             )
 
     return gpus
@@ -217,9 +213,7 @@ def __init__(self):
         self.rank = None
 
     def __str__(self):
-        return "gpu:{} endpoint:{} rank:{}".format(
-            self.gpus, self.endpoint, self.rank
-        )
+        return f"gpu:{self.gpus} endpoint:{self.endpoint} rank:{self.rank}"
 
     def __eq__(self, t):
         if len(self.gpus) != len(t.gpus):
diff --git a/python/paddle/distributed/utils/nccl_utils.py b/python/paddle/distributed/utils/nccl_utils.py
index 2910bdd0a8d2e..16e445d54bb04 100644
--- a/python/paddle/distributed/utils/nccl_utils.py
+++ b/python/paddle/distributed/utils/nccl_utils.py
@@ -27,9 +27,7 @@ def get_nccl_version_str(ver):
     NCCL_MINOR_VERSION = int(ver // 100)
     NCCL_PATCH_VERSION = int(ver % 100)
 
-    return "{}.{}.{}".format(
-        NCCL_MAJOR_VERSION, NCCL_MINOR_VERSION, NCCL_PATCH_VERSION
-    )
+    return f"{NCCL_MAJOR_VERSION}.{NCCL_MINOR_VERSION}.{NCCL_PATCH_VERSION}"
 
 
 def check_nccl_version_for_p2p():
diff --git a/python/paddle/fft.py b/python/paddle/fft.py
index df9cc318830d4..9600f2159abf6 100644
--- a/python/paddle/fft.py
+++ b/python/paddle/fft.py
@@ -54,9 +54,7 @@
 def _check_normalization(norm):
     if norm not in ['forward', 'backward', 'ortho']:
         raise ValueError(
-            "Unexpected norm: {}. Norm should be forward, backward or ortho".format(
-                norm
-            )
+            f"Unexpected norm: {norm}. Norm should be forward, backward or ortho"
         )
 
 
@@ -79,7 +77,7 @@ def _check_fft_shape(x, s):
     if len(s) > ndim:
         raise ValueError(
             "Length of FFT argument s should not be larger than the rank of input. "
-            "Received s: {}, rank of x: {}".format(s, ndim)
+            f"Received s: {s}, rank of x: {ndim}"
         )
     for size in s:
         if not isinstance(size, int) or size <= 0:
@@ -92,9 +90,7 @@ def _check_fft_axis(x, axis):
         raise ValueError(f"Invalid FFT axis ({axis}), it shoule be an integer.")
     if axis < -ndim or axis >= ndim:
         raise ValueError(
-            "Invalid FFT axis ({}), it should be in range [-{}, {})".format(
-                axis, ndim, ndim
-            )
+            f"Invalid FFT axis ({axis}), it should be in range [-{ndim}, {ndim})"
         )
 
 
@@ -102,14 +98,12 @@ def _check_fft_axes(x, axes):
     ndim = x.ndim
     if not isinstance(axes, Sequence):
         raise ValueError(
-            "Invalid FFT axes ({}), it should be a sequence of integers.".format(
-                axes
-            )
+            f"Invalid FFT axes ({axes}), it should be a sequence of integers."
         )
     if len(axes) > ndim:
         raise ValueError(
             "Length of fft axes should not be larger than the rank of input. "
-            "Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)
+            f"Received, len of axes: {len(axes)}, rank of x: {ndim}"
         )
     for axis in axes:
         if not isinstance(axis, int) or axis < -ndim or axis >= ndim:
@@ -914,9 +908,7 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
-                    s
-                )
+                f"Invalid FFT argument s ({s}), it should be a sequence of 2 integers."
             )
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
@@ -984,9 +976,7 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
-                    s
-                )
+                f"Invalid FFT argument s ({s}), it should be a sequence of 2 integers."
             )
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
@@ -1048,9 +1038,7 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
-                    s
-                )
+                f"Invalid FFT argument s ({s}), it should be a sequence of 2 integers."
             )
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
@@ -1104,9 +1092,7 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
-                    s
-                )
+                f"Invalid FFT argument s ({s}), it should be a sequence of 2 integers."
             )
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
@@ -1153,9 +1139,7 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
-                    s
-                )
+                f"Invalid FFT argument s ({s}), it should be a sequence of 2 integers."
             )
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
@@ -1216,9 +1200,7 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None):
     if s is not None:
         if not isinstance(s, Sequence) or len(s) != 2:
             raise ValueError(
-                "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format(
-                    s
-                )
+                f"Invalid FFT argument s ({s}), it should be a sequence of 2 integers."
             )
     if axes is not None:
         if not isinstance(axes, Sequence) or len(axes) != 2:
diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py
index b1401759c1aad..68971d85653d3 100644
--- a/python/paddle/framework/io.py
+++ b/python/paddle/framework/io.py
@@ -181,9 +181,9 @@ def _build_load_path_and_config(path, config):
     directory_format_exist = os.path.isdir(path)
     if prefix_format_exist and directory_format_exist:
         raise ValueError(
-            "The {}.pdmodel and {} directory exist at the same time, "
+            f"The {path}.pdmodel and {path} directory exist at the same time, "
             "don't know which one to load, please make sure that the specified target "
-            "of ``path`` is unique.".format(path, path)
+            "of ``path`` is unique."
         )
     elif not prefix_format_exist and not directory_format_exist:
         error_msg = "The ``path`` (%s) to load model not exists."
@@ -281,9 +281,7 @@ def _pickle_save(obj, f, protocol):
     # TODO(weixin):add support for BytesIO.
     if not isinstance(protocol, int):
         raise ValueError(
-            "The 'protocol' MUST be `int`, but received {}".format(
-                type(protocol)
-            )
+            f"The 'protocol' MUST be `int`, but received {type(protocol)}"
         )
 
     if protocol < 2 or protocol > 4:
@@ -429,9 +427,7 @@ def _transformed_from_lodtensor(obj):
 def _to_LodTensor(ndarray):
     if not isinstance(ndarray, np.ndarray):
         raise TypeError(
-            'Type of `ndarray` should be numpy.ndarray, but received {}.'.format(
-                type(ndarray)
-            )
+            f'Type of `ndarray` should be numpy.ndarray, but received {type(ndarray)}.'
         )
     t = core.LoDTensor()
     place = _current_expected_place()
@@ -794,9 +790,7 @@ def save(obj, path, protocol=4, **configs):
             os.makedirs(dirname, exist_ok=True)
     elif not _is_memory_buffer(path):
         raise ValueError(
-            "only supports saving objects to file and `BytesIO`, but got {}".format(
-                type(path)
-            )
+            f"only supports saving objects to file and `BytesIO`, but got {type(path)}"
         )
 
     config = _parse_save_config(configs)
@@ -846,9 +840,7 @@ def _legacy_save(obj, path, protocol=2):
 
     if not isinstance(protocol, int):
         raise ValueError(
-            "The 'protocol' MUST be `int`, but received {}".format(
-                type(protocol)
-            )
+            f"The 'protocol' MUST be `int`, but received {type(protocol)}"
         )
 
     if protocol < 2 or protocol > 4:
@@ -1129,9 +1121,7 @@ def load(path, **configs):
                             return program
                     except:
                         raise ValueError(
-                            "`paddle.load` can not parse the file:{}.".format(
-                                path
-                            )
+                            f"`paddle.load` can not parse the file:{path}."
                         )
 
     else:
diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py
index 8d72d0215e93c..ef1260302d658 100644
--- a/python/paddle/framework/random.py
+++ b/python/paddle/framework/random.py
@@ -107,9 +107,7 @@ def get_rng_state(device=None):
             )
     else:
         raise ValueError(
-            "get_rng_state is not implemented for current device: {}".format(
-                place
-            )
+            f"get_rng_state is not implemented for current device: {place}"
         )
 
     return state_list
@@ -203,9 +201,7 @@ def set_rng_state(state_list, device=None):
         core.default_cpu_generator().set_state(state_list[0])
     else:
         raise ValueError(
-            "set_rng_state is not implemented for current device: {}".format(
-                place
-            )
+            f"set_rng_state is not implemented for current device: {place}"
         )
 
 
diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py
index 0f56648841496..1de70114db7e4 100644
--- a/python/paddle/hapi/callbacks.py
+++ b/python/paddle/hapi/callbacks.py
@@ -1300,9 +1300,7 @@ def on_eval_end(self, logs=None):
                 lr = self.model._optimizer._learning_rate
                 if not isinstance(lr, float):
                     warnings.warn(
-                        'Expected learning_rate be float, bug got {}.'.format(
-                            type(lr)
-                        )
+                        f'Expected learning_rate be float, bug got {type(lr)}.'
                     )
                     return
             except Exception as e:
diff --git a/python/paddle/hapi/dynamic_flops.py b/python/paddle/hapi/dynamic_flops.py
index bbd94bd8da8f1..fcae6e4120ac8 100644
--- a/python/paddle/hapi/dynamic_flops.py
+++ b/python/paddle/hapi/dynamic_flops.py
@@ -312,8 +312,6 @@ def add_hooks(m):
     if print_detail:
         table.print_table()
     print(
-        'Total Flops: {}     Total Params: {}'.format(
-            int(total_ops), int(total_params)
-        )
+        f'Total Flops: {int(total_ops)}     Total Params: {int(total_params)}'
     )
     return int(total_ops)
diff --git a/python/paddle/hapi/hub.py b/python/paddle/hapi/hub.py
index ca4502e7a19cc..a9118eb1c6cd0 100644
--- a/python/paddle/hapi/hub.py
+++ b/python/paddle/hapi/hub.py
@@ -53,8 +53,8 @@ def _import_module(name, repo_dir):
 
 def _git_archive_link(repo_owner, repo_name, branch, source):
     if source == 'github':
-        return 'https://github.com/{}/{}/archive/{}.zip'.format(
-            repo_owner, repo_name, branch
+        return (
+            f'https://github.com/{repo_owner}/{repo_name}/archive/{branch}.zip'
         )
     elif source == 'gitee':
         return 'https://gitee.com/{}/{}/repository/archive/{}.zip'.format(
@@ -199,9 +199,7 @@ def list(repo_dir, source='github', force_reload=False):
     """
     if source not in ('github', 'gitee', 'local'):
         raise ValueError(
-            'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'.format(
-                source
-            )
+            f'Unknown source: "{source}". Allowed values: "github" | "gitee" | "local".'
         )
 
     if source in ('github', 'gitee'):
@@ -248,9 +246,7 @@ def help(repo_dir, model, source='github', force_reload=False):
     """
     if source not in ('github', 'gitee', 'local'):
         raise ValueError(
-            'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'.format(
-                source
-            )
+            f'Unknown source: "{source}". Allowed values: "github" | "gitee" | "local".'
         )
 
     if source in ('github', 'gitee'):
@@ -293,9 +289,7 @@ def load(repo_dir, model, source='github', force_reload=False, **kwargs):
     """
     if source not in ('github', 'gitee', 'local'):
         raise ValueError(
-            'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'.format(
-                source
-            )
+            f'Unknown source: "{source}". Allowed values: "github" | "gitee" | "local".'
         )
 
     if source in ('github', 'gitee'):
diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py
index 267e938a6e298..df5791a5fd70d 100644
--- a/python/paddle/hapi/model_summary.py
+++ b/python/paddle/hapi/model_summary.py
@@ -256,10 +256,8 @@ def summary(net, input_size=None, dtypes=None, input=None):
                 item = (item,)
             assert isinstance(
                 item, (tuple, InputSpec)
-            ), 'When input_size is list, \
-            expect item in input_size is a tuple or InputSpec, but got {}'.format(
-                type(item)
-            )
+            ), f'When input_size is list, \
+            expect item in input_size is a tuple or InputSpec, but got {type(item)}'
 
             if isinstance(item, InputSpec):
                 _input_size.append(tuple(item.shape))
diff --git a/python/paddle/hapi/progressbar.py b/python/paddle/hapi/progressbar.py
index c5b25a58c12e1..bb419e9b2a1ef 100644
--- a/python/paddle/hapi/progressbar.py
+++ b/python/paddle/hapi/progressbar.py
@@ -96,9 +96,9 @@ def convert_uint16_to_float(in_list):
         if time_per_unit >= 1 or time_per_unit == 0:
             fps = f' - {time_per_unit:.0f}s/{self.name}'
         elif time_per_unit >= 1e-3:
-            fps = ' - {:.0f}ms/{}'.format(time_per_unit * 1e3, self.name)
+            fps = f' - {time_per_unit * 1e3:.0f}ms/{self.name}'
         else:
-            fps = ' - {:.0f}us/{}'.format(time_per_unit * 1e6, self.name)
+            fps = f' - {time_per_unit * 1e6:.0f}us/{self.name}'
 
         info = ''
         if self._verbose == 1:
diff --git a/python/paddle/incubate/asp/asp.py b/python/paddle/incubate/asp/asp.py
index 671bc0251c1e8..041132047dc71 100644
--- a/python/paddle/incubate/asp/asp.py
+++ b/python/paddle/incubate/asp/asp.py
@@ -1002,9 +1002,9 @@ def set_state_dict(self, state_dict):
         )
         for param_name, var in asp_info.mask_vars.items():
             param_mask_name = ASPHelper._get_mask_name(param_name)
-            assert param_mask_name in state_dict, "The {} is not found.".format(
-                param_mask_name
-            )
+            assert (
+                param_mask_name in state_dict
+            ), f"The {param_mask_name} is not found."
             var.set_value(state_dict[param_mask_name])
             asp_info.update_masks(param_name, var.numpy())
         return self._optimizer.set_state_dict(state_dict)
diff --git a/python/paddle/incubate/asp/supported_layer_list.py b/python/paddle/incubate/asp/supported_layer_list.py
index b0d420fa36b03..0ebc6ea2d3128 100644
--- a/python/paddle/incubate/asp/supported_layer_list.py
+++ b/python/paddle/incubate/asp/supported_layer_list.py
@@ -105,11 +105,7 @@ def add_supported_layer(layer, pruning_func=None):
     elif issubclass(layer, paddle.nn.Layer):
         name = paddle.nn.layer.layers._convert_camel_to_snake(layer.__name__)
     else:
-        assert (
-            "The type of layer should be string of Layer, but got {}!".format(
-                type(layer)
-            )
-        )
+        assert f"The type of layer should be string of Layer, but got {type(layer)}!"
     if pruning_func is None:
         pruning_func = _default_pruning
     _supported_layers_and_prune_func_map_lock.acquire()
diff --git a/python/paddle/incubate/asp/utils.py b/python/paddle/incubate/asp/utils.py
index 1028dcc4dae29..4ed8d7e74d56e 100644
--- a/python/paddle/incubate/asp/utils.py
+++ b/python/paddle/incubate/asp/utils.py
@@ -536,7 +536,7 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
 
     assert isinstance(func_name, MaskAlgo), (
         "func_name argumet of create_mask is only accepted as type MaskAlgo. "
-        "But got {}".format(type(func_name))
+        f"But got {type(func_name)}"
     )
     func = getattr(sys.modules[__name__], func_name.value, None)
     if len(shape) == 1:
@@ -559,7 +559,7 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4):
     else:
         raise ValueError(
             "The dimension of input tensor is not supported in create_mask, "
-            "Only dimension < 4 is supported but got {}".format(len(shape))
+            f"Only dimension < 4 is supported but got {len(shape)}"
         )
 
     mask = func(t, n=n, m=m)
@@ -606,7 +606,7 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
 
     assert type(func_name) == CheckMethod, (
         "func_name argumet of check_sparsity is only accepted as type CheckMethod. "
-        "But got {}".format(type(func_name))
+        f"But got {type(func_name)}"
     )
     func = getattr(sys.modules[__name__], func_name.value, None)
     if len(shape) == 1:
@@ -623,7 +623,7 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4):
     else:
         raise ValueError(
             "The dimension of input tensor is not supported in create_mask, "
-            "Only dimension < 4 is supported but got {}".format(len(shape))
+            f"Only dimension < 4 is supported but got {len(shape)}"
         )
 
     return func(t, n=n, m=m)
diff --git a/python/paddle/incubate/distributed/fleet/fleet_util.py b/python/paddle/incubate/distributed/fleet/fleet_util.py
index 4b232ea8d2c87..afa26ab672314 100644
--- a/python/paddle/incubate/distributed/fleet/fleet_util.py
+++ b/python/paddle/incubate/distributed/fleet/fleet_util.py
@@ -480,8 +480,8 @@ def write_model_donefile(
                     )
                 else:
                     self.rank0_error(
-                        "not write {} because {}/{} already "
-                        "exists".format(donefile_name, day, pass_id)
+                        f"not write {donefile_name} because {day}/{pass_id} already "
+                        "exists"
                     )
             else:
                 with open(donefile_name, "w") as f:
@@ -598,8 +598,8 @@ def write_xbox_donefile(
                     )
                 else:
                     self.rank0_error(
-                        "not write {} because {}/{} already "
-                        "exists".format(donefile_name, day, pass_id)
+                        f"not write {donefile_name} because {day}/{pass_id} already "
+                        "exists"
                     )
             else:
                 with open(donefile_name, "w") as f:
@@ -1021,11 +1021,7 @@ def save_paddle_inference_model(
             if pass_id == "-1":
                 dest = f"{output_path}/{day}/base/dnn_plugin/"
             else:
-                dest = "{}/{}/delta-{}/dnn_plugin/".format(
-                    output_path,
-                    day,
-                    pass_id,
-                )
+                dest = f"{output_path}/{day}/delta-{pass_id}/dnn_plugin/"
             if not client.is_exist(dest):
                 client.makedirs(dest)
 
@@ -1130,11 +1126,7 @@ def save_paddle_params(
             if pass_id == "-1":
                 dest = f"{output_path}/{day}/base/dnn_plugin/"
             else:
-                dest = "{}/{}/delta-{}/dnn_plugin/".format(
-                    output_path,
-                    day,
-                    pass_id,
-                )
+                dest = f"{output_path}/{day}/delta-{pass_id}/dnn_plugin/"
             if not client.is_exist(dest):
                 client.mkdirs(dest)
             client.upload(model_name, dest, multi_processes=5, overwrite=True)
@@ -2048,8 +2040,8 @@ def write_model_donefile(
                     )
                 else:
                     self.rank0_error(
-                        "not write {} because {}/{} already "
-                        "exists".format(donefile_name, day, pass_id)
+                        f"not write {donefile_name} because {day}/{pass_id} already "
+                        "exists"
                     )
             else:
                 with open(donefile_name, "w") as f:
@@ -2165,8 +2157,8 @@ def write_xbox_donefile(
                     )
                 else:
                     self.rank0_info(
-                        "not write {} because {}/{} already "
-                        "exists".format(donefile_name, day, pass_id)
+                        f"not write {donefile_name} because {day}/{pass_id} already "
+                        "exists"
                     )
             else:
                 with open(donefile_name, "w") as f:
diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/distributed_strategy.py b/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
index b7158f41e9892..ddd795d7f5553 100644
--- a/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
+++ b/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/distributed_strategy.py
@@ -212,9 +212,7 @@ def set_program_config(self, config):
                     setattr(self._program_config, key, config[key])
                 else:
                     raise ValueError(
-                        "DistributeTranspilerConfig doesn't have key: {}".format(
-                            key
-                        )
+                        f"DistributeTranspilerConfig doesn't have key: {key}"
                     )
         else:
             raise TypeError(
diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/incubate/distributed/fleet/parameter_server/ir/trainer_pass.py
index 9d9d35864063d..41d4e18ad577f 100644
--- a/python/paddle/incubate/distributed/fleet/parameter_server/ir/trainer_pass.py
+++ b/python/paddle/incubate/distributed/fleet/parameter_server/ir/trainer_pass.py
@@ -713,9 +713,7 @@ def delete_extra_optimizes_pass(program, config):
 def find_heter_ops(program, default_device="cpu"):
     if default_device not in DEVICE_LIST:
         raise ValueError(
-            "Given device {} is not in device list {}".format(
-                default_device, DEVICE_LIST
-            )
+            f"Given device {default_device} is not in device list {DEVICE_LIST}"
         )
 
     def _is_heter_op(op, current_heter_device, default_device="cpu"):
@@ -1461,12 +1459,12 @@ def get_communicate_var_info(
     input_var_reshape_name = []
 
     if type == "forward":
-        block_input_var_name = "forward_joint_{}_{}@Heter".format(
-            block_index - 1, block_index
+        block_input_var_name = (
+            f"forward_joint_{block_index - 1}_{block_index}@Heter"
         )
     else:
-        block_input_var_name = "backward_joint_{}_{}@Heter".format(
-            block_index + 1, block_index
+        block_input_var_name = (
+            f"backward_joint_{block_index + 1}_{block_index}@Heter"
         )
 
     entrance_var_list.sort()
diff --git a/python/paddle/incubate/distributed/fleet/utils.py b/python/paddle/incubate/distributed/fleet/utils.py
index 2be2ac7161071..17d0a4e35e693 100644
--- a/python/paddle/incubate/distributed/fleet/utils.py
+++ b/python/paddle/incubate/distributed/fleet/utils.py
@@ -398,9 +398,7 @@ def try_load_model_vars(
             )
         else:
             logger.info(
-                "load feed vars from files: {}.".format(
-                    feed_config.feeded_vars_filelist
-                )
+                f"load feed vars from files: {feed_config.feeded_vars_filelist}."
             )
             feed_vars = [
                 inference_program.global_block().var(
@@ -455,9 +453,7 @@ def check_saved_vars_try_dump(
         v for v in dump_prog.list_vars() if io_utils.is_persistable(v)
     ]
     logger.info(
-        "persistable vars in dump program: {}".format(
-            [v.name for v in saved_params]
-        )
+        f"persistable vars in dump program: {[v.name for v in saved_params]}"
     )
 
     check_not_expected_ops(dump_prog)
diff --git a/python/paddle/incubate/nn/layer/fused_dropout_nd.py b/python/paddle/incubate/nn/layer/fused_dropout_nd.py
index a820654fa9efc..ded171158fe3d 100644
--- a/python/paddle/incubate/nn/layer/fused_dropout_nd.py
+++ b/python/paddle/incubate/nn/layer/fused_dropout_nd.py
@@ -137,6 +137,4 @@ def forward(self, input):
 
     def extra_repr(self):
         name_str = f', name={self.name}' if self.name else ''
-        return 'p={}, axis={}, mode={}{}'.format(
-            self.p, self.axis, self.mode, name_str
-        )
+        return f'p={self.p}, axis={self.axis}, mode={self.mode}{name_str}'
diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py
index a09cc521a0e79..1626403e26b5a 100644
--- a/python/paddle/incubate/nn/layer/fused_transformer.py
+++ b/python/paddle/incubate/nn/layer/fused_transformer.py
@@ -125,7 +125,7 @@ def __init__(
         super().__init__()
         assert embed_dim > 0, (
             "Expected embed_dim to be greater than 0, "
-            "but received {}".format(embed_dim)
+            f"but received {embed_dim}"
         )
         self._dtype = self._helper.get_default_dtype()
         self._bias_attr = bias_attr
@@ -303,12 +303,10 @@ def __init__(
 
         assert embed_dim > 0, (
             "Expected embed_dim to be greater than 0, "
-            "but received {}".format(embed_dim)
+            f"but received {embed_dim}"
         )
-        assert (
-            num_heads > 0
-        ), "Expected nhead to be greater than 0, " "but received {}".format(
-            num_heads
+        assert num_heads > 0, (
+            "Expected nhead to be greater than 0, " f"but received {num_heads}"
         )
 
         self.normalize_before = normalize_before
@@ -587,9 +585,7 @@ def __init__(
         super().__init__()
         assert (
             d_model > 0
-        ), "Expected d_model to be greater than 0, but received {}".format(
-            d_model
-        )
+        ), f"Expected d_model to be greater than 0, but received {d_model}"
         assert (
             dim_feedforward > 0
         ), "Expected dim_feedforward to be greater than 0, but received {}".format(
@@ -809,19 +805,15 @@ def __init__(
         self._config.pop("__class__", None)  # py3
 
         super().__init__()
-        assert (
-            d_model > 0
-        ), "Expected d_model to be greater than 0, " "but received {}".format(
-            d_model
+        assert d_model > 0, (
+            "Expected d_model to be greater than 0, " f"but received {d_model}"
         )
-        assert (
-            nhead > 0
-        ), "Expected nhead to be greater than 0, " "but received {}".format(
-            nhead
+        assert nhead > 0, (
+            "Expected nhead to be greater than 0, " f"but received {nhead}"
         )
         assert dim_feedforward > 0, (
             "Expected dim_feedforward to be greater than 0, "
-            "but received {}".format(dim_feedforward)
+            f"but received {dim_feedforward}"
         )
         attn_dropout_rate = (
             dropout_rate if attn_dropout_rate is None else attn_dropout_rate
@@ -1225,12 +1217,10 @@ def __init__(
 
         assert embed_dim > 0, (
             "Expected embed_dim to be greater than 0, "
-            "but received {}".format(embed_dim)
+            f"but received {embed_dim}"
         )
-        assert (
-            num_heads > 0
-        ), "Expected nhead to be greater than 0, " "but received {}".format(
-            num_heads
+        assert num_heads > 0, (
+            "Expected nhead to be greater than 0, " f"but received {num_heads}"
         )
         assert (
             dim_feedforward > 0
diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py
index bc244d9c19da1..93539d4c0c3a8 100644
--- a/python/paddle/incubate/optimizer/functional/bfgs.py
+++ b/python/paddle/incubate/optimizer/functional/bfgs.py
@@ -125,9 +125,7 @@ def minimize_bfgs(
 
     if dtype not in ['float32', 'float64']:
         raise ValueError(
-            "The dtype must be 'float32' or 'float64', but the specified is {}.".format(
-                dtype
-            )
+            f"The dtype must be 'float32' or 'float64', but the specified is {dtype}."
         )
 
     op_name = 'minimize_bfgs'
diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py
index fc482e4ca18b5..810f28b740145 100644
--- a/python/paddle/incubate/optimizer/functional/lbfgs.py
+++ b/python/paddle/incubate/optimizer/functional/lbfgs.py
@@ -126,9 +126,7 @@ def minimize_lbfgs(
     """
     if dtype not in ['float32', 'float64']:
         raise ValueError(
-            "The dtype must be 'float32' or 'float64', but the specified is {}.".format(
-                dtype
-            )
+            f"The dtype must be 'float32' or 'float64', but the specified is {dtype}."
         )
 
     op_name = 'minimize_lbfgs'
diff --git a/python/paddle/incubate/optimizer/gradient_merge.py b/python/paddle/incubate/optimizer/gradient_merge.py
index c449f7405bbaf..022e4dc8fbb7b 100644
--- a/python/paddle/incubate/optimizer/gradient_merge.py
+++ b/python/paddle/incubate/optimizer/gradient_merge.py
@@ -154,9 +154,7 @@ def _remove_op_role_var(self, param, grad):
         op = grad.op
         assert self._is_the_backward_op(
             op
-        ), 'grad.op={} is not the backward op which produces the grad={}'.format(
-            op, grad.name
-        )
+        ), f'grad.op={op} is not the backward op which produces the grad={grad.name}'
 
         block = grad.block
         var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py
index d3ee9f261b2f8..6c0e80b1f5710 100644
--- a/python/paddle/incubate/optimizer/pipeline.py
+++ b/python/paddle/incubate/optimizer/pipeline.py
@@ -105,9 +105,7 @@ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
             raise ValueError(
                 "The 'optimizer' parameter for "
                 "PipelineOptimizer must be an instance of "
-                "{}, but the given type is {}.".format(
-                    valid_optimizers, type(optimizer)
-                )
+                f"{valid_optimizers}, but the given type is {type(optimizer)}."
             )
         self._optimizer = optimizer
 
@@ -511,9 +509,7 @@ def _add_op_device_attr_for_op(self, op, idx, block):
             post_op = self._find_post_op(idx, out_name)
             assert post_op.has_attr(
                 'op_device'
-            ), "{} has no op_device attr for var {}".format(
-                post_op.type, out_name
-            )
+            ), f"{post_op.type} has no op_device attr for var {out_name}"
             device = post_op.attr(self._op_device_key)
             assert device, "The post op must have op_device set."
             op._set_attr(self._op_device_key, device)
@@ -605,8 +601,8 @@ def _add_op_device_attr_for_op(self, op, idx, block):
             ]
             assert op.type in other_known_ops, (
                 "For other ops without "
-                "op_device set, they must be one of {}, but it "
-                "is {}".format(other_known_ops, op.type)
+                f"op_device set, they must be one of {other_known_ops}, but it "
+                f"is {op.type}"
             )
             assert self._is_optimize_op(op)
             op._set_attr(self._op_device_key, f"{self._device}:all")
@@ -670,15 +666,11 @@ def _check_validation(self, block):
 
             assert op.has_attr(
                 self._op_device_key
-            ), "op ({}) has no {} attribute.".format(
-                op.type, self._op_device_key
-            )
+            ), f"op ({op.type}) has no {self._op_device_key} attribute."
 
             device = op.attr(self._op_device_key)
-            assert (
-                device
-            ), "op_device attribute for op " "{} has not been set.".format(
-                op.type
+            assert device, (
+                "op_device attribute for op " f"{op.type} has not been set."
             )
             if device == f"{self._device}:all":
                 continue
@@ -982,7 +974,7 @@ def _insert_send_recv(cur_id, prev_id):
                     else:
                         raise ValueError(
                             "Now only 'F-then-B' and '1F1B' are supported."
-                            "The given value is {}.".format(self.schedule_mode)
+                            f"The given value is {self.schedule_mode}."
                         )
 
                 _insert_send_recv(
@@ -1001,7 +993,7 @@ def _insert_loss_scale(self, block):
             if self._is_loss_grad_op(op):
                 assert op.type == 'fill_constant', (
                     "loss_grad_op must be fill_constant op, "
-                    "but this op is {}".format(op.type)
+                    f"but this op is {op.type}"
                 )
                 assert op.has_attr('value')
                 loss_scale = float(op.attr('value'))
@@ -1580,8 +1572,8 @@ def _process_persistable_vars_in_multi_sections(
                         continue
                     if var_name in op.desc.output_arg_names():
                         assert var_name not in write_info, (
-                            "two sections write the same var({}): second "
-                            "op {}.".format(var_name, op)
+                            f"two sections write the same var({var_name}): second "
+                            f"op {op}."
                         )
                         write_info[var_name] = prog
                         break
@@ -1820,7 +1812,7 @@ def _check_pipeline_persist_var(self, program):
             "However, some backward op don't need this var(NoNeedBufferVars), "
             "there will be no error at this time.\n"
             "So please check these persistable vars which changed in "
-            "forward and used in backward:\n{}".format(used_in_backward)
+            f"forward and used in backward:\n{used_in_backward}"
         )
 
     def minimize(
diff --git a/python/paddle/incubate/optimizer/recompute.py b/python/paddle/incubate/optimizer/recompute.py
index 8e16bf27a1b53..9cbd8894f1889 100644
--- a/python/paddle/incubate/optimizer/recompute.py
+++ b/python/paddle/incubate/optimizer/recompute.py
@@ -291,9 +291,7 @@ def _insert_async_memcpy_op(
     def _insert_fetch_op(self, idx, varname):
         assert (
             varname in self.checkpoint_name2pinned_name
-        ), "Try to fetch {} from Pinned Memory, but it is NOT a checkpoint".format(
-            varname
-        )
+        ), f"Try to fetch {varname} from Pinned Memory, but it is NOT a checkpoint"
 
         pinned_varname = self.checkpoint_name2pinned_name[varname]
         fetch_varname = self.checkpoint_name2fetch_name[varname]
@@ -302,9 +300,7 @@ def _insert_fetch_op(self, idx, varname):
     def _insert_offload_op(self, idx, varname):
         assert (
             varname in self.checkpoint_name2pinned_name
-        ), "Try to offload {} to Pinned Memory, but it is NOT a checkpoint".format(
-            varname
-        )
+        ), f"Try to offload {varname} to Pinned Memory, but it is NOT a checkpoint"
         pinned_varname = self.checkpoint_name2pinned_name[varname]
         self._insert_async_memcpy_op(idx, varname, pinned_varname, 0, 2)
 
@@ -399,16 +395,12 @@ def _parse_backward(self):
                         self.checkpoint_usage_count[input_var] += 1
                     else:
                         raise ValueError(
-                            "use checkpoint [{}] before fetch in BW".format(
-                                input_var
-                            )
+                            f"use checkpoint [{input_var}] before fetch in BW"
                         )
 
         assert (
             len(self.un_fetch_checkpoint_names) == 0
-        ), "{} checkpoints have NOT been Recorded".format(
-            self.un_fetch_checkpoint_names
-        )
+        ), f"{self.un_fetch_checkpoint_names} checkpoints have NOT been Recorded"
 
     def _update_backward(self):
         if len(self.idx2insertions) == 0:
@@ -551,9 +543,7 @@ def _parse_forward(self):
 
         assert (
             len(self.un_offload_checkpoint_names) == 0
-        ), "{} checkpoints have NOT been Recorded".format(
-            self.un_fetch_checkpoint_names
-        )
+        ), f"{self.un_fetch_checkpoint_names} checkpoints have NOT been Recorded"
         assert len(self.synced_checkpoints) == len(
             need_offload_checkpoint_names
         ), "{} checkpoints have NOT been Recorded".format(
diff --git a/python/paddle/incubate/passes/ir.py b/python/paddle/incubate/passes/ir.py
index 8b9d9944d9bee..1b960855ef46c 100644
--- a/python/paddle/incubate/passes/ir.py
+++ b/python/paddle/incubate/passes/ir.py
@@ -287,9 +287,7 @@ def mapped_op(pattern_ops):
                 ops = [o for o in pattern_ops if o._type == op]
                 if len(ops) <= index:
                     raise ValueError(
-                        "Index '{}' of operator '{}' is incorrect.".format(
-                            index, op
-                        )
+                        f"Index '{index}' of operator '{op}' is incorrect."
                     )
                 return PassDesc.AttrHelper(
                     ops[index], name, element_index=element_index
@@ -336,12 +334,7 @@ def _to_readable_code(self, skip_op_callstack=True):
             attrs_str += ", ".join([f"{k}={v}" for k, v in self._attrs.items()])
             attrs_str += "}"
 
-            op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".format(
-                outputs=outputs_str,
-                op_type=self._type,
-                inputs=inputs_str,
-                attrs=attrs_str,
-            )
+            op_str = f"{outputs_str} = {self._type}(inputs={inputs_str}, {attrs_str})"
             return op_str
 
         def __init__(self, type=None):
@@ -400,9 +393,7 @@ def Init(self):
             self._proto = OpProtoHolder.instance().op_proto_map.get(self._type)
             if self._proto is None:
                 raise AttributeError(
-                    "type object 'OpHelper' has no attribute '{}'".format(
-                        self._type
-                    )
+                    f"type object 'OpHelper' has no attribute '{self._type}'"
                 )
             self._index = len(block.ops)
             self._desc = block.desc.append_op()
@@ -429,9 +420,7 @@ def Output(self, name):
             output = self._outputs.get(name)
             if output is None:
                 raise ValueError(
-                    "Operator '{}' does not have output named '{}'.".format(
-                        self._type, name
-                    )
+                    f"Operator '{self._type}' does not have output named '{name}'."
                 )
             return output
 
diff --git a/python/paddle/io/dataloader/batch_sampler.py b/python/paddle/io/dataloader/batch_sampler.py
index 78c93151a390d..20a9bb9a00da4 100644
--- a/python/paddle/io/dataloader/batch_sampler.py
+++ b/python/paddle/io/dataloader/batch_sampler.py
@@ -112,9 +112,7 @@ def __init__(
             ), "either dataset or sampler should be set"
             assert isinstance(
                 sampler, Sampler
-            ), "sampler should be a paddle.io.Sampler, but got {}".format(
-                type(sampler)
-            )
+            ), f"sampler should be a paddle.io.Sampler, but got {type(sampler)}"
             assert not shuffle, "shuffle should be False when sampler is set"
             self.sampler = sampler
         else:
@@ -124,9 +122,7 @@ def __init__(
             assert sampler is None, "should not set both dataset and sampler"
             assert isinstance(
                 shuffle, bool
-            ), "shuffle should be a boolean value, but got {}".format(
-                type(shuffle)
-            )
+            ), f"shuffle should be a boolean value, but got {type(shuffle)}"
             if shuffle:
                 self.sampler = RandomSampler(dataset)
             else:
@@ -134,15 +130,11 @@ def __init__(
 
         assert (
             isinstance(batch_size, int) and batch_size > 0
-        ), "batch_size should be a positive integer, but got {}".format(
-            batch_size
-        )
+        ), f"batch_size should be a positive integer, but got {batch_size}"
         self.batch_size = batch_size
         assert isinstance(
             drop_last, bool
-        ), "drop_last should be a boolean value, but got {}".format(
-            type(drop_last)
-        )
+        ), f"drop_last should be a boolean value, but got {type(drop_last)}"
         self.drop_last = drop_last
 
     def __iter__(self):
diff --git a/python/paddle/io/dataloader/collate.py b/python/paddle/io/dataloader/collate.py
index 141624668f09b..cf3d3be5e847f 100644
--- a/python/paddle/io/dataloader/collate.py
+++ b/python/paddle/io/dataloader/collate.py
@@ -76,7 +76,7 @@ def default_collate_fn(batch):
 
     raise TypeError(
         "batch data con only contains: tensor, numpy.ndarray, "
-        "dict, list, number, but got {}".format(type(sample))
+        f"dict, list, number, but got {type(sample)}"
     )
 
 
diff --git a/python/paddle/io/dataloader/dataloader_iter.py b/python/paddle/io/dataloader/dataloader_iter.py
index 58a5c07139af5..ed64f5da3d9e9 100644
--- a/python/paddle/io/dataloader/dataloader_iter.py
+++ b/python/paddle/io/dataloader/dataloader_iter.py
@@ -362,10 +362,9 @@ def __init__(self, loader):
         self._persistent_workers = loader._persistent_workers
         self._resume_worker_cnt = 0
 
-        assert (
-            self._num_workers > 0
-        ), "Multi-process DataLoader " "invalid num_workers({})".format(
-            self._num_workers
+        assert self._num_workers > 0, (
+            "Multi-process DataLoader "
+            f"invalid num_workers({self._num_workers})"
         )
 
         # subprocess wrokers' result queue
@@ -706,8 +705,8 @@ def _get_data(self):
                     self._exit_thread_unexpectedly()
                     pids = ', '.join(str(w.pid) for w in failed_workers)
                     raise RuntimeError(
-                        "DataLoader {} workers exit unexpectedly, "
-                        "pids: {}".format(len(failed_workers), pids)
+                        f"DataLoader {len(failed_workers)} workers exit unexpectedly, "
+                        f"pids: {pids}"
                     )
 
                 # get(timeout) will call _poll(timeout) and may raise IOError
@@ -717,8 +716,8 @@ def _get_data(self):
 
                 self._exit_thread_unexpectedly()
                 logging.error(
-                    "DataLoader reader thread failed({}) to read data from "
-                    "workers' result queue.".format(e)
+                    f"DataLoader reader thread failed({e}) to read data from "
+                    "workers' result queue."
                 )
                 raise e
             else:
diff --git a/python/paddle/io/dataloader/flat.py b/python/paddle/io/dataloader/flat.py
index 87c35e6dedd38..36b899e3f55c2 100644
--- a/python/paddle/io/dataloader/flat.py
+++ b/python/paddle/io/dataloader/flat.py
@@ -143,8 +143,8 @@ def _restore(structure, field_idx):
 
     # sample only contains single fields
     if isinstance(structure, (str, bytes)):
-        assert structure == '{}{}'.format(
-            FIELD_PREFIX, 0
+        assert (
+            structure == f'{FIELD_PREFIX}{0}'
         ), f"invalid structure: {structure}"
         return flat_batch[0]
     field_idx = _restore(structure, 0)
diff --git a/python/paddle/io/dataloader/sampler.py b/python/paddle/io/dataloader/sampler.py
index d26316ecc0eb7..44bc545f777cd 100644
--- a/python/paddle/io/dataloader/sampler.py
+++ b/python/paddle/io/dataloader/sampler.py
@@ -208,7 +208,7 @@ def __init__(
         if not isinstance(self.replacement, bool):
             raise TypeError(
                 "expect boolean value for replacement, but got "
-                "replacement={}".format(self.replacement)
+                f"replacement={self.replacement}"
             )
 
         if self._num_samples is not None and not replacement:
@@ -219,7 +219,7 @@ def __init__(
         if not isinstance(self.num_samples, int) or self.num_samples <= 0:
             raise ValueError(
                 "num_samples should be a positive integer, "
-                "but got num_samples={}".format(self.num_samples)
+                f"but got num_samples={self.num_samples}"
             )
 
     @property
diff --git a/python/paddle/io/dataloader/worker.py b/python/paddle/io/dataloader/worker.py
index 814dc667a7cf3..8829b6ee13d5c 100644
--- a/python/paddle/io/dataloader/worker.py
+++ b/python/paddle/io/dataloader/worker.py
@@ -166,9 +166,7 @@ def __init__(self, **kwargs):
     def __setattr__(self, key, val):
         if self.__initialized:
             raise RuntimeError(
-                "Cannot assign attributes to {} objects".format(
-                    self.__class__.__name__
-                )
+                f"Cannot assign attributes to {self.__class__.__name__} objects"
             )
         return super().__setattr__(key, val)
 
diff --git a/python/paddle/io/reader.py b/python/paddle/io/reader.py
index ca9a9eabecf87..fc8446b4f4e42 100644
--- a/python/paddle/io/reader.py
+++ b/python/paddle/io/reader.py
@@ -446,9 +446,7 @@ def __init__(
             self.dataset_kind = _DatasetKind.ITER
             if shuffle:
                 raise ValueError(
-                    "IterableDataset not support shuffle, but got shuffle={}".format(
-                        shuffle
-                    )
+                    f"IterableDataset not support shuffle, but got shuffle={shuffle}"
                 )
             if batch_sampler is not None:
                 raise ValueError(
diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py
index 5799f740d147f..a0ca693ccfcd0 100644
--- a/python/paddle/jit/api.py
+++ b/python/paddle/jit/api.py
@@ -661,9 +661,9 @@ def _build_load_path_and_config(path, config):
     directory_format_exist = os.path.isdir(path)
     if prefix_format_exist and directory_format_exist:
         raise ValueError(
-            "The {}.pdmodel and {} directory exist at the same time, "
+            f"The {path}.pdmodel and {path} directory exist at the same time, "
             "don't know which one to load, please make sure that the specified target "
-            "of ``path`` is unique.".format(path, path)
+            "of ``path`` is unique."
         )
     elif not prefix_format_exist and not directory_format_exist:
         raise ValueError(
diff --git a/python/paddle/jit/dy2static/base_transformer.py b/python/paddle/jit/dy2static/base_transformer.py
index e3e2dc7b39941..7d61b3a7417b7 100644
--- a/python/paddle/jit/dy2static/base_transformer.py
+++ b/python/paddle/jit/dy2static/base_transformer.py
@@ -390,8 +390,8 @@ def _build_index_init_node(self):
             index_init_value_str = '0'
             index_init_var_name = self.iter_idx_name
 
-        index_init_node_source_str = "{target} = {value}".format(
-            target=index_init_var_name, value=index_init_value_str
+        index_init_node_source_str = (
+            f"{index_init_var_name} = {index_init_value_str}"
         )
 
         index_init_node = gast.parse(index_init_node_source_str).body[0]
@@ -456,9 +456,7 @@ def _build_enum_init_node(self):
         else:
             init_value_str = '0'
 
-        enum_init_node_source_str = "{} = {}".format(
-            self.enum_idx_name, init_value_str
-        )
+        enum_init_node_source_str = f"{self.enum_idx_name} = {init_value_str}"
         enum_init_node = gast.parse(enum_init_node_source_str).body[0]
         return enum_init_node
 
diff --git a/python/paddle/jit/dy2static/basic_api_transformer.py b/python/paddle/jit/dy2static/basic_api_transformer.py
index 40c5a5f511bde..64dfa67b6cfe6 100644
--- a/python/paddle/jit/dy2static/basic_api_transformer.py
+++ b/python/paddle/jit/dy2static/basic_api_transformer.py
@@ -208,9 +208,7 @@ def visit_Attribute(self, node):
             value = node.value
             node = (
                 gast.parse(
-                    "_jst.Attr({}, \"{}\")".format(
-                        utils.ast_to_source_code(value).strip(), attr
-                    )
+                    f"_jst.Attr({utils.ast_to_source_code(value).strip()}, \"{attr}\")"
                 )
                 .body[0]
                 .value
diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py
index 1082897ed8520..47618392175d9 100644
--- a/python/paddle/jit/dy2static/convert_operators.py
+++ b/python/paddle/jit/dy2static/convert_operators.py
@@ -570,7 +570,7 @@ def convert_zip(*args):
         if isinstance(arg, Variable) and arg.shape[0] == -1:
             raise RuntimeError(
                 "Not support zip(tensor, ...) when tensor.shape[0] == -1, "
-                "but found args[{}].shape[0] == -1 in 'zip'".format(str(i))
+                f"but found args[{str(i)}].shape[0] == -1 in 'zip'"
             )
     return zip(*args)
 
diff --git a/python/paddle/jit/dy2static/decorator_transformer.py b/python/paddle/jit/dy2static/decorator_transformer.py
index a61c25dd6082e..3de950d0478aa 100644
--- a/python/paddle/jit/dy2static/decorator_transformer.py
+++ b/python/paddle/jit/dy2static/decorator_transformer.py
@@ -82,9 +82,7 @@ def visit_FunctionDef(self, node):
                 # match case like:
                 # @a.d.g.deco
                 re_tmp = re.match(
-                    r'({module})*({name})$'.format(
-                        name=RE_PYNAME, module=RE_PYMODULE
-                    ),
+                    fr'({RE_PYMODULE})*({RE_PYNAME})$',
                     deco_full_name,
                 )
                 deco_name = re_tmp.group(2)
diff --git a/python/paddle/jit/dy2static/error.py b/python/paddle/jit/dy2static/error.py
index 5cad201d9ebf1..96124f1369087 100644
--- a/python/paddle/jit/dy2static/error.py
+++ b/python/paddle/jit/dy2static/error.py
@@ -203,9 +203,7 @@ def numpy_api_check(self, format_exception, error_line):
         func_str = None
         for frame in tb:
             searched_name = re.search(
-                r'({module})*{name}'.format(
-                    module=RE_PYMODULE, name=frame.name
-                ),
+                fr'({RE_PYMODULE})*{frame.name}',
                 error_line,
             )
             if searched_name:
@@ -339,9 +337,7 @@ def _create_revise_suggestion(self, bottom_error_message):
                 for suggestion in self.suggestion_dict[keywords]:
                     suggestion_msg = (
                         ' ' * BLANK_COUNT_BEFORE_FILE_STR * 2
-                        + '{}. {}'.format(
-                            str(len(revise_suggestions) - 1), suggestion
-                        )
+                        + f'{str(len(revise_suggestions) - 1)}. {suggestion}'
                     )
                     revise_suggestions.append(suggestion_msg)
         return revise_suggestions if len(revise_suggestions) > 2 else []
diff --git a/python/paddle/jit/dy2static/function_spec.py b/python/paddle/jit/dy2static/function_spec.py
index e2966e4097d86..ec835ee6e9540 100644
--- a/python/paddle/jit/dy2static/function_spec.py
+++ b/python/paddle/jit/dy2static/function_spec.py
@@ -395,9 +395,7 @@ def convert_to_input_spec(inputs, input_spec):
     def check_type_and_len(input, spec, check_length=False):
         if type(input) is not type(spec):
             raise TypeError(
-                'type(input) should be {}, but received {}.'.format(
-                    type(spec), type(input)
-                )
+                f'type(input) should be {type(spec)}, but received {type(input)}.'
             )
         if check_length and len(input) < len(spec):
             raise ValueError(
@@ -444,9 +442,7 @@ def check_type_and_len(input, spec, check_length=False):
         real_spec = _replace_value_with_input_spec([inputs])[0]
         if not isinstance(real_spec, paddle.static.InputSpec):
             raise RuntimeError(
-                "Give input spec into a non-tensorable arguments `{}`.".format(
-                    inputs
-                )
+                f"Give input spec into a non-tensorable arguments `{inputs}`."
             )
         real_spec.name = input_spec.name
         if spec_greater(input_spec, real_spec):
diff --git a/python/paddle/jit/dy2static/logical_transformer.py b/python/paddle/jit/dy2static/logical_transformer.py
index 90002c6e4bd0d..c2719d2c177f1 100644
--- a/python/paddle/jit/dy2static/logical_transformer.py
+++ b/python/paddle/jit/dy2static/logical_transformer.py
@@ -85,9 +85,7 @@ def _create_bool_op_node(self, nodes, api_type):
         '''
         assert (
             len(nodes) > 1
-        ), "The length of BoolOp should be at least 2, but received {}.".format(
-            len(nodes)
-        )
+        ), f"The length of BoolOp should be at least 2, but received {len(nodes)}."
         if len(nodes) > 2:
             # Creates logic_and/logic_or node recursively.
             pre_logic_node = self._create_bool_op_node(nodes[:2], api_type)
@@ -98,9 +96,7 @@ def _create_bool_op_node(self, nodes, api_type):
             nodes = [pre_logic_node] + [post_logic_node]
 
         args = [ast_to_source_code(child) for child in nodes]
-        new_node_str = "_jst.{}(lambda:{}, lambda:{})".format(
-            api_type, args[0], args[1]
-        )
+        new_node_str = f"_jst.{api_type}(lambda:{args[0]}, lambda:{args[1]})"
         # NOTE: gast.parse return Module(body=[expr(...)])
         new_node = gast.parse(new_node_str).body[0].value
         return new_node
diff --git a/python/paddle/jit/dy2static/origin_info.py b/python/paddle/jit/dy2static/origin_info.py
index d6fee112ded91..d23d8a58431a8 100644
--- a/python/paddle/jit/dy2static/origin_info.py
+++ b/python/paddle/jit/dy2static/origin_info.py
@@ -41,9 +41,7 @@ def __init__(self, filepath, lineno, col_offset=None):
         self.col_offset = col_offset
 
     def __str__(self):
-        return "location: {}:{}:{}".format(
-            self.filepath, self.lineno, self.col_offset
-        )
+        return f"location: {self.filepath}:{self.lineno}:{self.col_offset}"
 
     @property
     def line_location(self):
@@ -311,9 +309,7 @@ def get_new_op_callstack(callstack):
             if dygraph_func_info:
                 filepath, lineno, funcname, code = dygraph_func_info.as_frame()
 
-            callstack[i] = '  File "{}", line {}, in {}'.format(
-                filepath, lineno, funcname
-            )
+            callstack[i] = f'  File "{filepath}", line {lineno}, in {funcname}'
             callstack[i + 1] = f'    {code}'
 
         return callstack
diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py
index 1bdf46629dfd0..013fff9178e7c 100644
--- a/python/paddle/jit/dy2static/utils.py
+++ b/python/paddle/jit/dy2static/utils.py
@@ -333,8 +333,8 @@ def to_static_api(dygraph_class):
         return dygraph_class_to_static_api[dygraph_class]
     else:
         raise NotImplementedError(
-            "Paddle dygraph API {} cannot be converted "
-            "to static graph at present.".format(dygraph_class)
+            f"Paddle dygraph API {dygraph_class} cannot be converted "
+            "to static graph at present."
         )
 
 
@@ -1303,12 +1303,10 @@ def get_args_0():
     """
 
     def empty_node():
-        func_def = """
-        def {func_name}():
+        func_def = f"""
+        def {unique_name.generate(GET_ARGS_FUNC_PREFIX)}():
             return
-        """.format(
-            func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX)
-        )
+        """
         return gast.parse(textwrap.dedent(func_def)).body[0]
 
     assert isinstance(names, (list, tuple))
@@ -1342,12 +1340,10 @@ def set_args_0(__args):
     """
 
     def empty_node():
-        func_def = """
-        def {func_name}({args}):
+        func_def = f"""
+        def {unique_name.generate(SET_ARGS_FUNC_PREFIX)}({ARGS_NAME}):
             pass
-        """.format(
-            func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), args=ARGS_NAME
-        )
+        """
         return gast.parse(textwrap.dedent(func_def)).body[0]
 
     assert isinstance(names, (list, tuple))
@@ -1416,9 +1412,7 @@ def get(self, names):
         for n in names:
             assert (
                 n in self.name2id
-            ), "the name `{}` not in name union set`{}`.".format(
-                n, self.name2id.keys()
-            )
+            ), f"the name `{n}` not in name union set`{self.name2id.keys()}`."
         return tuple(vars[self.name2id[n]] for n in names)
 
     def set(self, names, values):
@@ -1432,9 +1426,7 @@ def set(self, names, values):
         for n in names:
             assert (
                 n in self.name2id
-            ), "the name `{}` not in name union set`{}`.".format(
-                n, self.name2id.keys()
-            )
+            ), f"the name `{n}` not in name union set`{self.name2id.keys()}`."
         vars = list(vars)
         indices = [self.name2id[n] for n in names]
         for i, v in zip(indices, values):
diff --git a/python/paddle/jit/dy2static/variable_trans_func.py b/python/paddle/jit/dy2static/variable_trans_func.py
index ee358d57ee019..b32001dd28f7b 100644
--- a/python/paddle/jit/dy2static/variable_trans_func.py
+++ b/python/paddle/jit/dy2static/variable_trans_func.py
@@ -29,20 +29,14 @@ def create_undefined_var(name):
 def create_fill_constant_node(name, value=0):
     func_code = f"{name} = paddle.full(shape=[1], "
     if isinstance(value, bool):
-        func_code += "dtype='bool', fill_value={}, name='{}')".format(
-            value, name
-        )
+        func_code += f"dtype='bool', fill_value={value}, name='{name}')"
         return gast.parse(func_code).body[0]
     if isinstance(value, float):
-        func_code += "dtype='float64', fill_value={}, name='{}')".format(
-            value, name
-        )
+        func_code += f"dtype='float64', fill_value={value}, name='{name}')"
         return gast.parse(func_code).body[0]
 
     if isinstance(value, int):
-        func_code += "dtype='int64', fill_value={}, name='{}')".format(
-            value, name
-        )
+        func_code += f"dtype='int64', fill_value={value}, name='{name}')"
         return gast.parse(func_code).body[0]
 
 
diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py
index 9cac2ff006c72..2760b448a7027 100644
--- a/python/paddle/metric/metrics.py
+++ b/python/paddle/metric/metrics.py
@@ -120,9 +120,7 @@ def reset(self):
         Reset states and result
         """
         raise NotImplementedError(
-            "function 'reset' not implemented in {}.".format(
-                self.__class__.__name__
-            )
+            f"function 'reset' not implemented in {self.__class__.__name__}."
         )
 
     @abc.abstractmethod
@@ -138,9 +136,7 @@ def update(self, *args):
         see :code:`Metric.compute`
         """
         raise NotImplementedError(
-            "function 'update' not implemented in {}.".format(
-                self.__class__.__name__
-            )
+            f"function 'update' not implemented in {self.__class__.__name__}."
         )
 
     @abc.abstractmethod
@@ -149,9 +145,7 @@ def accumulate(self):
         Accumulates statistics, computes and returns the metric value
         """
         raise NotImplementedError(
-            "function 'accumulate' not implemented in {}.".format(
-                self.__class__.__name__
-            )
+            f"function 'accumulate' not implemented in {self.__class__.__name__}."
         )
 
     @abc.abstractmethod
@@ -160,9 +154,7 @@ def name(self):
         Returns metric name
         """
         raise NotImplementedError(
-            "function 'name' not implemented in {}.".format(
-                self.__class__.__name__
-            )
+            f"function 'name' not implemented in {self.__class__.__name__}."
         )
 
     def compute(self, *args):
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index e02a47d7bf8dd..58364cc6d78a5 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -564,7 +564,7 @@ def prelu(x, weight, data_format="NCHW", name=None):
         if data_format not in true_data_format:
             raise ValueError(
                 "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', "
-                "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)
+                f"'NLC', 'NHWC', 'NDHWC' but receive {data_format}"
             )
 
         data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC'
@@ -700,9 +700,7 @@ def rrelu(x, lower=1.0 / 8.0, upper=1.0 / 3.0, training=True, name=None):
 
     if upper > 1:
         raise ValueError(
-            "The upper value must be no greater than one. Received: {}.".format(
-                upper
-            )
+            f"The upper value must be no greater than one. Received: {upper}."
         )
 
     is_test = not training
@@ -1344,9 +1342,7 @@ def softshrink(x, threshold=0.5, name=None):
     """
     if threshold < 0:
         raise ValueError(
-            "The threshold must be no less than zero. Received: {}.".format(
-                threshold
-            )
+            f"The threshold must be no less than zero. Received: {threshold}."
         )
 
     if in_dynamic_mode():
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 9b1da0dd36802..de606db3c39d1 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -592,8 +592,8 @@ def _is_list_or_turple_(data):
         elif isinstance(scale, (list, tuple)):
             if len(scale) != len(x.shape) - 2:
                 raise ValueError(
-                    "scale_shape length should be {} for "
-                    "input {}-D tensor.".format(len(x.shape) - 2, len(x.shape))
+                    f"scale_shape length should be {len(x.shape) - 2} for "
+                    f"input {len(x.shape)}-D tensor."
                 )
             for value in scale:
                 if value <= 0:
@@ -1366,9 +1366,7 @@ def dropout2d(x, p=0.5, training=True, data_format='NCHW', name=None):
     input_shape = x.shape
     if len(input_shape) != 4:
         raise ValueError(
-            "dimensions of x should be 4, but received {} != 4".format(
-                len(input_shape)
-            )
+            f"dimensions of x should be 4, but received {len(input_shape)} != 4"
         )
 
     if data_format not in ["NCHW", "NHWC"]:
@@ -1424,9 +1422,7 @@ def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None):
     input_shape = x.shape
     if len(input_shape) != 5:
         raise ValueError(
-            "dimensions of x should be 5, but received {} != 5".format(
-                len(input_shape)
-            )
+            f"dimensions of x should be 5, but received {len(input_shape)} != 5"
         )
 
     if data_format not in ["NCDHW", "NDHWC"]:
@@ -1644,14 +1640,12 @@ def pad(x, pad, mode='constant', value=0.0, data_format="NCHW", name=None):
         'replicate',
         'constant',
         'circular',
-    ], "mode should be one of constant, reflect, replicate, circular, but got {}.".format(
-        mode
-    )
+    ], f"mode should be one of constant, reflect, replicate, circular, but got {mode}."
 
     data_format = data_format.upper()
     assert data_format in ["NCL", "NCHW", "NCDHW", "NLC", "NHWC", "NDHWC"], (
         "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], "
-        "but got {}".format(data_format)
+        f"but got {data_format}"
     )
 
     x_dim = len(x.shape)
@@ -2213,19 +2207,15 @@ class centers and the shape of sampled_class_center will be [num_positive_class_
         label_size *= dim
     if label_size != -1 and label_size < 1:
         raise ValueError(
-            'Expected label_size > 0 \
-             (got label_size: {})'.format(
-                label_size
-            )
+            f'Expected label_size > 0 \
+             (got label_size: {label_size})'
         )
 
     label_dims = len(list(label.shape))
     if label_dims != 1:
         raise ValueError(
-            'Expected label_dims == 1 \
-             (got label_dims: {})'.format(
-                label_dims
-            )
+            f'Expected label_dims == 1 \
+             (got label_dims: {label_dims})'
         )
 
     seed = None
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 00e45f789ae5e..55c8776e9ffbd 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -60,9 +60,7 @@ def _update_padding_nd(padding, channel_last, num_dims):
         padding = padding.upper()
         if padding not in ["SAME", "VALID"]:
             raise ValueError(
-                "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".format(
-                    padding
-                )
+                f"Unknown padding: '{padding}'. It can only be 'SAME' or 'VALID'."
             )
         if padding == "VALID":
             padding_algorithm = "VALID"
@@ -77,8 +75,8 @@ def _update_padding_nd(padding, channel_last, num_dims):
         if len(padding) == 2 + num_dims and _is_list_or_tuple(padding[0]):
             if not _zero_padding_in_batch_and_channel(padding, channel_last):
                 raise ValueError(
-                    "Non-zero padding({}) in the batch or channel dimensions "
-                    "is not supported.".format(padding)
+                    f"Non-zero padding({padding}) in the batch or channel dimensions "
+                    "is not supported."
                 )
             padding_algorithm = "EXPLICIT"
             padding = _exclude_padding_in_batch_and_channel(
@@ -396,7 +394,7 @@ def conv1d(
     if data_format not in ["NCL", "NLC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCL' or 'NLC'. "
-            "Received Attr(data_format): {}.".format(data_format)
+            f"Received Attr(data_format): {data_format}."
         )
 
     channel_last = data_format == "NLC"
@@ -404,22 +402,18 @@ def conv1d(
     conv2d_data_format = "NHWC" if channel_last else "NCHW"
     if len(x.shape) != 3:
         raise ValueError(
-            "Input x should be 3D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 3D tensor, but received x with the shape of {x.shape}"
         )
     num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
         raise ValueError(
-            "The channel dimension of the input({}) "
-            "should be defined. Received: {}.".format(x.shape, num_channels)
+            f"The channel dimension of the input({x.shape}) "
+            f"should be defined. Received: {num_channels}."
         )
     if groups <= 0:
         raise ValueError(
-            "The groups of conv1d should be greater than 0. Received groups: {}".format(
-                groups
-            )
+            f"The groups of conv1d should be greater than 0. Received groups: {groups}"
         )
     if num_channels % groups != 0:
         raise ValueError(
@@ -647,29 +641,25 @@ def conv2d(
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCHW' or 'NHWC'. "
-            "Received Attr(data_format): {}.".format(data_format)
+            f"Received Attr(data_format): {data_format}."
         )
 
     channel_last = data_format == "NHWC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 4:
         raise ValueError(
-            "Input x should be 4D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 4D tensor, but received x with the shape of {x.shape}"
         )
     num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
         raise ValueError(
-            "The channel dimension of the input({}) "
-            "should be defined. Received: {}.".format(x.shape, num_channels)
+            f"The channel dimension of the input({x.shape}) "
+            f"should be defined. Received: {num_channels}."
         )
     if groups <= 0:
         raise ValueError(
-            "The groups of conv2d should be greater than 0. Received groups: {}".format(
-                groups
-            )
+            f"The groups of conv2d should be greater than 0. Received groups: {groups}"
         )
     if num_channels % groups != 0:
         raise ValueError(
@@ -911,24 +901,20 @@ def conv1d_transpose(
     if data_format not in ['NCL', 'NLC']:
         raise ValueError(
             "Attr(data_format) of conv2d_transpose got wrong value: "
-            "received {}, but only 'NCL' or 'NLC' are supported.".format(
-                data_format
-            )
+            f"received {data_format}, but only 'NCL' or 'NLC' are supported."
         )
     channel_last = data_format == "NLC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 3:
         raise ValueError(
-            "Input x should be 3D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 3D tensor, but received x with the shape of {x.shape}"
         )
 
     num_channels = x.shape[channel_dim]
     if num_channels < 0:
         raise ValueError(
-            "The channel dimension of the input({}) "
-            "should be defined. Received: {}.".format(x.shape, num_channels)
+            f"The channel dimension of the input({x.shape}) "
+            f"should be defined. Received: {num_channels}."
         )
     if groups <= 0:
         raise ValueError(
@@ -952,9 +938,7 @@ def conv1d_transpose(
         padding = padding + [0]
     else:
         raise ValueError(
-            "The size of padding's dimension should 1 or 2. But got padding={}".format(
-                padding
-            )
+            f"The size of padding's dimension should 1 or 2. But got padding={padding}"
         )
 
     stride = convert_to_list(stride, 1, 'stride') + [1]
@@ -985,9 +969,7 @@ def conv1d_transpose(
     if len(output_padding) > 0 and output_padding[0] > stride[0]:
         raise ValueError(
             "The size of output_padding should not be greater than stride."
-            "But got output_padding={} and stride={}".format(
-                output_padding[0], stride[0]
-            )
+            f"But got output_padding={output_padding[0]} and stride={stride[0]}"
         )
 
     if len(weight.shape) != 3:
@@ -1198,17 +1180,13 @@ def conv2d_transpose(
     if data_format not in ['NCHW', 'NHWC']:
         raise ValueError(
             "Attr(data_format) of conv2d_transpose got wrong value: "
-            "received {}, but only 'NCHW' or 'NHWC' are supported.".format(
-                data_format
-            )
+            f"received {data_format}, but only 'NCHW' or 'NHWC' are supported."
         )
     channel_last = data_format == "NHWC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 4:
         raise ValueError(
-            "Input x should be 4D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 4D tensor, but received x with the shape of {x.shape}"
         )
     if len(weight.shape) != 4:
         raise ValueError(
@@ -1219,8 +1197,8 @@ def conv2d_transpose(
     num_channels = x.shape[channel_dim]
     if num_channels < 0:
         raise ValueError(
-            "The channel dimension of the input({}) "
-            "should be defined. Received: {}.".format(x.shape, num_channels)
+            f"The channel dimension of the input({x.shape}) "
+            f"should be defined. Received: {num_channels}."
         )
     if groups <= 0:
         raise ValueError(
@@ -1491,43 +1469,35 @@ def conv3d(
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
-            "Attr(data_format): {}.".format(data_format)
+            f"Attr(data_format): {data_format}."
         )
 
     channel_last = data_format == "NDHWC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 5:
         raise ValueError(
-            "Input x should be 5D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 5D tensor, but received x with the shape of {x.shape}"
         )
     num_channels = x.shape[channel_dim]
     num_filters = weight.shape[0]
     if num_channels < 0:
         raise ValueError(
-            "The channel dimension of the input({}) should be defined. "
-            "Received: {}.".format(x.shape, num_channels)
+            f"The channel dimension of the input({x.shape}) should be defined. "
+            f"Received: {num_channels}."
         )
     if groups <= 0:
         raise ValueError(
-            "The groups of conv3d should be greater than 0. Received groups: {}".format(
-                groups
-            )
+            f"The groups of conv3d should be greater than 0. Received groups: {groups}"
         )
     if num_channels % groups != 0:
         raise ValueError(
             "The number of input channels must be divisible by Attr(groups). "
-            "Received: number of channels({}), groups({}).".format(
-                num_channels, groups
-            )
+            f"Received: number of channels({num_channels}), groups({groups})."
         )
     if num_filters % groups != 0:
         raise ValueError(
             "The number of filters must be divisible by Attr(groups). "
-            "Received: number of filters({}), groups({}).".format(
-                num_filters, groups
-            )
+            f"Received: number of filters({num_filters}), groups({groups})."
         )
 
     cudnn_version = get_cudnn_version()
@@ -1705,16 +1675,14 @@ def conv3d_transpose(
     if data_format not in ["NCDHW", "NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received "
-            "Attr(data_format): {}.".format(data_format)
+            f"Attr(data_format): {data_format}."
         )
 
     channel_last = data_format == "NDHWC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 5:
         raise ValueError(
-            "Input x should be 5D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 5D tensor, but received x with the shape of {x.shape}"
         )
     if len(weight.shape) != 5:
         raise ValueError(
@@ -1726,8 +1694,8 @@ def conv3d_transpose(
     num_filters = weight.shape[1]
     if num_channels < 0:
         raise ValueError(
-            "The channel dimension of the input({}) should be defined. "
-            "Received: {}.".format(x.shape, num_channels)
+            f"The channel dimension of the input({x.shape}) should be defined. "
+            f"Received: {num_channels}."
         )
     if groups <= 0:
         raise ValueError(
@@ -1738,9 +1706,7 @@ def conv3d_transpose(
     if num_channels % groups != 0:
         raise ValueError(
             "The number of input channels must be divisible by Attr(groups). "
-            "Received: number of channels({}), groups({}).".format(
-                num_channels, groups
-            )
+            f"Received: number of channels({num_channels}), groups({groups})."
         )
 
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3)
diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py
index 252afc268bf7c..757c9059efdd6 100644
--- a/python/paddle/nn/functional/extension.py
+++ b/python/paddle/nn/functional/extension.py
@@ -405,7 +405,7 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"):
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCHW' or 'NHWC'. "
-            "Received Attr(data_format): {}.".format(data_format)
+            f"Received Attr(data_format): {data_format}."
         )
     if in_dynamic_mode():
         return _C_ops.temporal_shift(x, seg_num, shift_ratio, data_format)
diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py
index 0a714bce0b992..e38797a1115ae 100644
--- a/python/paddle/nn/functional/input.py
+++ b/python/paddle/nn/functional/input.py
@@ -219,9 +219,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None):
 
     if padding_idx >= weight.shape[0] or padding_idx < -weight.shape[0]:
         raise ValueError(
-            "padding_idx must be within [-{}, {})".format(
-                weight.shape[0], weight.shape[0]
-            )
+            f"padding_idx must be within [-{weight.shape[0]}, {weight.shape[0]})"
         )
 
     if in_dynamic_or_pir_mode():
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 9bf445054cd14..e74e67d83f88e 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -262,10 +262,8 @@ def base_softmax_with_cross_entropy(
     label_dims = len(list(label.shape))
     if input_dims - 1 != label_dims and input_dims != label_dims:
         raise ValueError(
-            'Expected input_dims - 1 = label_dims or input_dims == label_dims\
-             (got input_dims{}, label_dims{})'.format(
-                input_dims, label_dims
-            )
+            f'Expected input_dims - 1 = label_dims or input_dims == label_dims\
+             (got input_dims{input_dims}, label_dims{label_dims})'
         )
     if input_dims - 1 == label_dims:
         label = paddle.unsqueeze(label, axis=axis)
@@ -1424,10 +1422,8 @@ def nll_loss(
 
     if input_dims - 1 != label_dims and input_dims != label_dims:
         raise ValueError(
-            "Expected input_dims - 1 = label_dims or input_dims == label_dims\
-             (got input_dims{}, label_dims{})".format(
-                input_dims, label_dims
-            )
+            f"Expected input_dims - 1 = label_dims or input_dims == label_dims\
+             (got input_dims{input_dims}, label_dims{label_dims})"
         )
 
     if input_dims < 2:
@@ -1435,9 +1431,7 @@ def nll_loss(
 
     if input_shape[1] < 1:
         raise ValueError(
-            "Expected 1 or more classess (got num classes{})".format(
-                input_shape[1]
-            )
+            f"Expected 1 or more classess (got num classes{input_shape[1]})"
         )
 
     n = input_shape[0]
@@ -1781,7 +1775,7 @@ def mse_loss(input, label, reduction='mean', name=None):
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "'reduction' in 'mse_loss' should be 'sum', 'mean' or 'none', "
-            "but received {}.".format(reduction)
+            f"but received {reduction}."
         )
 
     if not in_dynamic_mode():
@@ -2309,10 +2303,8 @@ def margin_cross_entropy(
     label_dims = len(list(label.shape))
     if input_dims - 1 != label_dims and input_dims != label_dims:
         raise ValueError(
-            'Expected input_dims - 1 = label_dims or input_dims == label_dims\
-             (got input_dims{}, label_dims{})'.format(
-                input_dims, label_dims
-            )
+            f'Expected input_dims - 1 = label_dims or input_dims == label_dims\
+             (got input_dims{input_dims}, label_dims{label_dims})'
         )
     if input_dims - 1 == label_dims:
         label = paddle.unsqueeze(label, axis=-1)
@@ -2795,10 +2787,8 @@ def cross_entropy(
 
     if input_dims - 1 != label_dims and input_dims != label_dims:
         raise ValueError(
-            'Expected nput_dims - 1 = label_dims or input_dims == label_dims\
-             (got nput_dims{}, label_dims{})'.format(
-                input_dims, label_dims
-            )
+            f'Expected nput_dims - 1 = label_dims or input_dims == label_dims\
+             (got nput_dims{input_dims}, label_dims{label_dims})'
         )
 
     if label_smoothing > 0.0:
@@ -2846,11 +2836,9 @@ def cross_entropy(
             else:
                 if input.shape[axis] != weight.shape[-1]:
                     raise ValueError(
-                        "input's class_dimension({}) must equal to "
-                        "weight's class_dimension({}) "
-                        "when weight is provided".format(
-                            input.shape[axis], weight.shape[-1]
-                        )
+                        f"input's class_dimension({input.shape[axis]}) must equal to "
+                        f"weight's class_dimension({weight.shape[-1]}) "
+                        "when weight is provided"
                     )
 
                 ignore_weight_mask = paddle.cast(
@@ -2993,11 +2981,9 @@ def cross_entropy(
             else:
                 if input.shape[axis] != weight.shape[-1]:
                     raise ValueError(
-                        "input's class_dimension({}) must equal to "
-                        "weight's class_dimension({}) "
-                        "when weight is provided".format(
-                            input.shape[axis], weight.shape[-1]
-                        )
+                        f"input's class_dimension({input.shape[axis]}) must equal to "
+                        f"weight's class_dimension({weight.shape[-1]}) "
+                        "when weight is provided"
                     )
 
                 valid_label = paddle.multiply(
@@ -3321,7 +3307,7 @@ def multi_label_soft_margin_loss(
     if not (input.shape == label.shape):
         raise ValueError(
             "The input and label should have same dimension,"
-            "but received {}!={}".format(input.shape, label.shape)
+            f"but received {input.shape}!={label.shape}"
         )
 
     if not in_dynamic_mode():
@@ -3442,7 +3428,7 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None):
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "'reduction' in 'hinge_embedding_loss' should be 'sum', 'mean' or 'none', "
-            "but received {}.".format(reduction)
+            f"but received {reduction}."
         )
 
     if not in_dynamic_mode():
@@ -3669,7 +3655,7 @@ def triplet_margin_with_distance_loss(
         raise ValueError(
             "'reduction' in 'triplet_margin_with_distance_loss' "
             "should be 'sum', 'mean' or 'none', "
-            "but received {}.".format(reduction)
+            f"but received {reduction}."
         )
     if margin < 0:
         raise ValueError(
@@ -3819,7 +3805,7 @@ def triplet_margin_loss(
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "'reduction' in 'triplet_margin_loss' should be 'sum', 'mean' or 'none', "
-            "but received {}.".format(reduction)
+            f"but received {reduction}."
         )
     if margin < 0:
         raise ValueError(
@@ -3934,7 +3920,7 @@ def multi_margin_loss(
     if reduction not in ['sum', 'mean', 'none']:
         raise ValueError(
             "'reduction' in 'multi_margin_loss' should be 'sum', 'mean' or 'none', "
-            "but received {}.".format(reduction)
+            f"but received {reduction}."
         )
 
     if not in_dynamic_mode():
diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py
index a0184444e4611..704eb880c516c 100644
--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
@@ -183,7 +183,7 @@ def batch_norm(
     if data_format not in true_data_format:
         raise ValueError(
             "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', "
-            "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)
+            f"'NLC', 'NHWC', 'NDHWC' but receive {data_format}"
         )
 
     data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC'
@@ -545,23 +545,21 @@ def local_response_norm(
     if data_format not in ['NCL', 'NLC', 'NCHW', 'NHWC', 'NCDHW', 'NDHWC']:
         raise ValueError(
             "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], "
-            "but got {}".format(data_format)
+            f"but got {data_format}"
         )
 
     sizes = x.shape
     dim = len(sizes)
     if dim < 3:
         raise ValueError(
-            'Expected 3D or higher dimensionality input, but got {} dimensions'.format(
-                dim
-            )
+            f'Expected 3D or higher dimensionality input, but got {dim} dimensions'
         )
 
     for i, sz in enumerate(sizes):
         if not sz > 0 and i > 0:
             raise ValueError(
                 "Expected every dim's size to be larger than 0, "
-                "but the size of the {}-th dim is {}".format(i, sz)
+                f"but the size of the {i}-th dim is {sz}"
             )
 
     channel_last = True if data_format[-1] == "C" else False
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index 2642996ddee77..5e5bf191dfc38 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -48,9 +48,7 @@ def _check_input(x, dimension):
 def _check_instance(x, x_name, types=(int, float)):
     if not isinstance(x, types):
         raise ValueError(
-            "Excepted {} type for {} but received type: {}. ".format(
-                types, x_name, type(x)
-            )
+            f"Excepted {types} type for {x_name} but received type: {type(x)}. "
         )
 
 
@@ -112,9 +110,7 @@ def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False):
         padding = padding.upper()
         if padding not in ["SAME", "VALID"]:
             raise ValueError(
-                "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".format(
-                    padding
-                )
+                f"Unknown padding: '{padding}'. It can only be 'SAME' or 'VALID'."
             )
         if padding == "VALID":
             if ceil_mode is not False:
@@ -135,8 +131,8 @@ def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False):
         if len(padding) == 2 + num_dims and _is_list_or_tuple(padding[0]):
             if not _zero_padding_in_batch_and_channel(padding, channel_last):
                 raise ValueError(
-                    "Non-zero padding({}) in the batch or channel dimensions "
-                    "is not supported.".format(padding)
+                    f"Non-zero padding({padding}) in the batch or channel dimensions "
+                    "is not supported."
                 )
             padding_algorithm = "EXPLICIT"
             padding = _exclude_padding_in_batch_and_channel(
diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py
index d49ca156f77ea..d41ccc975f191 100644
--- a/python/paddle/nn/functional/vision.py
+++ b/python/paddle/nn/functional/vision.py
@@ -267,9 +267,7 @@ def grid_sample(
     _padding_modes = ['zeros', 'reflection', 'border']
     if mode not in _modes:
         raise ValueError(
-            "The mode of grid sample function should be in {}, but got: {}".format(
-                _modes, mode
-            )
+            f"The mode of grid sample function should be in {_modes}, but got: {mode}"
         )
     if padding_mode not in _padding_modes:
         raise ValueError(
@@ -280,9 +278,7 @@ def grid_sample(
 
     if not isinstance(align_corners, bool):
         raise ValueError(
-            "The align corners should be bool, but got: {}".format(
-                align_corners
-            )
+            f"The align corners should be bool, but got: {align_corners}"
         )
 
     cudnn_version = get_cudnn_version()
@@ -371,7 +367,7 @@ def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None):
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCHW' or 'NHWC'."
-            "But recevie Attr(data_format): {} ".format(data_format)
+            f"But recevie Attr(data_format): {data_format} "
         )
     if in_dygraph_mode():
         return _C_ops.pixel_shuffle(x, upscale_factor, data_format)
@@ -419,9 +415,7 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None):
     """
     if len(x.shape) != 4:
         raise ValueError(
-            "Input x should be 4D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 4D tensor, but received x with the shape of {x.shape}"
         )
 
     if not isinstance(downscale_factor, int):
@@ -433,7 +427,7 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None):
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCHW' or 'NHWC'."
-            "But recevie Attr(data_format): {} ".format(data_format)
+            f"But recevie Attr(data_format): {data_format} "
         )
 
     if in_dygraph_mode():
@@ -499,9 +493,7 @@ def channel_shuffle(x, groups, data_format="NCHW", name=None):
     """
     if len(x.shape) != 4:
         raise ValueError(
-            "Input x should be 4D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 4D tensor, but received x with the shape of {x.shape}"
         )
 
     if not isinstance(groups, int):
@@ -513,7 +505,7 @@ def channel_shuffle(x, groups, data_format="NCHW", name=None):
     if data_format not in ["NCHW", "NHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NCHW' or 'NHWC'."
-            "But recevie Attr(data_format): {} ".format(data_format)
+            f"But recevie Attr(data_format): {data_format} "
         )
 
     if in_dygraph_mode():
diff --git a/python/paddle/nn/initializer/initializer.py b/python/paddle/nn/initializer/initializer.py
index c77d6ae466637..6f37e95a79816 100644
--- a/python/paddle/nn/initializer/initializer.py
+++ b/python/paddle/nn/initializer/initializer.py
@@ -160,7 +160,5 @@ def calculate_gain(nonlinearity, param=None):
         return recommended_gain[nonlinearity]
     else:
         raise ValueError(
-            "nonlinearity function {} is not suppported now.".format(
-                nonlinearity
-            )
+            f"nonlinearity function {nonlinearity} is not suppported now."
         )
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index 4bcb19ea95c74..d8a4d0e6fedd0 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -693,9 +693,7 @@ def forward(self, x):
 
     def extra_repr(self):
         name_str = f', name={self._name}' if self._name else ''
-        return 'scale={:.16f}, alpha={:.16f}{}'.format(
-            self._scale, self._alpha, name_str
-        )
+        return f'scale={self._scale:.16f}, alpha={self._alpha:.16f}{name_str}'
 
 
 class LeakyReLU(Layer):
@@ -890,9 +888,7 @@ def forward(self, x):
 
     def extra_repr(self):
         name_str = f', name={self._name}' if self._name else ''
-        return 'beta={}, threshold={}{}'.format(
-            self._beta, self._threshold, name_str
-        )
+        return f'beta={self._beta}, threshold={self._threshold}{name_str}'
 
 
 class Softshrink(Layer):
@@ -1546,9 +1542,7 @@ def __init__(self, name=None):
     def forward(self, x):
         assert (
             x.ndim == 3 or x.ndim == 4
-        ), "Softmax2D requires a 3D or 4D tensor as input. Received: {}D.".format(
-            x.ndim
-        )
+        ), f"Softmax2D requires a 3D or 4D tensor as input. Received: {x.ndim}D."
         return F.softmax(x, axis=-3, dtype=self._dtype, name=self._name)
 
     def extra_repr(self):
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 0c55895d21253..f30ff14e502f1 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -517,9 +517,7 @@ def extra_repr(self):
         else:
             main_str = f'size={self.size}'
         name_str = f', name={self.name}' if self.name else ''
-        return '{}, data_format={}{}'.format(
-            main_str, self.data_format, name_str
-        )
+        return f'{main_str}, data_format={self.data_format}{name_str}'
 
 
 class UpsamplingBilinear2D(Layer):
@@ -606,9 +604,7 @@ def extra_repr(self):
         else:
             main_str = f'size={self.size}'
         name_str = f', name={self.name}' if self.name else ''
-        return '{}, data_format={}{}'.format(
-            main_str, self.data_format, name_str
-        )
+        return f'{main_str}, data_format={self.data_format}{name_str}'
 
 
 class Bilinear(Layer):
@@ -798,9 +794,7 @@ def forward(self, input):
 
     def extra_repr(self):
         name_str = f', name={self.name}' if self.name else ''
-        return 'p={}, axis={}, mode={}{}'.format(
-            self.p, self.axis, self.mode, name_str
-        )
+        return f'p={self.p}, axis={self.axis}, mode={self.mode}{name_str}'
 
 
 class Dropout2D(Layer):
@@ -876,9 +870,7 @@ def forward(self, input):
 
     def extra_repr(self):
         name_str = f', name={self.name}' if self.name else ''
-        return 'p={}, data_format={}{}'.format(
-            self.p, self.data_format, name_str
-        )
+        return f'p={self.p}, data_format={self.data_format}{name_str}'
 
 
 class Dropout3D(Layer):
@@ -956,9 +948,7 @@ def forward(self, input):
 
     def extra_repr(self):
         name_str = f', name={self.name}' if self.name else ''
-        return 'p={}, data_format={}{}'.format(
-            self.p, self.data_format, name_str
-        )
+        return f'p={self.p}, data_format={self.data_format}{name_str}'
 
 
 class AlphaDropout(Layer):
@@ -1224,9 +1214,7 @@ def forward(self, x):
 
     def extra_repr(self):
         name_str = f', name={self._name}' if self._name else ''
-        return 'padding={}, data_format={}{}'.format(
-            self._pad, self._data_format, name_str
-        )
+        return f'padding={self._pad}, data_format={self._data_format}{name_str}'
 
 
 class Pad3D(Layer):
@@ -1496,9 +1484,7 @@ def __init__(
 
         if padding_idx >= num_embeddings or padding_idx < -num_embeddings:
             raise ValueError(
-                "padding_idx must be within [-{}, {})".format(
-                    num_embeddings, num_embeddings
-                )
+                f"padding_idx must be within [-{num_embeddings}, {num_embeddings})"
             )
 
         self._dtype = self._helper.get_default_dtype()
diff --git a/python/paddle/nn/layer/container.py b/python/paddle/nn/layer/container.py
index 09fefb227eceb..1f2986a6395d5 100644
--- a/python/paddle/nn/layer/container.py
+++ b/python/paddle/nn/layer/container.py
@@ -505,9 +505,7 @@ def insert(self, index, sublayer):
         """
         assert isinstance(index, int) and -len(self._sub_layers) <= index < len(
             self._sub_layers
-        ), "index should be an integer in range [{}, {})".format(
-            -len(self), len(self)
-        )
+        ), f"index should be an integer in range [{-len(self)}, {len(self)})"
 
         index = self._get_abs_idx(index)
         for i in range(len(self._sub_layers), index, -1):
diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py
index 623d8d0ed2140..9f4d7d037cf7f 100644
--- a/python/paddle/nn/layer/layers.py
+++ b/python/paddle/nn/layer/layers.py
@@ -1517,9 +1517,7 @@ def is_already_registered(is_pre_hook):
 
         if not isinstance(attrs, dict):
             raise TypeError(
-                "attrs should be type(dict), but received {}".format(
-                    type(attrs).__name__
-                )
+                f"attrs should be type(dict), but received {type(attrs).__name__}"
             )
 
         # NOTE: Overwrite behavior for same key.
@@ -1587,9 +1585,7 @@ def _remove_if_exist(*dicts):
             if len(self._loaddict_holder) > 0:
                 assert (
                     value.name in self._loaddict_holder
-                ), "Parameter not found, Can't not find [ {} ] in state_dict".format(
-                    value.name
-                )
+                ), f"Parameter not found, Can't not find [ {value.name} ] in state_dict"
 
                 value.set_value(self._loaddict_holder[value.name])
 
@@ -1964,10 +1960,8 @@ def _check_match(key, param):
                 if len(state) != len(param):
                     missing_keys.append(key)
                     raise ValueError(
-                        "{} receieves the length of {}, "
-                        "but the expected shape is {}".format(
-                            key, len(state), len(param)
-                        )
+                        f"{key} receieves the length of {len(state)}, "
+                        f"but the expected shape is {len(param)}"
                     )
                 else:
                     match_keys.add(key)
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 77b8dbdaaaaad..f8382ab13fe0e 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -617,7 +617,7 @@ def __init__(self, reduction='mean'):
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "'reduction' in 'MSELoss' should be 'sum', 'mean' or 'none', "
-                "but received {}.".format(reduction)
+                f"but received {reduction}."
             )
         self.reduction = reduction
 
@@ -2009,7 +2009,7 @@ def __init__(
         if reduction not in ['sum', 'mean', 'none']:
             raise ValueError(
                 "'reduction' in 'MultiMarginLoss' should be 'sum', 'mean' or 'none', "
-                "but received {}.".format(reduction)
+                f"but received {reduction}."
             )
         self.p = p
         self.margin = margin
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index 53e71bb26a3b3..9944a4b481126 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -109,9 +109,7 @@ def forward(self, input):
         )
 
     def extra_repr(self):
-        return 'num_features={}, epsilon={}'.format(
-            self._num_features, self._epsilon
-        )
+        return f'num_features={self._num_features}, epsilon={self._epsilon}'
 
 
 class InstanceNorm1D(_InstanceNormBase):
@@ -202,9 +200,7 @@ def __init__(
     def _check_input_dim(self, input):
         if len(input.shape) != 2 and len(input.shape) != 3:
             raise ValueError(
-                'expected 2D or 3D input (got {}D input)'.format(
-                    len(input.shape)
-                )
+                f'expected 2D or 3D input (got {len(input.shape)}D input)'
             )
 
 
@@ -692,9 +688,7 @@ def forward(self, input):
         )
 
     def extra_repr(self):
-        return 'normalized_shape={}, epsilon={}'.format(
-            self._normalized_shape, self._epsilon
-        )
+        return f'normalized_shape={self._normalized_shape}, epsilon={self._epsilon}'
 
 
 class _BatchNormBase(Layer):
@@ -1279,9 +1273,7 @@ def _check_data_format(self, input):
     def _check_input_dim(self, input):
         if len(input.shape) != 2 and len(input.shape) != 3:
             raise ValueError(
-                'expected 2D or 3D input (got {}D input)'.format(
-                    len(input.shape)
-                )
+                f'expected 2D or 3D input (got {len(input.shape)}D input)'
             )
 
 
@@ -1833,9 +1825,7 @@ def forward(self, input):
         return out
 
     def extra_repr(self):
-        main_str = 'size={}, alpha={}, beta={}, k={}'.format(
-            self.size, self.alpha, self.beta, self.k
-        )
+        main_str = f'size={self.size}, alpha={self.alpha}, beta={self.beta}, k={self.k}'
         if self.data_format != 'NCHW':
             main_str += f', data_format={self.data_format}'
         if self.name is not None:
@@ -1922,7 +1912,7 @@ def __init__(
         assert dim < len(self._weight_shape), (
             "The input `dim` should be less than the "
             "length of `weight_shape`, but received dim="
-            "{}".format(dim)
+            f"{dim}"
         )
         h = self._weight_shape[self._dim]
         w = np.prod(self._weight_shape) // h
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index 5a8725771002c..3108aeebeded4 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -955,9 +955,7 @@ def forward(self, input):
         )
 
     def extra_repr(self):
-        return 'output_size={}, return_mask={}'.format(
-            self.output_size, self.return_mask
-        )
+        return f'output_size={self.output_size}, return_mask={self.return_mask}'
 
 
 class AdaptiveMaxPool2D(Layer):
@@ -1041,8 +1039,8 @@ def forward(self, x):
         )
 
     def extra_repr(self):
-        return 'output_size={}, return_mask={}'.format(
-            self._output_size, self._return_mask
+        return (
+            f'output_size={self._output_size}, return_mask={self._return_mask}'
         )
 
 
@@ -1138,8 +1136,8 @@ def forward(self, x):
         )
 
     def extra_repr(self):
-        return 'output_size={}, return_mask={}'.format(
-            self._output_size, self._return_mask
+        return (
+            f'output_size={self._output_size}, return_mask={self._return_mask}'
         )
 
 
diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py
index 1cafb1cf1b614..aa7f6d91edbfa 100644
--- a/python/paddle/nn/layer/rnn.py
+++ b/python/paddle/nn/layer/rnn.py
@@ -805,7 +805,7 @@ def __init__(
         if activation not in ["tanh", "relu"]:
             raise ValueError(
                 "activation for SimpleRNNCell should be tanh or relu, "
-                "but get {}".format(activation)
+                f"but get {activation}"
             )
         self.activation = activation
         self._activation_fn = paddle.tanh if activation == "tanh" else F.relu
@@ -1285,10 +1285,8 @@ def __init__(self, cell_fw, cell_bw, time_major=False):
         self.cell_bw = cell_bw
         if cell_fw.input_size != cell_bw.input_size:
             raise ValueError(
-                "input size of forward cell({}) does not equals"
-                "that of backward cell({})".format(
-                    cell_fw.input_size, cell_bw.input_size
-                )
+                f"input size of forward cell({cell_fw.input_size}) does not equals"
+                f"that of backward cell({cell_bw.input_size})"
             )
         for cell in [self.cell_fw, self.cell_bw]:
             if not hasattr(cell, "call"):
@@ -1380,7 +1378,7 @@ def __init__(
         else:
             raise ValueError(
                 "direction should be forward or bidirect (or bidirectional), "
-                "received direction = {}".format(direction)
+                f"received direction = {direction}"
             )
 
         self.could_use_cudnn = True
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 5cd1902fce318..0c70922ce6a67 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -171,11 +171,11 @@ def __init__(
 
         assert embed_dim > 0, (
             "Expected embed_dim to be greater than 0, "
-            "but received {}".format(embed_dim)
+            f"but received {embed_dim}"
         )
         assert num_heads > 0, (
             "Expected num_heads to be greater than 0, "
-            "but received {}".format(num_heads)
+            f"but received {num_heads}"
         )
 
         self.embed_dim = embed_dim
@@ -524,19 +524,15 @@ def __init__(
 
         super().__init__()
 
-        assert (
-            d_model > 0
-        ), "Expected d_model to be greater than 0, " "but received {}".format(
-            d_model
+        assert d_model > 0, (
+            "Expected d_model to be greater than 0, " f"but received {d_model}"
         )
-        assert (
-            nhead > 0
-        ), "Expected nhead to be greater than 0, " "but received {}".format(
-            nhead
+        assert nhead > 0, (
+            "Expected nhead to be greater than 0, " f"but received {nhead}"
         )
         assert dim_feedforward > 0, (
             "Expected dim_feedforward to be greater than 0, "
-            "but received {}".format(dim_feedforward)
+            f"but received {dim_feedforward}"
         )
 
         attn_dropout = dropout if attn_dropout is None else attn_dropout
@@ -854,19 +850,15 @@ def __init__(
 
         super().__init__()
 
-        assert (
-            d_model > 0
-        ), "Expected d_model to be greater than 0, " "but received {}".format(
-            d_model
+        assert d_model > 0, (
+            "Expected d_model to be greater than 0, " f"but received {d_model}"
         )
-        assert (
-            nhead > 0
-        ), "Expected nhead to be greater than 0, " "but received {}".format(
-            nhead
+        assert nhead > 0, (
+            "Expected nhead to be greater than 0, " f"but received {nhead}"
         )
         assert dim_feedforward > 0, (
             "Expected dim_feedforward to be greater than 0, "
-            "but received {}".format(dim_feedforward)
+            f"but received {dim_feedforward}"
         )
 
         attn_dropout = dropout if attn_dropout is None else attn_dropout
@@ -1294,19 +1286,15 @@ def __init__(
     ):
         super().__init__()
 
-        assert (
-            d_model > 0
-        ), "Expected d_model to be greater than 0, " "but received {}".format(
-            d_model
+        assert d_model > 0, (
+            "Expected d_model to be greater than 0, " f"but received {d_model}"
         )
-        assert (
-            nhead > 0
-        ), "Expected nhead to be greater than 0, " "but received {}".format(
-            nhead
+        assert nhead > 0, (
+            "Expected nhead to be greater than 0, " f"but received {nhead}"
         )
         assert dim_feedforward > 0, (
             "Expected dim_feedforward to be greater than 0, "
-            "but received {}".format(dim_feedforward)
+            f"but received {dim_feedforward}"
         )
 
         if isinstance(bias_attr, (list, tuple)):
diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py
index 996699e513c37..faa946dfab57b 100644
--- a/python/paddle/nn/layer/vision.py
+++ b/python/paddle/nn/layer/vision.py
@@ -66,7 +66,7 @@ def __init__(self, upscale_factor, data_format="NCHW", name=None):
         if data_format not in ["NCHW", "NHWC"]:
             raise ValueError(
                 "Data format should be 'NCHW' or 'NHWC'."
-                "But recevie data format: {}".format(data_format)
+                f"But recevie data format: {data_format}"
             )
 
         self._upscale_factor = upscale_factor
@@ -132,7 +132,7 @@ def __init__(self, downscale_factor, data_format="NCHW", name=None):
         if data_format not in ["NCHW", "NHWC"]:
             raise ValueError(
                 "Data format should be 'NCHW' or 'NHWC'."
-                "But recevie data format: {}".format(data_format)
+                f"But recevie data format: {data_format}"
             )
 
         self._downscale_factor = downscale_factor
@@ -211,7 +211,7 @@ def __init__(self, groups, data_format="NCHW", name=None):
         if data_format not in ["NCHW", "NHWC"]:
             raise ValueError(
                 "Data format should be 'NCHW' or 'NHWC'."
-                "But recevie data format: {}".format(data_format)
+                f"But recevie data format: {data_format}"
             )
 
         self._groups = groups
diff --git a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py
index 050f2a533f98d..4c39154c6ab0f 100644
--- a/python/paddle/nn/utils/spectral_norm_hook.py
+++ b/python/paddle/nn/utils/spectral_norm_hook.py
@@ -34,7 +34,7 @@ def __init__(self, name='weight', n_power_iterations=1, dim=0, eps=1e-12):
         if n_power_iterations <= 0:
             raise ValueError(
                 'Expected n_power_iterations to be positive, but '
-                'got n_power_iterations={}'.format(n_power_iterations)
+                f'got n_power_iterations={n_power_iterations}'
             )
         self.n_power_iterations = n_power_iterations
         self.eps = eps
@@ -104,7 +104,7 @@ def apply(layer, name, n_power_iterations, dim, eps):
             if isinstance(hook, SpectralNorm) and hook.name == name:
                 raise RuntimeError(
                     "Cannot register two spectral_norm hooks on "
-                    "the same parameter {}".format(name)
+                    f"the same parameter {name}"
                 )
 
         fn = SpectralNorm(name, n_power_iterations, dim, eps)
diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py
index 4ef5fdf2deefd..acbb6a57289d9 100644
--- a/python/paddle/nn/utils/weight_norm_hook.py
+++ b/python/paddle/nn/utils/weight_norm_hook.py
@@ -113,7 +113,7 @@ def apply(layer, name, dim):
             if isinstance(hook, WeightNorm) and hook.name == name:
                 raise RuntimeError(
                     "Cannot register two weight_norm hooks on "
-                    "the same parameter {}".format(name)
+                    f"the same parameter {name}"
                 )
 
         if dim is None:
diff --git a/python/paddle/onnx/export.py b/python/paddle/onnx/export.py
index c108e56ebee3f..433d6147847ea 100644
--- a/python/paddle/onnx/export.py
+++ b/python/paddle/onnx/export.py
@@ -95,7 +95,7 @@ def export(layer, path, input_spec=None, opset_version=9, **configs):
         raise ValueError(
             "The input path MUST be format of dirname/file_prefix "
             "[dirname\\file_prefix in Windows system], but "
-            "the file_prefix is empty in received path: {}".format(path)
+            f"the file_prefix is empty in received path: {path}"
         )
     save_file = path + '.onnx'
 
@@ -104,5 +104,5 @@ def export(layer, path, input_spec=None, opset_version=9, **configs):
         save_file,
         input_spec=input_spec,
         opset_version=opset_version,
-        **configs
+        **configs,
     )
diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py
index 113b95eec2598..6fb777447f8a1 100644
--- a/python/paddle/optimizer/lr.py
+++ b/python/paddle/optimizer/lr.py
@@ -1889,9 +1889,7 @@ def __init__(
         # Check type and value of total_steps
         if not isinstance(total_steps, int):
             raise TypeError(
-                "'total_step' must be 'int', but received {}".format(
-                    type(total_steps)
-                )
+                f"'total_step' must be 'int', but received {type(total_steps)}"
             )
         if total_steps <= 0:
             raise ValueError("'total_step' must be a positive integer.")
@@ -1900,15 +1898,11 @@ def __init__(
         # Check type and value of pac_start
         if not isinstance(phase_pct, float):
             raise TypeError(
-                "'phase_pct' must be 'float', but received {}".format(
-                    type(phase_pct)
-                )
+                f"'phase_pct' must be 'float', but received {type(phase_pct)}"
             )
         if phase_pct < 0 or phase_pct > 1:
             raise ValueError(
-                "'phase_pct' must be between 0 and 1, but received {}".format(
-                    phase_pct
-                )
+                f"'phase_pct' must be between 0 and 1, but received {phase_pct}"
             )
 
         # Check type and value of divide_factor
@@ -2163,9 +2157,7 @@ def __init__(
         # check type of exp_gamma
         if not isinstance(exp_gamma, float):
             raise TypeError(
-                "The type of 'exp_gamma' must be float, but received {}".format(
-                    type(exp_gamma)
-                )
+                f"The type of 'exp_gamma' must be float, but received {type(exp_gamma)}"
             )
 
         step_size_up = float(step_size_up)
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 45a1069750bd9..cc758a4d2159d 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -396,9 +396,7 @@ def set_state_dict(self, state_dict):
                     load_para_np = load_para
                 else:
                     raise RuntimeError(
-                        "State dict type {} not supprt".format(
-                            str(type(load_para))
-                        )
+                        f"State dict type {str(type(load_para))} not supprt"
                     )
 
                 assert (
@@ -844,9 +842,7 @@ def _add_accumulator(
             if framework.in_dygraph_mode():
                 return self._accumulators[name][param.name]
             raise Exception(
-                "Accumulator {} already exists for parameter {}".format(
-                    name, param.name
-                )
+                f"Accumulator {name} already exists for parameter {param.name}"
             )
         if shape is None:
             shape = param.shape
@@ -892,9 +888,7 @@ def _add_accumulator(
             if len(self._accumulators_holder) > 0:
                 assert (
                     var_name in self._accumulators_holder
-                ), "Optimizer set error, {} should in state dict".format(
-                    var_name
-                )
+                ), f"Optimizer set error, {var_name} should in state dict"
                 var.set_value(self._accumulators_holder.pop(var_name))
 
         self._accumulators[name][param.name] = var
@@ -917,9 +911,7 @@ def _get_accumulator(self, name, param):
             or param.name not in self._accumulators[name]
         ):
             raise Exception(
-                "Accumulator {} does not exist for parameter {}".format(
-                    name, param.name
-                )
+                f"Accumulator {name} does not exist for parameter {param.name}"
             )
         return self._accumulators[name][param.name]
 
@@ -945,9 +937,7 @@ def _get_accumulator_master(self, name, param):
             or target_name not in self._accumulators[name]
         ):
             raise Exception(
-                "Accumulator {} does not exist for parameter {}".format(
-                    name, target_name
-                )
+                f"Accumulator {name} does not exist for parameter {target_name}"
             )
         return self._accumulators[name][target_name]
 
diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py
index 63fcaeffebcd4..e6b4f2b9347c3 100644
--- a/python/paddle/profiler/profiler.py
+++ b/python/paddle/profiler/profiler.py
@@ -249,17 +249,13 @@ def export_chrome_tracing(
             os.makedirs(dir_name, exist_ok=True)
         except Exception:
             raise RuntimeError(
-                "Can not create directory '{}' for saving profiling results.".format(
-                    dir_name
-                )
+                f"Can not create directory '{dir_name}' for saving profiling results."
             )
 
     def handle_fn(prof):
         nonlocal worker_name
         if not worker_name:
-            worker_name = "host_{}pid_{}".format(
-                socket.gethostname(), str(os.getpid())
-            )
+            worker_name = f"host_{socket.gethostname()}pid_{str(os.getpid())}"
         now = datetime.datetime.now()
         filename = '{}_time_{}.paddle_trace.json'.format(
             worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f')
@@ -307,17 +303,13 @@ def export_protobuf(
             os.makedirs(dir_name, exist_ok=True)
         except Exception:
             raise RuntimeError(
-                "Can not create directory '{}' for saving profiling results.".format(
-                    dir_name
-                )
+                f"Can not create directory '{dir_name}' for saving profiling results."
             )
 
     def handle_fn(prof):
         nonlocal worker_name
         if not worker_name:
-            worker_name = "host_{}pid_{}".format(
-                socket.gethostname(), str(os.getpid())
-            )
+            worker_name = f"host_{socket.gethostname()}pid_{str(os.getpid())}"
         now = datetime.datetime.now()
         filename = '{}_time_{}.paddle_trace.pb'.format(
             worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f')
@@ -501,9 +493,7 @@ def __init__(
                 if target not in supported_targets:
                     self.targets.remove(target)
                     warn(
-                        "Profiling {} is not supported in current context.".format(
-                            target
-                        )
+                        f"Profiling {target} is not supported in current context."
                     )
         else:
             self.targets = supported_targets
diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
index 23c38d804f1a1..cbedd7d30a627 100755
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -276,9 +276,7 @@ def print_layer_tree(node, depth=0):
             flops_n = _format_large_number(node.flops)
             flops_s = _format_large_number(node.flops * 1e9 / node.cpu_time)
             ret.append(
-                "{}{} latency: {}, FLOPs: {}, FLOPS: {}\n".format(
-                    align, name, tm, flops_n, flops_s
-                )
+                f"{align}{name} latency: {tm}, FLOPs: {flops_n}, FLOPS: {flops_s}\n"
             )
 
     for n in node[1:]:
diff --git a/python/paddle/profiler/timer.py b/python/paddle/profiler/timer.py
index 3fd4eeafde36f..eace45478d38c 100644
--- a/python/paddle/profiler/timer.py
+++ b/python/paddle/profiler/timer.py
@@ -388,9 +388,8 @@ def step_info(self, unit):
             message += ' {}: {:.5f} s'.format('batch_cost', batch_average)
         speed_average = self.current_event.speed_average()
         if speed_average:
-            message += ' ips: {:.3f} {}'.format(
-                speed_average,
-                self.current_event.speed_unit,
+            message += (
+                f' ips: {speed_average:.3f} {self.current_event.speed_unit}'
             )
         self.current_event.reset()
         return message
diff --git a/python/paddle/signal.py b/python/paddle/signal.py
index a6aa6f112d3dd..8e64bc2e3400a 100644
--- a/python/paddle/signal.py
+++ b/python/paddle/signal.py
@@ -374,9 +374,7 @@ def stft(
         assert pad_mode in [
             'constant',
             'reflect',
-        ], 'pad_mode should be "reflect" or "constant", but got "{}".'.format(
-            pad_mode
-        )
+        ], f'pad_mode should be "reflect" or "constant", but got "{pad_mode}".'
 
         pad_length = n_fft // 2
         # FIXME: Input `x` can be a complex tensor but pad does not support complex input.
@@ -517,9 +515,7 @@ def istft(
     assert x_rank in [
         2,
         3,
-    ], 'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(
-        x_rank
-    )
+    ], f'x should be a 2D or 3D complex tensor, but got rank of x is {x_rank}'
 
     if x_rank == 2:  # (batch, n_fft, n_frames)
         x = x.unsqueeze(0)
@@ -533,15 +529,11 @@ def istft(
     # Assure no gaps between frames.
     assert (
         0 < hop_length <= win_length
-    ), 'hop_length should be in (0, win_length({})], but got {}.'.format(
-        win_length, hop_length
-    )
+    ), f'hop_length should be in (0, win_length({win_length})], but got {hop_length}.'
 
     assert (
         0 < win_length <= n_fft
-    ), 'win_length should be in (0, n_fft({})], but got {}.'.format(
-        n_fft, win_length
-    )
+    ), f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.'
 
     n_frames = x.shape[-1]
     fft_size = x.shape[-2]
diff --git a/python/paddle/sparse/creation.py b/python/paddle/sparse/creation.py
index 9d567496c5da5..f5a7fe306723f 100644
--- a/python/paddle/sparse/creation.py
+++ b/python/paddle/sparse/creation.py
@@ -155,9 +155,7 @@ def sparse_coo_tensor(
             shape = list(shape)
             if shape < min_shape:
                 raise ValueError(
-                    "the minimun shape required is {}, but get {}".format(
-                        min_shape, shape
-                    )
+                    f"the minimun shape required is {min_shape}, but get {shape}"
                 )
             if len(shape) != sparse_dim + dense_dim:
                 raise ValueError(
@@ -247,9 +245,7 @@ def sparse_csr_tensor(
 
     if len(shape) != 2 and len(shape) != 3:
         raise ValueError(
-            "SparseCsrTensor only support 2-D or 3-D matrix. but get shape {}".format(
-                shape
-            )
+            f"SparseCsrTensor only support 2-D or 3-D matrix. but get shape {shape}"
         )
     rows = shape[len(shape) - 2]
 
diff --git a/python/paddle/sparse/nn/functional/conv.py b/python/paddle/sparse/nn/functional/conv.py
index 91fe020eeea47..ccbe8ca8f003e 100644
--- a/python/paddle/sparse/nn/functional/conv.py
+++ b/python/paddle/sparse/nn/functional/conv.py
@@ -43,28 +43,24 @@ def _conv3d(
     if data_format not in ["NDHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NDHWC'. Received "
-            "Attr(data_format): {}.".format(data_format)
+            f"Attr(data_format): {data_format}."
         )
     if len(x.shape) != 5:
         raise ValueError(
-            "Input x should be 5D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 5D tensor, but received x with the shape of {x.shape}"
         )
 
     channel_last = data_format == "NDHWC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 5:
         raise ValueError(
-            "Input x should be 5D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 5D tensor, but received x with the shape of {x.shape}"
         )
     num_channels = x.shape[channel_dim]
     if num_channels < 0:
         raise ValueError(
-            "The channel dimension of the input({}) should be defined. "
-            "Received: {}.".format(x.shape, num_channels)
+            f"The channel dimension of the input({x.shape}) should be defined. "
+            f"Received: {num_channels}."
         )
 
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, dims)
@@ -136,28 +132,24 @@ def _conv2d(
     if data_format not in ["NHWC"]:
         raise ValueError(
             "Attr(data_format) should be 'NHWC'. Received "
-            "Attr(data_format): {}.".format(data_format)
+            f"Attr(data_format): {data_format}."
         )
     if len(x.shape) != 4:
         raise ValueError(
-            "Input x should be 4D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 4D tensor, but received x with the shape of {x.shape}"
         )
 
     channel_last = data_format == "NHWC"
     channel_dim = -1 if channel_last else 1
     if len(x.shape) != 4:
         raise ValueError(
-            "Input x should be 4D tensor, but received x with the shape of {}".format(
-                x.shape
-            )
+            f"Input x should be 4D tensor, but received x with the shape of {x.shape}"
         )
     num_channels = x.shape[channel_dim]
     if num_channels < 0:
         raise ValueError(
-            "The channel dimension of the input({}) should be defined. "
-            "Received: {}.".format(x.shape, num_channels)
+            f"The channel dimension of the input({x.shape}) should be defined. "
+            f"Received: {num_channels}."
         )
 
     padding, padding_algorithm = _update_padding_nd(padding, channel_last, dims)
diff --git a/python/paddle/sparse/unary.py b/python/paddle/sparse/unary.py
index c49a7b03d26b4..9f9bafc135b63 100644
--- a/python/paddle/sparse/unary.py
+++ b/python/paddle/sparse/unary.py
@@ -1201,8 +1201,8 @@ def svd_lowrank(x, q=6, niter=2, M=None):
         q = min(6, m, n)
     elif not (q >= 0 and q <= min(m, n)):
         raise ValueError(
-            'q(={}) must be non-negative integer'
-            ' and not greater than min(m, n)={}'.format(q, min(m, n))
+            f'q(={q}) must be non-negative integer'
+            f' and not greater than min(m, n)={min(m, n)}'
         )
     if not (niter >= 0):
         raise ValueError(f'niter(={niter}) must be non-negative integer')
diff --git a/python/paddle/static/amp/bf16/amp_utils.py b/python/paddle/static/amp/bf16/amp_utils.py
index 071328435e939..bb8d62d85b8cd 100644
--- a/python/paddle/static/amp/bf16/amp_utils.py
+++ b/python/paddle/static/amp/bf16/amp_utils.py
@@ -341,16 +341,12 @@ def cast_model_to_bf16(
                         in_var = block.var(in_var_name)
                     except ValueError as e:
                         _logger.debug(
-                            "-- {}, try to get it in the global block --".format(
-                                e
-                            )
+                            f"-- {e}, try to get it in the global block --"
                         )
                         in_var = global_block.var(in_var_name)
                         if in_var is not None:
                             _logger.debug(
-                                "-- var {} is got in the global block --".format(
-                                    in_var_name
-                                )
+                                f"-- var {in_var_name} is got in the global block --"
                             )
 
                     if in_var is None or in_var.type not in _valid_types:
@@ -379,16 +375,12 @@ def cast_model_to_bf16(
                         out_var = block.var(out_var_name)
                     except ValueError as e:
                         _logger.debug(
-                            "-- {}, try to get it in the global block --".format(
-                                e
-                            )
+                            f"-- {e}, try to get it in the global block --"
                         )
                         out_var = global_block.var(out_var_name)
                         if out_var is not None:
                             _logger.debug(
-                                "-- var {} is got in the global block --".format(
-                                    out_var_name
-                                )
+                                f"-- var {out_var_name} is got in the global block --"
                             )
 
                     if out_var is None or out_var.type not in _valid_types:
diff --git a/python/paddle/static/amp/fp16_utils.py b/python/paddle/static/amp/fp16_utils.py
index 5119425348b42..77a522b0c293b 100644
--- a/python/paddle/static/amp/fp16_utils.py
+++ b/python/paddle/static/amp/fp16_utils.py
@@ -492,9 +492,7 @@ def get_promote_dtype(op, amp_dtype, block):
         # for ipu, all inputs must be converted to fp16
         if not core.is_compiled_with_ipu() and _keep_fp32_input(op, in_name):
             _logger.debug(
-                "---- Input {} {} should be kept fp32 ----".format(
-                    in_name, op.input(in_name)
-                )
+                f"---- Input {in_name} {op.input(in_name)} should be kept fp32 ----"
             )
             continue
         # if this op has inputs
diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py
index 27682416f8c1b..8f68f3f9e89bf 100644
--- a/python/paddle/static/io.py
+++ b/python/paddle/static/io.py
@@ -1304,9 +1304,7 @@ def load_vars(
             var_path = os.path.join(dirname, new_var.name)
             if not os.path.exists(var_path):
                 raise ValueError(
-                    "SelectedRows var {} can not find at {}".format(
-                        new_var.name, var_path
-                    )
+                    f"SelectedRows var {new_var.name} can not find at {var_path}"
                 )
 
             if os.path.isfile(var_path):
@@ -1441,9 +1439,7 @@ def save(program, model_path, protocol=4, **configs):
 
     if not isinstance(protocol, int):
         raise ValueError(
-            "The 'protocol' MUST be `int`, but received {}".format(
-                type(protocol)
-            )
+            f"The 'protocol' MUST be `int`, but received {type(protocol)}"
         )
 
     if protocol < 2 or protocol > 4:
@@ -1677,9 +1673,7 @@ def set_var(var, ndarray):
     for v in parameter_list:
         assert (
             v.name in load_dict
-        ), "Can not find [{}] in model file [{}]".format(
-            v.name, parameter_file_name
-        )
+        ), f"Can not find [{v.name}] in model file [{parameter_file_name}]"
         set_var(v, load_dict[v.name])
 
     optimizer_var_list = list(
@@ -1702,9 +1696,7 @@ def set_var(var, ndarray):
         for v in optimizer_var_list:
             assert (
                 v.name in load_dict
-            ), "Can not find [{}] in model file [{}]".format(
-                v.name, opt_file_name
-            )
+            ), f"Can not find [{v.name}] in model file [{opt_file_name}]"
             set_var(v, load_dict[v.name])
 
 
@@ -1753,9 +1745,7 @@ def set_program_state(program, state_dict):
         var_temp = paddle.base.global_scope().find_var(para.name)
         assert (
             var_temp is not None
-        ), "Variable [ {} ] Not found, Please make sure run startup program".format(
-            para.name
-        )
+        ), f"Variable [ {para.name} ] Not found, Please make sure run startup program"
         if para.name in state_dict:
             # set value from state dict
             orig_para_np = np.array(var_temp.get_tensor())
diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py
index e0c189b11e356..21b5b56729b86 100644
--- a/python/paddle/static/nn/common.py
+++ b/python/paddle/static/nn/common.py
@@ -901,8 +901,7 @@ def conv2d(
     )
     if len(input.shape) != 4:
         raise ValueError(
-            "Input size should be 4, "
-            "but received {}".format(len(input.shape))
+            "Input size should be 4, " f"but received {len(input.shape)}"
         )
     num_channels = input.shape[1]
     if not isinstance(use_cudnn, bool):
@@ -931,7 +930,7 @@ def conv2d(
     elif groups <= 0:
         raise ValueError(
             "the groups of input must be greater than 0, "
-            "but received the groups of input is {}".format(groups)
+            f"but received the groups of input is {groups}"
         )
     else:
         if num_channels % groups != 0:
@@ -1020,8 +1019,8 @@ def _get_default_param_initializer():
         if filter_elem_num <= 0:
             raise ValueError(
                 "Invalid filter number, excepted number is larger than 0, but"
-                " received {}, please check the input shape and "
-                "filter size.".format(filter_elem_num)
+                f" received {filter_elem_num}, please check the input shape and "
+                "filter size."
             )
         std = (2.0 / filter_elem_num) ** 0.5
         return Normal(0.0, std)
@@ -1246,9 +1245,7 @@ def conv3d(
         num_filter_channels = num_channels
     elif groups <= 0:
         raise ValueError(
-            "the groups of conv3d should be greater than 0. Received groups: {}".format(
-                groups
-            )
+            f"the groups of conv3d should be greater than 0. Received groups: {groups}"
         )
     else:
         if num_channels % groups != 0:
@@ -1325,8 +1322,8 @@ def _get_default_param_initializer():
         if filter_elem_num <= 0:
             raise ValueError(
                 "Invalid filter number, excepted number is larger than 0, but"
-                " received {}, please check the input shape and "
-                "filter size.".format(filter_elem_num)
+                f" received {filter_elem_num}, please check the input shape and "
+                "filter size."
             )
 
         std = (2.0 / filter_elem_num) ** 0.5
@@ -1554,8 +1551,7 @@ def conv2d_transpose(
     ), "param_attr should not be False in conv2d_transpose."
     if len(input.shape) != 4:
         raise ValueError(
-            "Input size should be 4, "
-            "but received {}".format(len(input.shape))
+            "Input size should be 4, " f"but received {len(input.shape)}"
         )
 
     if num_filters == 0:
@@ -1712,7 +1708,7 @@ def _update_padding(padding, data_format):
     elif groups <= 0:
         raise ValueError(
             "the groups of input must be greater than 0, "
-            "but received the groups of input is {}".format(groups)
+            f"but received the groups of input is {groups}"
         )
 
     filter_shape = [input_channel, num_filters // groups] + filter_size
@@ -2075,9 +2071,7 @@ def _update_padding(padding, data_format):
     if num_filters % groups != 0:
         raise ValueError(
             "Attr(num_filters) must be divisible by groups,"
-            "Received: Attr(num_filters) is {}, the groups is {}".format(
-                num_filters, groups
-            )
+            f"Received: Attr(num_filters) is {num_filters}, the groups is {groups}"
         )
 
     filter_shape = [input_channel, num_filters // groups] + filter_size
@@ -2303,8 +2297,8 @@ def _get_default_param_initializer():
         if filter_elem_num <= 0:
             raise ValueError(
                 "Invalid filter number, excepted number is larger than 0, but"
-                " received {}, please check the input shape and "
-                "filter size.".format(filter_elem_num)
+                f" received {filter_elem_num}, please check the input shape and "
+                "filter size."
             )
         std = (2.0 / filter_elem_num) ** 0.5
         return paddle.nn.initializer.normal.Normal(0.0, std)
@@ -3010,7 +3004,7 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None):
         if data_format not in true_data_format:
             raise ValueError(
                 "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', "
-                "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)
+                f"'NLC', 'NHWC', 'NDHWC' but receive {data_format}"
             )
 
         data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC'
@@ -3324,9 +3318,7 @@ def py_func(func, x, out, backward_func=None, skip_vars_in_backward_input=None):
         for v in skip_vars_in_backward_input:
             if v.name not in fwd_in_out:
                 raise ValueError(
-                    'Tensor {} is not found in forward inputs and outputs'.format(
-                        v.name
-                    )
+                    f'Tensor {v.name} is not found in forward inputs and outputs'
                 )
             backward_skip_vars.add(v.name)
 
diff --git a/python/paddle/static/nn/control_flow.py b/python/paddle/static/nn/control_flow.py
index 87f9ae321d6f8..0d0afcc71c150 100644
--- a/python/paddle/static/nn/control_flow.py
+++ b/python/paddle/static/nn/control_flow.py
@@ -750,14 +750,8 @@ def create_var_like(o_var):
 
 def _error_message(what, arg_name, op_name, right_value, error_value):
     error_message = (
-        "{what} of '{arg_name}' in {op_name} must be "
-        "{right_value}, but received: {error_value}.".format(
-            what=what,
-            arg_name=arg_name,
-            op_name=op_name,
-            right_value=right_value,
-            error_value=error_value,
-        )
+        f"{what} of '{arg_name}' in {op_name} must be "
+        f"{right_value}, but received: {error_value}."
     )
 
     return error_message
@@ -871,8 +865,8 @@ def _case_check_args(pred_fn_pairs, default):
 
             if not callable(fn):
                 raise TypeError(
-                    "The fn for {} of pred_fn_pairs in Op(case) must"
-                    " be callable.".format(pred.name)
+                    f"The fn for {pred.name} of pred_fn_pairs in Op(case) must"
+                    " be callable."
                 )
 
         if default is None:
diff --git a/python/paddle/static/quantization/quantization_pass.py b/python/paddle/static/quantization/quantization_pass.py
index 4387732362f9a..ac318440aca57 100644
--- a/python/paddle/static/quantization/quantization_pass.py
+++ b/python/paddle/static/quantization/quantization_pass.py
@@ -1796,9 +1796,7 @@ def apply(self, graph):
                     scale_var = self._scope.find_var(scale_name)
                     assert (
                         scale_var is not None
-                    ), "Can not find {} variable in the scope".format(
-                        scale_name
-                    )
+                    ), f"Can not find {scale_name} variable in the scope"
                     scale_value = np.array(scale_var.get_tensor())[0]
 
                     # For compatibility, we save output threshold by two methods.
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 33c46f98ef3cf..286dcd261d8fe 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -107,10 +107,8 @@ def transpose(x, perm, name=None):
             raise ValueError(
                 "Input(perm) is the permutation of dimensions of Input(x), "
                 "its length should be equal to dimensions of Input(x), "
-                "but received dimension of Input(x) is {}, "
-                "the length of Input(perm) is {}.".format(
-                    len(x.shape), len(perm)
-                )
+                f"but received dimension of Input(x) is {len(x.shape)}, "
+                f"the length of Input(perm) is {len(perm)}."
             )
         for idx, dim in enumerate(perm):
             if dim >= len(x.shape):
@@ -594,9 +592,7 @@ def p_matrix_norm(input, porder=1.0, axis=axis, keepdim=False, name=None):
             )
         else:
             raise ValueError(
-                "unspport p for p-order vector norm. except float, found {}".format(
-                    p
-                )
+                f"unspport p for p-order vector norm. except float, found {p}"
             )
     # calculate matrix norm, where axis is list with two integers
     elif isinstance(axis, list) and len(axis) == 2:
@@ -616,9 +612,7 @@ def p_matrix_norm(input, porder=1.0, axis=axis, keepdim=False, name=None):
             )
     else:
         raise ValueError(
-            "except axis type int or list (length of list <=2), found {}".format(
-                axis
-            )
+            f"except axis type int or list (length of list <=2), found {axis}"
         )
 
 
@@ -1255,7 +1249,7 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None):
         if fweights.min() < 0:
             raise ValueError(
                 "The value of Input(fweights) cannot be negtive, but received "
-                "min of Input(fweights) is {}.".format(fweights.min())
+                f"min of Input(fweights) is {fweights.min()}."
             )
         if not paddle.all(fweights == paddle.round(fweights.astype('float64'))):
             raise ValueError("Input(fweights) must be integer ")
@@ -1280,7 +1274,7 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None):
         if aweights.min() < 0:
             raise ValueError(
                 "The value of Input(aweights) cannot be negtive, but received "
-                "min of Input(aweights) is {}.".format(aweights.min())
+                f"min of Input(aweights) is {aweights.min()}."
             )
         if w is not None:
             w = w * aw
@@ -1834,9 +1828,7 @@ def __check_input(x, vec):
             vec_shape = list(vec.shape)
             if len(x_shape) != 2:
                 raise ValueError(
-                    "x should be 2-dimensional. But received x's dimention: {}".format(
-                        x_shape
-                    )
+                    f"x should be 2-dimensional. But received x's dimention: {x_shape}"
                 )
             if len(vec_shape) != 1:
                 raise ValueError(
@@ -2192,8 +2184,8 @@ def svd_lowrank(x, q=6, niter=2, M=None):
         q = min(6, m, n)
     elif not (q >= 0 and q <= min(m, n)):
         raise ValueError(
-            'q(={}) must be non-negative integer'
-            ' and not greater than min(m, n)={}'.format(q, min(m, n))
+            f'q(={q}) must be non-negative integer'
+            f' and not greater than min(m, n)={min(m, n)}'
         )
     if not (niter >= 0):
         raise ValueError(f'niter(={niter}) must be non-negative integer')
@@ -3669,8 +3661,8 @@ def cdist(
     )
     assert x_shape[-1] == y_shape[-1], (
         "The x and y must have same last dimension, "
-        "But received Input x's last dimension is {}, "
-        "Input y's last dimension is {}.\n".format(x_shape[-1], y_shape[-1])
+        f"But received Input x's last dimension is {x_shape[-1]}, "
+        f"Input y's last dimension is {y_shape[-1]}.\n"
     )
     assert p >= 0, (
         "The p must be greater than or equal to 0, "
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 2e366150d3632..777a47968e591 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -330,9 +330,7 @@ def slice(input, axes, starts, ends):
 
         else:
             raise ValueError(
-                "Input axes must be a python list or tuple, but reveived {}".format(
-                    type(axes)
-                )
+                f"Input axes must be a python list or tuple, but reveived {type(axes)}"
             )
 
         infer_flags = [1 for i in range(len(axes))]
@@ -502,10 +500,8 @@ def transpose(x, perm, name=None):
             raise ValueError(
                 "Input(perm) is the permutation of dimensions of Input(x), "
                 "its length should be equal to dimensions of Input(x), "
-                "but received dimension of Input(x) is {}, "
-                "the length of Input(perm) is {}.".format(
-                    len(x.shape), len(perm)
-                )
+                f"but received dimension of Input(x) is {len(x.shape)}, "
+                f"the length of Input(perm) is {len(perm)}."
             )
         for idx, dim in enumerate(perm):
             if dim >= len(x.shape):
@@ -553,9 +549,7 @@ def unstack(x, axis=0, num=None):
 
     """
     if not (-x.ndim <= axis < x.ndim):
-        raise ValueError(
-            '`axis` must be in the range [-{0}, {0})'.format(x.ndim)
-        )
+        raise ValueError(f'`axis` must be in the range [-{x.ndim}, {x.ndim})')
     if num is not None and (num < 0 or num > x.shape[axis]):
         raise ValueError(f'`num` must be in the range [0, {x.shape[axis]})')
     if in_dynamic_mode():
@@ -1451,15 +1445,11 @@ def rot90(x, k=1, axes=[0, 1], name=None):
     total_rot_dims = len(axes)
     if total_rot_dims != 2:
         raise ValueError(
-            "expected total rotation axes == 2, but got axes = {}".format(
-                total_rot_dims
-            )
+            f"expected total rotation axes == 2, but got axes = {total_rot_dims}"
         )
     if input_total_dims < 2:
         raise ValueError(
-            "expected total dims >= 2, but got total dims = {}".format(
-                input_total_dims
-            )
+            f"expected total dims >= 2, but got total dims = {input_total_dims}"
         )
 
     if not (axes[0] != axes[1] and abs(axes[0] - axes[1]) != input_total_dims):
@@ -2130,9 +2120,7 @@ def vsplit(x, num_or_sections, name=None):
     """
     if x.ndim < 2:
         raise ValueError(
-            "The input tensor's dimension must be greater than 1, but got {}".format(
-                x.ndim
-            )
+            f"The input tensor's dimension must be greater than 1, but got {x.ndim}"
         )
     return split(x, num_or_sections, axis=0, name=name)
 
@@ -3720,7 +3708,7 @@ def reshape_(x, shape, name=None):
         else:
             raise ValueError(
                 "shape must be an instance of `list`, `tuple` or `Variable`,"
-                " got '{}.'".format(type(shape))
+                f" got '{type(shape)}.'"
             )
 
         return out
@@ -4480,12 +4468,12 @@ def moveaxis(x, source, destination, name=None):
         if axis[0] < 0:
             assert (
                 axis[0] >= -ndim
-            ), "'source' must be in the range of [-{0}, {0})".format(ndim)
+            ), f"'source' must be in the range of [-{ndim}, {ndim})"
             src[i] += ndim
         else:
             assert (
                 axis[0] < ndim
-            ), "'source' must be in the range of [-{0}, {0})".format(ndim)
+            ), f"'source' must be in the range of [-{ndim}, {ndim})"
 
         assert isinstance(
             axis[1], int
@@ -4493,12 +4481,12 @@ def moveaxis(x, source, destination, name=None):
         if axis[1] < 0:
             assert (
                 axis[1] >= -ndim
-            ), "'source' must be in the range of [-{0}, {0})".format(ndim)
+            ), f"'source' must be in the range of [-{ndim}, {ndim})"
             dst[i] += ndim
         else:
             assert (
                 axis[1] < ndim
-            ), "'source' must be in the range of [-{0}, {0})".format(ndim)
+            ), f"'source' must be in the range of [-{ndim}, {ndim})"
         perm[dst[i]] = src[i]
         src_dims.remove(src[i])
         dst_dims.remove(dst[i])
@@ -4541,13 +4529,11 @@ def moveaxis(x, source, destination, name=None):
 def non_negative_axis(arr, axis):
     ndim = len(arr.shape)
     if axis >= 0:
-        assert (
-            axis < ndim
-        ), "'axis'  must be in the range of [-{0}, {0})".format(ndim)
+        assert axis < ndim, f"'axis'  must be in the range of [-{ndim}, {ndim})"
     else:
         assert (
             axis >= -ndim
-        ), "'axis'  must be in the range of [-{0}, {0})".format(ndim)
+        ), f"'axis'  must be in the range of [-{ndim}, {ndim})"
         axis += ndim
 
     return axis
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index ce359b732e2c0..d772c890fbb8c 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -5770,9 +5770,7 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
     """
     if n < 1:
         raise ValueError(
-            "Diff expects input to be at least one-dimensional but got {}".format(
-                n
-            )
+            f"Diff expects input to be at least one-dimensional but got {n}"
         )
 
     def _diff_handler(x, n=1, axis=-1, prepend=None, append=None, name=None):
@@ -6224,17 +6222,13 @@ def take(x, index, mode='raise', name=None):
     """
     if mode not in ['raise', 'wrap', 'clip']:
         raise ValueError(
-            "'mode' in 'take' should be 'raise', 'wrap', 'clip', but received {}.".format(
-                mode
-            )
+            f"'mode' in 'take' should be 'raise', 'wrap', 'clip', but received {mode}."
         )
 
     if in_dynamic_mode():
         if not isinstance(index, (paddle.Tensor, Variable)):
             raise TypeError(
-                "The type of 'index' must be Tensor, but got {}".format(
-                    type(index)
-                )
+                f"The type of 'index' must be Tensor, but got {type(index)}"
             )
         if index.dtype not in [paddle.int32, paddle.int64]:
             raise TypeError(
diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py
index a9b5740b2fc30..6e173545a2767 100644
--- a/python/paddle/tensor/to_string.py
+++ b/python/paddle/tensor/to_string.py
@@ -126,15 +126,15 @@ def _format_item(np_var, max_width=0, signed=False):
         or np_var.dtype == np.float16
     ):
         if DEFAULT_PRINT_OPTIONS.sci_mode:
-            item_str = '{{:.{}e}}'.format(
-                DEFAULT_PRINT_OPTIONS.precision
-            ).format(np_var)
+            item_str = f'{{:.{DEFAULT_PRINT_OPTIONS.precision}e}}'.format(
+                np_var
+            )
         elif np.ceil(np_var) == np_var:
             item_str = f'{np_var:.0f}.'
         else:
-            item_str = '{{:.{}f}}'.format(
-                DEFAULT_PRINT_OPTIONS.precision
-            ).format(np_var)
+            item_str = f'{{:.{DEFAULT_PRINT_OPTIONS.precision}f}}'.format(
+                np_var
+            )
     else:
         item_str = f'{np_var}'
 
diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py
index 4ea6c9ad591d6..ddf69e9fa373b 100644
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -650,9 +650,7 @@ def get_ext_filename(self, fullname):
         if self.no_python_abi_suffix:
             assert (
                 len(name_items) > 2
-            ), "Expected len(name_items) > 2, but received {}".format(
-                len(name_items)
-            )
+            ), f"Expected len(name_items) > 2, but received {len(name_items)}"
             name_items.pop(-2)
             ext_name = split_str.join(name_items)
 
@@ -909,9 +907,7 @@ def load(
         extra_cuda_cflags = []
     assert isinstance(
         extra_cxx_cflags, list
-    ), "Required type(extra_cxx_cflags) == list[str], but received {}".format(
-        extra_cxx_cflags
-    )
+    ), f"Required type(extra_cxx_cflags) == list[str], but received {extra_cxx_cflags}"
     assert isinstance(
         extra_cuda_cflags, list
     ), "Required type(extra_cuda_cflags) == list[str], but received {}".format(
diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py
index df2fd45b345ea..cb50f73d8d9b5 100644
--- a/python/paddle/utils/cpp_extension/extension_utils.py
+++ b/python/paddle/utils/cpp_extension/extension_utils.py
@@ -497,8 +497,8 @@ def _reset_so_rpath(so_path):
     if OS_NAME.startswith("darwin"):
         origin_runtime_path = "@loader_path/../libs/"
         rpath = f"@rpath/{_get_core_name()}"
-        cmd = 'install_name_tool -change {} {} {}'.format(
-            origin_runtime_path, rpath, so_path
+        cmd = (
+            f'install_name_tool -change {origin_runtime_path} {rpath} {so_path}'
         )
 
         run_cmd(cmd)
@@ -512,9 +512,9 @@ def _get_include_dirs_when_compiling(compile_dir):
     include_dirs_file = 'includes.txt'
     path = os.path.abspath(compile_dir)
     include_dirs_file = os.path.join(path, include_dirs_file)
-    assert os.path.isfile(include_dirs_file), "File {} does not exist".format(
+    assert os.path.isfile(
         include_dirs_file
-    )
+    ), f"File {include_dirs_file} does not exist"
     with open(include_dirs_file, 'r') as f:
         include_dirs = [line.strip() for line in f.readlines() if line.strip()]
 
@@ -1314,27 +1314,19 @@ def _jit_compile(file_path, verbose=False):
         py_version = subprocess.check_output([interpreter, '-V'])
         py_version = py_version.decode()
         log_v(
-            "Using Python interpreter: {}, version: {}".format(
-                interpreter, py_version.strip()
-            ),
+            f"Using Python interpreter: {interpreter}, version: {py_version.strip()}",
             verbose,
         )
     except Exception:
         _, error, _ = sys.exc_info()
         raise RuntimeError(
-            'Failed to check Python interpreter with `{}`, errors: {}'.format(
-                interpreter, error
-            )
+            f'Failed to check Python interpreter with `{interpreter}`, errors: {error}'
         )
 
     if IS_WINDOWS:
-        compile_cmd = 'cd /d {} && {} {} build'.format(
-            ext_dir, interpreter, setup_file
-        )
+        compile_cmd = f'cd /d {ext_dir} && {interpreter} {setup_file} build'
     else:
-        compile_cmd = 'cd {} && {} {} build'.format(
-            ext_dir, interpreter, setup_file
-        )
+        compile_cmd = f'cd {ext_dir} && {interpreter} {setup_file} build'
 
     print("Compiling user custom op, it will cost a few seconds.....")
     run_cmd(compile_cmd, verbose)
@@ -1437,9 +1429,7 @@ def check_abi_compatibility(compiler, verbose=False):
         # check compiler version failed
         _, error, _ = sys.exc_info()
         warnings.warn(
-            'Failed to check compiler version for {}: {}'.format(
-                compiler, error
-            )
+            f'Failed to check compiler version for {compiler}: {error}'
         )
         return False
 
diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py
index c482484dce6ab..873c6b3a6a9fc 100755
--- a/python/paddle/utils/deprecated.py
+++ b/python/paddle/utils/deprecated.py
@@ -55,7 +55,7 @@ def decorator(func):
         assert isinstance(reason, str), 'type of "reason" must be str.'
         assert isinstance(level, int) and level >= 0 and level < 3, (
             'type of "level" must be int and must be one of 0, 1, 2. But '
-            'received: {}.'.format(level)
+            f'received: {level}.'
         )
 
         _since = since.strip()
@@ -92,9 +92,7 @@ def wrapper(*args, **kwargs):
 
             if level == 2:
                 raise RuntimeError(
-                    'API "{}.{}" has been deprecated.'.format(
-                        func.__module__, func.__name__
-                    )
+                    f'API "{func.__module__}.{func.__name__}" has been deprecated.'
                 )
 
             warningmsg = "\033[93m\nWarning:\n%s \033[0m" % (msg)
diff --git a/python/paddle/utils/dlpack.py b/python/paddle/utils/dlpack.py
index ed196beedb356..3e291b438502f 100644
--- a/python/paddle/utils/dlpack.py
+++ b/python/paddle/utils/dlpack.py
@@ -53,7 +53,7 @@ def to_dlpack(x):
         if not isinstance(x, (paddle.Tensor, paddle.base.core.eager.Tensor)):
             raise TypeError(
                 "The type of 'x' in to_dlpack must be paddle.Tensor,"
-                " but received {}.".format(type(x))
+                f" but received {type(x)}."
             )
 
         return x.value().get_tensor()._to_dlpack()
@@ -94,7 +94,7 @@ def from_dlpack(dlpack):
     if not dlpack_flag:
         raise TypeError(
             "The type of 'dlpack' in from_dlpack must be PyCapsule object,"
-            " but received {}.".format(type(dlpack))
+            f" but received {type(dlpack)}."
         )
 
     if in_dygraph_mode():
diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py
index 95fb9b539a7a5..59efb656f6691 100644
--- a/python/paddle/utils/download.py
+++ b/python/paddle/utils/download.py
@@ -38,9 +38,7 @@ def update(self, n):
             if self.total is None:
                 sys.stderr.write(f"\r{self.n:.1f} bytes")
             else:
-                sys.stderr.write(
-                    "\r{:.1f}%".format(100 * self.n / float(self.total))
-                )
+                sys.stderr.write(f"\r{100 * self.n / float(self.total):.1f}%")
             sys.stderr.flush()
 
         def __enter__(self):
@@ -172,8 +170,8 @@ def _get_download(url, fullname):
         ) as req:
             if req.status_code != 200:
                 raise RuntimeError(
-                    "Downloading from {} failed with code "
-                    "{}!".format(url, req.status_code)
+                    f"Downloading from {url} failed with code "
+                    f"{req.status_code}!"
                 )
 
             tmp_fullname = fullname + "_tmp"
@@ -193,9 +191,7 @@ def _get_download(url, fullname):
 
     except Exception as e:  # requests.exceptions.ConnectionError
         logger.info(
-            "Downloading {} from {} failed with exception {}".format(
-                fname, url, str(e)
-            )
+            f"Downloading {fname} from {url} failed with exception {str(e)}"
         )
         return False
 
@@ -204,9 +200,7 @@ def _wget_download(url, fullname):
     # using wget to download url
     tmp_fullname = fullname + "_tmp"
     # –user-agent
-    command = 'wget -O {} -t {} {}'.format(
-        tmp_fullname, DOWNLOAD_RETRY_LIMIT, url
-    )
+    command = f'wget -O {tmp_fullname} -t {DOWNLOAD_RETRY_LIMIT} {url}'
     subprc = subprocess.Popen(
         command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
     )
@@ -214,9 +208,7 @@ def _wget_download(url, fullname):
 
     if subprc.returncode != 0:
         raise RuntimeError(
-            '{} failed. Please make sure `wget` is installed or {} exists'.format(
-                command, url
-            )
+            f'{command} failed. Please make sure `wget` is installed or {url} exists'
         )
 
     shutil.move(tmp_fullname, fullname)
@@ -240,9 +232,7 @@ def _download(url, path, md5sum=None, method='get'):
     method (str): which download method to use. Support `wget` and `get`. Default is `get`.
 
     """
-    assert method in _download_methods, 'make sure `{}` implemented'.format(
-        method
-    )
+    assert method in _download_methods, f'make sure `{method}` implemented'
 
     if not osp.exists(path):
         os.makedirs(path)
@@ -258,7 +248,7 @@ def _download(url, path, md5sum=None, method='get'):
             retry_cnt += 1
         else:
             raise RuntimeError(
-                "Download from {} failed. " "Retry limit reached".format(url)
+                f"Download from {url} failed. " "Retry limit reached"
             )
 
         if not _download_methods[method](url, fullname):
@@ -281,8 +271,8 @@ def _md5check(fullname, md5sum=None):
 
     if calc_md5sum != md5sum:
         logger.info(
-            "File {} md5 check failed, {}(calc) != "
-            "{}(base)".format(fullname, calc_md5sum, md5sum)
+            f"File {fullname} md5 check failed, {calc_md5sum}(calc) != "
+            f"{md5sum}(base)"
         )
         return False
     return True
diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py
index 184b49c05d2c4..4974eddbfa26c 100644
--- a/python/paddle/utils/install_check.py
+++ b/python/paddle/utils/install_check.py
@@ -60,7 +60,7 @@ def _is_cuda_available():
         logging.warning(
             "You are using GPU version PaddlePaddle, but there is no GPU "
             "detected on your machine. Maybe CUDA devices is not set properly."
-            "\n Original Error is {}".format(e)
+            f"\n Original Error is {e}"
         )
         return False
 
@@ -76,7 +76,7 @@ def _is_xpu_available():
         logging.warning(
             "You are using XPU version PaddlePaddle, but there is no XPU "
             "detected on your machine. Maybe XPU devices is not set properly."
-            "\n Original Error is {}".format(e)
+            f"\n Original Error is {e}"
         )
         return False
 
@@ -281,11 +281,7 @@ def run_check():
 
                 os.environ['PADDLE_DISTRI_BACKEND'] = "xccl"
             _run_parallel(device_list)
-            print(
-                "PaddlePaddle works well on {} {}s.".format(
-                    device_count, device_str
-                )
-            )
+            print(f"PaddlePaddle works well on {device_count} {device_str}s.")
         print(
             "PaddlePaddle is installed successfully! Let's start deep learning with PaddlePaddle now."
         )
@@ -302,9 +298,7 @@ def run_check():
 
         logging.warning(f"\n Original Error is: {e}")
         print(
-            "PaddlePaddle is installed successfully ONLY for single {}! "
-            "Let's start deep learning with PaddlePaddle now.".format(
-                device_str
-            )
+            f"PaddlePaddle is installed successfully ONLY for single {device_str}! "
+            "Let's start deep learning with PaddlePaddle now."
         )
         raise e
diff --git a/python/paddle/utils/layers_utils.py b/python/paddle/utils/layers_utils.py
index 4b8da4ee84249..e90273f3e82ad 100644
--- a/python/paddle/utils/layers_utils.py
+++ b/python/paddle/utils/layers_utils.py
@@ -289,7 +289,7 @@ def _recursive_assert_same_structure(nest1, nest2, check_types):
     if is_sequence_nest1 != is_sequence(nest2):
         raise ValueError(
             "The two structures don't have the same nested structure.\n\n"
-            "First structure: {}\n\nSecond structure: {}.".format(nest1, nest2)
+            f"First structure: {nest1}\n\nSecond structure: {nest2}."
         )
     if not is_sequence_nest1:
         return  # finished checking
diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py
index a787f1479a28a..b8f40a232d70a 100644
--- a/python/paddle/vision/datasets/cifar.py
+++ b/python/paddle/vision/datasets/cifar.py
@@ -121,9 +121,7 @@ def __init__(
             backend = paddle.vision.get_image_backend()
         if backend not in ['pil', 'cv2']:
             raise ValueError(
-                "Expected backend are one of ['pil', 'cv2'], but got {}".format(
-                    backend
-                )
+                f"Expected backend are one of ['pil', 'cv2'], but got {backend}"
             )
         self.backend = backend
 
diff --git a/python/paddle/vision/datasets/flowers.py b/python/paddle/vision/datasets/flowers.py
index 94ff24850a8ef..07926d3b9a10b 100644
--- a/python/paddle/vision/datasets/flowers.py
+++ b/python/paddle/vision/datasets/flowers.py
@@ -125,9 +125,7 @@ def __init__(
             backend = paddle.vision.get_image_backend()
         if backend not in ['pil', 'cv2']:
             raise ValueError(
-                "Expected backend are one of ['pil', 'cv2'], but got {}".format(
-                    backend
-                )
+                f"Expected backend are one of ['pil', 'cv2'], but got {backend}"
             )
         self.backend = backend
 
diff --git a/python/paddle/vision/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py
index bfd20b66c8df9..c44d86ac771d6 100644
--- a/python/paddle/vision/datasets/mnist.py
+++ b/python/paddle/vision/datasets/mnist.py
@@ -119,9 +119,7 @@ def __init__(
             backend = paddle.vision.get_image_backend()
         if backend not in ['pil', 'cv2']:
             raise ValueError(
-                "Expected backend are one of ['pil', 'cv2'], but got {}".format(
-                    backend
-                )
+                f"Expected backend are one of ['pil', 'cv2'], but got {backend}"
             )
         self.backend = backend
 
diff --git a/python/paddle/vision/datasets/voc2012.py b/python/paddle/vision/datasets/voc2012.py
index 18be80408368f..f421137c7e980 100644
--- a/python/paddle/vision/datasets/voc2012.py
+++ b/python/paddle/vision/datasets/voc2012.py
@@ -122,9 +122,7 @@ def __init__(
             backend = paddle.vision.get_image_backend()
         if backend not in ['pil', 'cv2']:
             raise ValueError(
-                "Expected backend are one of ['pil', 'cv2'], but got {}".format(
-                    backend
-                )
+                f"Expected backend are one of ['pil', 'cv2'], but got {backend}"
             )
         self.backend = backend
 
diff --git a/python/paddle/vision/image.py b/python/paddle/vision/image.py
index e8dfd1d801355..fd46ba2953216 100644
--- a/python/paddle/vision/image.py
+++ b/python/paddle/vision/image.py
@@ -83,9 +83,7 @@ def set_image_backend(backend):
     global _image_backend
     if backend not in ['pil', 'cv2', 'tensor']:
         raise ValueError(
-            "Expected backend are one of ['pil', 'cv2', 'tensor'], but got {}".format(
-                backend
-            )
+            f"Expected backend are one of ['pil', 'cv2', 'tensor'], but got {backend}"
         )
     _image_backend = backend
 
@@ -156,9 +154,7 @@ def image_load(path, backend=None):
         backend = _image_backend
     if backend not in ['pil', 'cv2', 'tensor']:
         raise ValueError(
-            "Expected backend are one of ['pil', 'cv2', 'tensor'], but got {}".format(
-                backend
-            )
+            f"Expected backend are one of ['pil', 'cv2', 'tensor'], but got {backend}"
         )
 
     if backend == 'pil':
diff --git a/python/paddle/vision/models/densenet.py b/python/paddle/vision/models/densenet.py
index 90346d4ff7493..f3332fabe3f7e 100644
--- a/python/paddle/vision/models/densenet.py
+++ b/python/paddle/vision/models/densenet.py
@@ -245,9 +245,7 @@ def __init__(
         supported_layers = [121, 161, 169, 201, 264]
         assert (
             layers in supported_layers
-        ), "supported layers are {} but input layer is {}".format(
-            supported_layers, layers
-        )
+        ), f"supported layers are {supported_layers} but input layer is {layers}"
         densenet_spec = {
             121: (64, 32, [6, 12, 24, 16]),
             161: (96, 48, [6, 12, 36, 24]),
diff --git a/python/paddle/vision/models/mobilenetv3.py b/python/paddle/vision/models/mobilenetv3.py
index a35058c9243f0..8c392ee5a696b 100644
--- a/python/paddle/vision/models/mobilenetv3.py
+++ b/python/paddle/vision/models/mobilenetv3.py
@@ -102,9 +102,7 @@ def __init__(
             self.activation_layer = nn.Hardswish
         else:
             raise RuntimeError(
-                "The activation function is not supported: {}".format(
-                    activation
-                )
+                f"The activation function is not supported: {activation}"
             )
         self.stride = stride
 
diff --git a/python/paddle/vision/models/squeezenet.py b/python/paddle/vision/models/squeezenet.py
index 9133a19993421..5e18a0b6ec459 100644
--- a/python/paddle/vision/models/squeezenet.py
+++ b/python/paddle/vision/models/squeezenet.py
@@ -115,9 +115,7 @@ def __init__(self, version, num_classes=1000, with_pool=True):
         supported_versions = ['1.0', '1.1']
         assert (
             version in supported_versions
-        ), "supported versions are {} but input version is {}".format(
-            supported_versions, version
-        )
+        ), f"supported versions are {supported_versions} but input version is {version}"
 
         if self.version == "1.0":
             self._conv = Conv2D(
diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py
index c22ff07e922ed..06cd33419b65b 100644
--- a/python/paddle/vision/transforms/functional_tensor.py
+++ b/python/paddle/vision/transforms/functional_tensor.py
@@ -803,9 +803,7 @@ def resize(img, size, interpolation='bilinear', data_format='CHW'):
         # We should consider to support this case in future.
         if w <= 0 or h <= 0:
             raise NotImplementedError(
-                "Not support while w<=0 or h<=0, but received w={}, h={}".format(
-                    w, h
-                )
+                f"Not support while w<=0 or h<=0, but received w={w}, h={h}"
             )
         if (w <= h and w == size) or (h <= w and h == size):
             return img
diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py
index eebccb4c8decf..23f4645f9e2ed 100644
--- a/python/paddle/vision/transforms/transforms.py
+++ b/python/paddle/vision/transforms/transforms.py
@@ -53,9 +53,7 @@ def _check_input(
     if isinstance(value, numbers.Number):
         if value < 0:
             raise ValueError(
-                "If {} is a single number, it must be non negative.".format(
-                    name
-                )
+                f"If {name} is a single number, it must be non negative."
             )
         value = [center - value, center + value]
         if clip_first_on_zero:
@@ -65,9 +63,7 @@ def _check_input(
             raise ValueError(f"{name} values should be between {bound}")
     else:
         raise TypeError(
-            "{} should be a single number or a list/tuple with lenght 2.".format(
-                name
-            )
+            f"{name} should be a single number or a list/tuple with lenght 2."
         )
 
     if value[0] == value[1] == center:
@@ -113,8 +109,8 @@ def __call__(self, data):
             except Exception as e:
                 stack_info = traceback.format_exc()
                 print(
-                    "fail to perform transform [{}] with error: "
-                    "{} and stack:\n{}".format(f, e, str(stack_info))
+                    f"fail to perform transform [{f}] with error: "
+                    f"{e} and stack:\n{str(stack_info)}"
                 )
                 raise e
         return data
diff --git a/test/book/test_word2vec_book.py b/test/book/test_word2vec_book.py
index bfa4a05b5e160..3c8a879bbc74b 100644
--- a/test/book/test_word2vec_book.py
+++ b/test/book/test_word2vec_book.py
@@ -35,9 +35,7 @@ def get_place(target):
         return base.CPUPlace()
     else:
         raise ValueError(
-            "Target `{}` is not on the support list: `cuda`, `xpu` and `cpu`.".format(
-                target
-            )
+            f"Target `{target}` is not on the support list: `cuda`, `xpu` and `cpu`."
         )
 
 
diff --git a/test/cinn/op_mappers/op_mapper_test.py b/test/cinn/op_mappers/op_mapper_test.py
index 584c26488484a..d77a1b4dc7bf0 100644
--- a/test/cinn/op_mappers/op_mapper_test.py
+++ b/test/cinn/op_mappers/op_mapper_test.py
@@ -166,16 +166,12 @@ def __check_valid(self):
             self.assertEqual(
                 var.shape,
                 self.feed_data[name].shape,
-                msg="The shape of input {} in feed_data is error".format(
-                    var.name
-                ),
+                msg=f"The shape of input {var.name} in feed_data is error",
             )
             self.assertEqual(
                 self.paddleddtype2nptype(var.dtype),
                 str(self.feed_data[name].dtype),
-                msg="The dtype of input {} in feed_data is error".format(
-                    var.name
-                ),
+                msg=f"The dtype of input {var.name} in feed_data is error",
             )
 
         for out_name, in_name in self.inplace_outputs.items():
diff --git a/test/cinn/passes/pass_test.py b/test/cinn/passes/pass_test.py
index 099d5b2a07fec..b8a64ce00963d 100644
--- a/test/cinn/passes/pass_test.py
+++ b/test/cinn/passes/pass_test.py
@@ -64,9 +64,7 @@ def get_pass_outputs(self, passes):
             self.assertIn(
                 var.name(),
                 self.feed_data,
-                msg="Cannot found input data {} in self.feed_data".format(
-                    var.name()
-                ),
+                msg=f"Cannot found input data {var.name()} in self.feed_data",
             )
             feed_list.append(self.feed_data[var.name()])
 
@@ -95,9 +93,7 @@ def check_pass_outputs(
         logger.debug(f"Pass after base pass optimize has {base_pass_size} ops")
         test_pass_size = self.get_pass_size(base_passes + test_passes)
         logger.debug(
-            "Pass after base and test pass optimize has {} ops".format(
-                test_pass_size
-            )
+            f"Pass after base and test pass optimize has {test_pass_size} ops"
         )
         self.assertEqual(
             base_pass_size - test_pass_size,
diff --git a/test/cinn/test_paddle_model_convertor.py b/test/cinn/test_paddle_model_convertor.py
index 5e696785fb50f..b143c2dff1c9e 100644
--- a/test/cinn/test_paddle_model_convertor.py
+++ b/test/cinn/test_paddle_model_convertor.py
@@ -166,9 +166,7 @@ def load_paddle_program(self):
         logger.debug(msg=f"Param List: {self.param_vars.keys()}")
         logger.debug(msg=f"Feed List: {self.feed_names}")
         logger.debug(
-            msg="Fetch List: {}".format(
-                [var.name for var in self.fetch_targets]
-            )
+            msg=f"Fetch List: {[var.name for var in self.fetch_targets]}"
         )
 
         self.feed_shapes = []
diff --git a/test/collective/fleet/parallel_dygraph_se_resnext.py b/test/collective/fleet/parallel_dygraph_se_resnext.py
index e9ee2407a3346..eaf8505360fb2 100644
--- a/test/collective/fleet/parallel_dygraph_se_resnext.py
+++ b/test/collective/fleet/parallel_dygraph_se_resnext.py
@@ -215,9 +215,7 @@ def __init__(self, layers=50, class_dim=102):
         supported_layers = [50, 101, 152]
         assert (
             layers in supported_layers
-        ), "supported layers are {} but input layer is {}".format(
-            supported_layers, layers
-        )
+        ), f"supported layers are {supported_layers} but input layer is {layers}"
 
         if layers == 50:
             cardinality = 32
diff --git a/test/collective/fleet/test_parallel_dygraph_pp_adaptor.py b/test/collective/fleet/test_parallel_dygraph_pp_adaptor.py
index 4551887370b3b..bee93e11115d3 100644
--- a/test/collective/fleet/test_parallel_dygraph_pp_adaptor.py
+++ b/test/collective/fleet/test_parallel_dygraph_pp_adaptor.py
@@ -64,14 +64,10 @@ def check_converted_model(converted_model_dir, expected_model_dir):
             # expected model, which does not hinder model recovering
             for i in range(p_config1.pp):
                 sub_converted_model_dir = (
-                    "{}/mp_00_sharding_00_pp_{:0>2d}".format(
-                        converted_model_dir, i
-                    )
+                    f"{converted_model_dir}/mp_00_sharding_00_pp_{i:0>2d}"
                 )
                 sub_expected_model_dir = (
-                    "{}/mp_00_sharding_00_pp_{:0>2d}".format(
-                        expected_model_dir, i
-                    )
+                    f"{expected_model_dir}/mp_00_sharding_00_pp_{i:0>2d}"
                 )
                 print(
                     f"converted_model_dir: {sub_converted_model_dir}; expected_model_dir: {sub_expected_model_dir}"
diff --git a/test/contrib/test_multi_precision_fp16_train.py b/test/contrib/test_multi_precision_fp16_train.py
index 137f2269173f3..b85f1547cfe6e 100644
--- a/test/contrib/test_multi_precision_fp16_train.py
+++ b/test/contrib/test_multi_precision_fp16_train.py
@@ -217,22 +217,14 @@ def do_test(use_nesterov=False, optimizer=""):
             else:
                 suffix = "with Nesterov" if use_nesterov else "without Nesterov"
             with self.scope_prog_guard():
-                print(
-                    "-----------------FP16 Train {}-----------------".format(
-                        suffix
-                    )
-                )
+                print(f"-----------------FP16 Train {suffix}-----------------")
                 train_loss_fp16, test_loss_fp16 = train(
                     use_pure_fp16=True,
                     use_nesterov=use_nesterov,
                     optimizer=optimizer,
                 )
             with self.scope_prog_guard():
-                print(
-                    "-----------------FP32 Train {}-----------------".format(
-                        suffix
-                    )
-                )
+                print(f"-----------------FP32 Train {suffix}-----------------")
                 train_loss_fp32, test_loss_fp32 = train(
                     use_pure_fp16=False,
                     use_nesterov=use_nesterov,
diff --git a/test/cpp/inference/api/full_ILSVRC2012_val_preprocess.py b/test/cpp/inference/api/full_ILSVRC2012_val_preprocess.py
index f2dec13f8a3d5..3ebe610ea0a0f 100644
--- a/test/cpp/inference/api/full_ILSVRC2012_val_preprocess.py
+++ b/test/cpp/inference/api/full_ILSVRC2012_val_preprocess.py
@@ -174,9 +174,7 @@ def run_convert():
             retry = retry + 1
         else:
             raise RuntimeError(
-                "Can not convert the dataset to binary file with try limit {}".format(
-                    try_limit
-                )
+                f"Can not convert the dataset to binary file with try limit {try_limit}"
             )
         download_concat(cache_folder, zip_path)
         convert_Imagenet_tar2bin(zip_path, output_file)
diff --git a/test/cpp_extension/test_cpp_extension_setup.py b/test/cpp_extension/test_cpp_extension_setup.py
index 2de2dd80deac9..fae59ea689993 100644
--- a/test/cpp_extension/test_cpp_extension_setup.py
+++ b/test/cpp_extension/test_cpp_extension_setup.py
@@ -33,9 +33,7 @@ def setUp(self):
         cur_dir = os.path.dirname(os.path.abspath(__file__))
         # install general extension
         # compile, install the custom op egg into site-packages under background
-        cmd = 'cd {} && {} cpp_extension_setup.py install'.format(
-            cur_dir, sys.executable
-        )
+        cmd = f'cd {cur_dir} && {sys.executable} cpp_extension_setup.py install'
         run_cmd(cmd)
 
         site_dir = site.getsitepackages()[0]
diff --git a/test/cpp_extension/test_mixed_extension_setup.py b/test/cpp_extension/test_mixed_extension_setup.py
index 92aceff5067af..26c9dcbed81f7 100644
--- a/test/cpp_extension/test_mixed_extension_setup.py
+++ b/test/cpp_extension/test_mixed_extension_setup.py
@@ -169,9 +169,7 @@ def _test_static(self):
             np.testing.assert_array_equal(
                 out,
                 pd_out,
-                err_msg='custom op out: {},\n paddle api out: {}'.format(
-                    out, pd_out
-                ),
+                err_msg=f'custom op out: {out},\n paddle api out: {pd_out}',
             )
 
     def _test_dynamic(self):
@@ -188,16 +186,12 @@ def _test_dynamic(self):
             np.testing.assert_array_equal(
                 out,
                 pd_out,
-                err_msg='custom op out: {},\n paddle api out: {}'.format(
-                    out, pd_out
-                ),
+                err_msg=f'custom op out: {out},\n paddle api out: {pd_out}',
             )
             np.testing.assert_array_equal(
                 x_grad,
                 pd_x_grad,
-                err_msg='custom op x grad: {},\n paddle api x grad: {}'.format(
-                    x_grad, pd_x_grad
-                ),
+                err_msg=f'custom op x grad: {x_grad},\n paddle api x grad: {pd_x_grad}',
             )
 
     def _test_double_grad_dynamic(self):
@@ -214,9 +208,7 @@ def _test_double_grad_dynamic(self):
             np.testing.assert_array_equal(
                 out,
                 pd_out,
-                err_msg='custom op out: {},\n paddle api out: {}'.format(
-                    out, pd_out
-                ),
+                err_msg=f'custom op out: {out},\n paddle api out: {pd_out}',
             )
             np.testing.assert_array_equal(
                 dx_grad,
diff --git a/test/cpp_extension/utils.py b/test/cpp_extension/utils.py
index 19659c6d5d716..be19ccb518f4a 100644
--- a/test/cpp_extension/utils.py
+++ b/test/cpp_extension/utils.py
@@ -59,9 +59,7 @@ def check_output(out, pd_out, name):
         np.testing.assert_array_equal(
             out,
             pd_out,
-            err_msg='custom op {}: {},\n paddle api {}: {}'.format(
-                name, out, name, pd_out
-            ),
+            err_msg=f'custom op {name}: {out},\n paddle api {name}: {pd_out}',
         )
 
 
@@ -75,7 +73,5 @@ def check_output_allclose(out, pd_out, name, rtol=5e-5, atol=1e-2):
         pd_out,
         rtol,
         atol,
-        err_msg='custom op {}: {},\n paddle api {}: {}'.format(
-            name, out, name, pd_out
-        ),
+        err_msg=f'custom op {name}: {out},\n paddle api {name}: {pd_out}',
     )
diff --git a/test/custom_kernel/test_custom_kernel_dot.py b/test/custom_kernel/test_custom_kernel_dot.py
index 44aa6aec33c05..7059af7f49e3c 100644
--- a/test/custom_kernel/test_custom_kernel_dot.py
+++ b/test/custom_kernel/test_custom_kernel_dot.py
@@ -49,9 +49,7 @@ def test_custom_kernel_dot_run(self):
         np.testing.assert_array_equal(
             out.numpy(),
             result,
-            err_msg='custom kernel dot out: {},\n numpy dot out: {}'.format(
-                out.numpy(), result
-            ),
+            err_msg=f'custom kernel dot out: {out.numpy()},\n numpy dot out: {result}',
         )
 
 
@@ -82,9 +80,7 @@ def test_custom_kernel_dot_run(self):
         np.testing.assert_array_equal(
             out.numpy(),
             result,
-            err_msg='custom kernel dot out: {},\n numpy dot out: {}'.format(
-                out.numpy(), result
-            ),
+            err_msg=f'custom kernel dot out: {out.numpy()},\n numpy dot out: {result}',
         )
 
 
diff --git a/test/custom_kernel/test_custom_kernel_load.py b/test/custom_kernel/test_custom_kernel_load.py
index f790ae9e3ed9d..a480567c5edcb 100644
--- a/test/custom_kernel/test_custom_kernel_load.py
+++ b/test/custom_kernel/test_custom_kernel_load.py
@@ -54,9 +54,7 @@ def setUp(self):
             [paddle_lib_path, '..', '..', 'paddle_custom_device']
         )
         # copy so to default path
-        cmd = 'mkdir -p {} && cp ./*.so {}'.format(
-            self.default_path, self.default_path
-        )
+        cmd = f'mkdir -p {self.default_path} && cp ./*.so {self.default_path}'
         os.system(cmd)  # wait
 
     def test_custom_kernel_dot_load(self):
@@ -75,9 +73,7 @@ def test_custom_kernel_dot_load(self):
         np.testing.assert_array_equal(
             out.numpy(),
             result,
-            err_msg='custom kernel dot out: {},\n numpy dot out: {}'.format(
-                out.numpy(), result
-            ),
+            err_msg=f'custom kernel dot out: {out.numpy()},\n numpy dot out: {result}',
         )
 
     def tearDown(self):
diff --git a/test/custom_op/test_context_pool.py b/test/custom_op/test_context_pool.py
index b8ccebc1106b4..19ac0ed49a4d3 100644
--- a/test/custom_op/test_context_pool.py
+++ b/test/custom_op/test_context_pool.py
@@ -24,9 +24,7 @@
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
-file = '{}\\context_pool_jit\\context_pool_jit.pyd'.format(
-    get_build_directory()
-)
+file = f'{get_build_directory()}\\context_pool_jit\\context_pool_jit.pyd'
 if os.name == 'nt' and os.path.isfile(file):
     cmd = f'del {file}'
     run_cmd(cmd, True)
diff --git a/test/custom_op/test_custom_attrs_jit.py b/test/custom_op/test_custom_attrs_jit.py
index 676e81c49be27..25632b8e765d0 100644
--- a/test/custom_op/test_custom_attrs_jit.py
+++ b/test/custom_op/test_custom_attrs_jit.py
@@ -24,9 +24,7 @@
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
-file = '{}\\custom_attrs_jit\\custom_attrs_jit.pyd'.format(
-    get_build_directory()
-)
+file = f'{get_build_directory()}\\custom_attrs_jit\\custom_attrs_jit.pyd'
 if os.name == 'nt' and os.path.isfile(file):
     cmd = f'del {file}'
     run_cmd(cmd, True)
diff --git a/test/custom_op/test_custom_cast_op_jit.py b/test/custom_op/test_custom_cast_op_jit.py
index 24c344c8ad985..8e8fe12203044 100644
--- a/test/custom_op/test_custom_cast_op_jit.py
+++ b/test/custom_op/test_custom_cast_op_jit.py
@@ -30,9 +30,7 @@
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
-file = '{}\\custom_cast_module_jit\\custom_cast_module_jit.pyd'.format(
-    get_build_directory()
-)
+file = f'{get_build_directory()}\\custom_cast_module_jit\\custom_cast_module_jit.pyd'
 if os.name == 'nt' and os.path.isfile(file):
     cmd = f'del {file}'
     run_cmd(cmd, True)
diff --git a/test/custom_op/test_custom_concat.py b/test/custom_op/test_custom_concat.py
index 4fa1bcb618db8..153ca92a46def 100644
--- a/test/custom_op/test_custom_concat.py
+++ b/test/custom_op/test_custom_concat.py
@@ -116,9 +116,7 @@ def check_output(self, out, pd_out, name):
         np.testing.assert_array_equal(
             out,
             pd_out,
-            err_msg='custom op {}: {},\n paddle api {}: {}'.format(
-                name, out, name, pd_out
-            ),
+            err_msg=f'custom op {name}: {out},\n paddle api {name}: {pd_out}',
         )
 
     def test_dynamic(self):
diff --git a/test/custom_op/test_custom_relu_op_jit.py b/test/custom_op/test_custom_relu_op_jit.py
index ecf9c6dfbc87f..95e4fab22b9c9 100644
--- a/test/custom_op/test_custom_relu_op_jit.py
+++ b/test/custom_op/test_custom_relu_op_jit.py
@@ -31,9 +31,7 @@
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
-file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format(
-    get_build_directory()
-)
+file = f'{get_build_directory()}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'
 if os.name == 'nt' and os.path.isfile(file):
     cmd = f'del {file}'
     run_cmd(cmd, True)
@@ -86,9 +84,7 @@ def test_static(self):
                     np.testing.assert_array_equal(
                         out,
                         pd_out,
-                        err_msg='custom op out: {},\n paddle api out: {}'.format(
-                            out, pd_out
-                        ),
+                        err_msg=f'custom op out: {out},\n paddle api out: {pd_out}',
                     )
 
     def test_dynamic(self):
@@ -107,9 +103,7 @@ def test_dynamic(self):
                     np.testing.assert_array_equal(
                         out,
                         pd_out,
-                        err_msg='custom op out: {},\n paddle api out: {}'.format(
-                            out, pd_out
-                        ),
+                        err_msg=f'custom op out: {out},\n paddle api out: {pd_out}',
                     )
                     np.testing.assert_array_equal(
                         x_grad,
diff --git a/test/custom_op/test_custom_relu_op_setup.py b/test/custom_op/test_custom_relu_op_setup.py
index 0887f0268f959..eaa6cdc1a8c34 100644
--- a/test/custom_op/test_custom_relu_op_setup.py
+++ b/test/custom_op/test_custom_relu_op_setup.py
@@ -147,12 +147,10 @@ def setUp(self):
         cur_dir = os.path.dirname(os.path.abspath(__file__))
         # compile, install the custom op egg into site-packages under background
         if os.name == 'nt':
-            cmd = 'cd /d {} && python custom_relu_setup.py install'.format(
-                cur_dir
-            )
+            cmd = f'cd /d {cur_dir} && python custom_relu_setup.py install'
         else:
-            cmd = 'cd {} && {} custom_relu_setup.py install'.format(
-                cur_dir, sys.executable
+            cmd = (
+                f'cd {cur_dir} && {sys.executable} custom_relu_setup.py install'
             )
         run_cmd(cmd)
 
diff --git a/test/custom_op/test_custom_relu_op_xpu_setup.py b/test/custom_op/test_custom_relu_op_xpu_setup.py
index 967b976533ebf..bf14ed6844b1b 100644
--- a/test/custom_op/test_custom_relu_op_xpu_setup.py
+++ b/test/custom_op/test_custom_relu_op_xpu_setup.py
@@ -64,8 +64,8 @@ def custom_relu_static(
 class TestNewCustomOpXpuSetUpInstall(unittest.TestCase):
     def setUp(self):
         cur_dir = os.path.dirname(os.path.abspath(__file__))
-        cmd = 'cd {} && {} custom_relu_xpu_setup.py install'.format(
-            cur_dir, sys.executable
+        cmd = (
+            f'cd {cur_dir} && {sys.executable} custom_relu_xpu_setup.py install'
         )
         run_cmd(cmd)
 
diff --git a/test/custom_op/test_custom_simple_slice.py b/test/custom_op/test_custom_simple_slice.py
index e2662e70f3bc6..166108f4fe63d 100644
--- a/test/custom_op/test_custom_simple_slice.py
+++ b/test/custom_op/test_custom_simple_slice.py
@@ -24,9 +24,7 @@
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
-file = '{}\\custom_simple_slice\\custom_simple_slice.pyd'.format(
-    get_build_directory()
-)
+file = f'{get_build_directory()}\\custom_simple_slice\\custom_simple_slice.pyd'
 if os.name == 'nt' and os.path.isfile(file):
     cmd = f'del {file}'
     run_cmd(cmd, True)
diff --git a/test/custom_op/test_custom_tensor_operator.py b/test/custom_op/test_custom_tensor_operator.py
index f6edbd934171d..8460bd2dba95a 100644
--- a/test/custom_op/test_custom_tensor_operator.py
+++ b/test/custom_op/test_custom_tensor_operator.py
@@ -30,9 +30,7 @@
 
 # Because Windows don't use docker, the shared lib already exists in the
 # cache dir, it will not be compiled again unless the shared lib is removed.
-file = '{}\\custom_tensor_operator\\custom_tensor_operator.pyd'.format(
-    get_build_directory()
-)
+file = f'{get_build_directory()}\\custom_tensor_operator\\custom_tensor_operator.pyd'
 if os.name == 'nt' and os.path.isfile(file):
     cmd = f'del {file}'
     run_cmd(cmd, True)
diff --git a/test/custom_op/utils.py b/test/custom_op/utils.py
index d65a0f2175f6e..c6928a0024bb8 100644
--- a/test/custom_op/utils.py
+++ b/test/custom_op/utils.py
@@ -61,9 +61,7 @@ def check_output(out, pd_out, name):
         np.testing.assert_array_equal(
             out,
             pd_out,
-            err_msg='custom op {}: {},\n paddle api {}: {}'.format(
-                name, out, name, pd_out
-            ),
+            err_msg=f'custom op {name}: {out},\n paddle api {name}: {pd_out}',
         )
 
 
@@ -77,7 +75,5 @@ def check_output_allclose(out, pd_out, name, rtol=5e-5, atol=1e-2):
         pd_out,
         rtol,
         atol,
-        err_msg='custom op {}: {},\n paddle api {}: {}'.format(
-            name, out, name, pd_out
-        ),
+        err_msg=f'custom op {name}: {out},\n paddle api {name}: {pd_out}',
     )
diff --git a/test/custom_runtime/test_collective_process_group_xccl.py b/test/custom_runtime/test_collective_process_group_xccl.py
index 8b80b3b361f1f..4d007d726273b 100644
--- a/test/custom_runtime/test_collective_process_group_xccl.py
+++ b/test/custom_runtime/test_collective_process_group_xccl.py
@@ -167,9 +167,7 @@ def setUp(self):
         # only valid in current process
         os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
             cur_dir,
-            '{}/PaddleCustomDevice/backends/custom_cpu/build'.format(
-                self.temp_dir.name
-            ),
+            f'{self.temp_dir.name}/PaddleCustomDevice/backends/custom_cpu/build',
         )
         os.environ['FLAGS_selected_custom_cpus'] = '0,1'
         os.environ['CUSTOM_CPU_VISIBLE_DEVICES'] = '0,1'
diff --git a/test/custom_runtime/test_custom_cpu_plugin.py b/test/custom_runtime/test_custom_cpu_plugin.py
index 5dd375514057e..b92df8def9dd3 100755
--- a/test/custom_runtime/test_custom_cpu_plugin.py
+++ b/test/custom_runtime/test_custom_cpu_plugin.py
@@ -43,9 +43,7 @@ def setUp(self):
         # only valid in current process
         os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
             cur_dir,
-            '{}/PaddleCustomDevice/backends/custom_cpu/build'.format(
-                self.temp_dir.name
-            ),
+            f'{self.temp_dir.name}/PaddleCustomDevice/backends/custom_cpu/build',
         )
 
     def tearDown(self):
diff --git a/test/custom_runtime/test_custom_cpu_profiler_plugin.py b/test/custom_runtime/test_custom_cpu_profiler_plugin.py
index 2bb9e278cfb76..220c9a0a21aeb 100644
--- a/test/custom_runtime/test_custom_cpu_profiler_plugin.py
+++ b/test/custom_runtime/test_custom_cpu_profiler_plugin.py
@@ -41,9 +41,7 @@ def setUp(self):
         # only valid in current process
         os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
             cur_dir,
-            '{}/PaddleCustomDevice/backends/custom_cpu/build'.format(
-                self.temp_dir.name
-            ),
+            f'{self.temp_dir.name}/PaddleCustomDevice/backends/custom_cpu/build',
         )
 
     def tearDown(self):
diff --git a/test/custom_runtime/test_custom_cpu_to_static.py b/test/custom_runtime/test_custom_cpu_to_static.py
index c546382eb2385..60ba27004afbd 100644
--- a/test/custom_runtime/test_custom_cpu_to_static.py
+++ b/test/custom_runtime/test_custom_cpu_to_static.py
@@ -123,9 +123,7 @@ def setUp(self):
         # only valid in current process
         os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
             cur_dir,
-            '{}/PaddleCustomDevice/backends/custom_cpu/build'.format(
-                self.temp_dir.name
-            ),
+            f'{self.temp_dir.name}/PaddleCustomDevice/backends/custom_cpu/build',
         )
 
     def tearDown(self):
diff --git a/test/custom_runtime/test_custom_op_setup.py b/test/custom_runtime/test_custom_op_setup.py
index f0f5d1cb505a6..47c7d9821d6b8 100644
--- a/test/custom_runtime/test_custom_op_setup.py
+++ b/test/custom_runtime/test_custom_op_setup.py
@@ -123,9 +123,7 @@ def setUp(self):
         # only valid in current process
         os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
             self.cur_dir,
-            '{}/PaddleCustomDevice/backends/custom_cpu/build'.format(
-                self.temp_dir.name
-            ),
+            f'{self.temp_dir.name}/PaddleCustomDevice/backends/custom_cpu/build',
         )
 
         # `import paddle` loads custom_cpu.so, hence we must import paddle after finishing build PaddleCustomDevice
@@ -185,9 +183,7 @@ def _test_static(self):
             np.testing.assert_array_equal(
                 out,
                 pd_out,
-                err_msg="custom op out: {},\n paddle api out: {}".format(
-                    out, pd_out
-                ),
+                err_msg=f"custom op out: {out},\n paddle api out: {pd_out}",
             )
 
     def _test_dynamic(self):
@@ -202,16 +198,12 @@ def _test_dynamic(self):
             np.testing.assert_array_equal(
                 out,
                 pd_out,
-                err_msg="custom op out: {},\n paddle api out: {}".format(
-                    out, pd_out
-                ),
+                err_msg=f"custom op out: {out},\n paddle api out: {pd_out}",
             )
             np.testing.assert_array_equal(
                 x_grad,
                 pd_x_grad,
-                err_msg="custom op x grad: {},\n paddle api x grad: {}".format(
-                    x_grad, pd_x_grad
-                ),
+                err_msg=f"custom op x grad: {x_grad},\n paddle api x grad: {pd_x_grad}",
             )
 
     def _test_double_grad_dynamic(self):
@@ -226,9 +218,7 @@ def _test_double_grad_dynamic(self):
             np.testing.assert_array_equal(
                 out,
                 pd_out,
-                err_msg="custom op out: {},\n paddle api out: {}".format(
-                    out, pd_out
-                ),
+                err_msg=f"custom op out: {out},\n paddle api out: {pd_out}",
             )
             np.testing.assert_array_equal(
                 dx_grad,
@@ -264,9 +254,7 @@ def _test_with_dataloader(self):
             np.testing.assert_array_equal(
                 out,
                 pd_out,
-                err_msg="custom op out: {},\n paddle api out: {}".format(
-                    out, pd_out
-                ),
+                err_msg=f"custom op out: {out},\n paddle api out: {pd_out}",
             )
 
             if batch_id == 5:
diff --git a/test/distributed_passes/dist_pass_test_base.py b/test/distributed_passes/dist_pass_test_base.py
index dc8da03bd6a4f..72bc7ca78d9de 100644
--- a/test/distributed_passes/dist_pass_test_base.py
+++ b/test/distributed_passes/dist_pass_test_base.py
@@ -287,9 +287,7 @@ def apply_passes(self, main_prog, startup_prog):
         self.assertEqual(
             len(passes),
             len(new_passes),
-            "After solving conflicts, the left passes are: {}".format(
-                auto_pass_manager.names
-            ),
+            f"After solving conflicts, the left passes are: {auto_pass_manager.names}",
         )
 
         for i, (p1, p2) in enumerate(zip(passes, new_passes)):
diff --git a/test/dygraph_to_static/test_break_continue.py b/test/dygraph_to_static/test_break_continue.py
index 6c6547d579577..d3a2162dc787e 100644
--- a/test/dygraph_to_static/test_break_continue.py
+++ b/test/dygraph_to_static/test_break_continue.py
@@ -235,9 +235,7 @@ def test_transformed_static_result(self):
             dygraph_res,
             static_res,
             rtol=1e-05,
-            err_msg='dygraph res is {}\nstatic_res is {}'.format(
-                dygraph_res, static_res
-            ),
+            err_msg=f'dygraph res is {dygraph_res}\nstatic_res is {static_res}',
         )
 
 
diff --git a/test/dygraph_to_static/test_build_strategy.py b/test/dygraph_to_static/test_build_strategy.py
index 92968fabf28f5..83ed8d56751dd 100644
--- a/test/dygraph_to_static/test_build_strategy.py
+++ b/test/dygraph_to_static/test_build_strategy.py
@@ -53,17 +53,13 @@ def verify_predict(self):
             dy_jit_pre,
             st_pre,
             rtol=1e-05,
-            err_msg='dy_jit_pre:\n {}\n, st_pre: \n{}.'.format(
-                dy_jit_pre, st_pre
-            ),
+            err_msg=f'dy_jit_pre:\n {dy_jit_pre}\n, st_pre: \n{st_pre}.',
         )
         np.testing.assert_allclose(
             predictor_pre,
             st_pre,
             rtol=1e-05,
-            err_msg='predictor_pre:\n {}\n, st_pre: \n{}.'.format(
-                predictor_pre, st_pre
-            ),
+            err_msg=f'predictor_pre:\n {predictor_pre}\n, st_pre: \n{st_pre}.',
         )
 
     @ast_only_test
@@ -74,9 +70,7 @@ def test_resnet(self):
             static_loss,
             dygraph_loss,
             rtol=1e-05,
-            err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                static_loss, dygraph_loss
-            ),
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
         )
         self.verify_predict()
 
diff --git a/test/dygraph_to_static/test_cache_program.py b/test/dygraph_to_static/test_cache_program.py
index 2b8a88245de87..0602b15b3054b 100644
--- a/test/dygraph_to_static/test_cache_program.py
+++ b/test/dygraph_to_static/test_cache_program.py
@@ -115,9 +115,7 @@ def test_with_optimizer(self):
             dygraph_loss,
             static_loss,
             rtol=1e-05,
-            err_msg='dygraph is {}\n static_res is \n{}'.format(
-                dygraph_loss, static_loss
-            ),
+            err_msg=f'dygraph is {dygraph_loss}\n static_res is \n{static_loss}',
         )
 
 
diff --git a/test/dygraph_to_static/test_cast.py b/test/dygraph_to_static/test_cast.py
index 156d25d747137..7e2b0914a5fff 100644
--- a/test/dygraph_to_static/test_cast.py
+++ b/test/dygraph_to_static/test_cast.py
@@ -103,9 +103,7 @@ def test_cast_result(self):
             res,
             ref_val,
             rtol=1e-05,
-            err_msg='The casted value is {}.\nThe correct value is {}.'.format(
-                res, ref_val
-            ),
+            err_msg=f'The casted value is {res}.\nThe correct value is {ref_val}.',
         )
 
 
@@ -176,9 +174,7 @@ def test_cast_result(self):
             res,
             ref_val,
             rtol=1e-05,
-            err_msg='The casted value is {}.\nThe correct value is {}.'.format(
-                res, ref_val
-            ),
+            err_msg=f'The casted value is {res}.\nThe correct value is {ref_val}.',
         )
 
 
@@ -198,9 +194,7 @@ def test_cast_result(self):
         ref_val = int(self.input)
         self.assertTrue(
             res == ref_val,
-            msg='The casted value is {}.\nThe correct value is {}.'.format(
-                res, ref_val
-            ),
+            msg=f'The casted value is {res}.\nThe correct value is {ref_val}.',
         )
 
 
diff --git a/test/dygraph_to_static/test_container.py b/test/dygraph_to_static/test_container.py
index 95dc0214e9786..412362ba725c5 100644
--- a/test/dygraph_to_static/test_container.py
+++ b/test/dygraph_to_static/test_container.py
@@ -114,9 +114,7 @@ def test_train(self):
             dy_out,
             st_out,
             rtol=1e-05,
-            err_msg='dygraph_res is {}\nstatic_res is {}'.format(
-                dy_out, st_out
-            ),
+            err_msg=f'dygraph_res is {dy_out}\nstatic_res is {st_out}',
         )
 
     def _test_load(self, net, x):
diff --git a/test/dygraph_to_static/test_convert_call.py b/test/dygraph_to_static/test_convert_call.py
index 79f23351cb6dd..fb6c69fc899fa 100644
--- a/test/dygraph_to_static/test_convert_call.py
+++ b/test/dygraph_to_static/test_convert_call.py
@@ -109,9 +109,7 @@ def test_transformed_static_result(self):
             dygraph_res,
             static_res,
             rtol=1e-05,
-            err_msg='dygraph res is {}\nstatic_res is {}'.format(
-                dygraph_res, static_res
-            ),
+            err_msg=f'dygraph res is {dygraph_res}\nstatic_res is {static_res}',
         )
 
 
diff --git a/test/dygraph_to_static/test_dict.py b/test/dygraph_to_static/test_dict.py
index ac92feef1d140..80180b522cf54 100644
--- a/test/dygraph_to_static/test_dict.py
+++ b/test/dygraph_to_static/test_dict.py
@@ -203,9 +203,7 @@ def test_transformed_result(self):
             dygraph_res,
             static_res,
             rtol=1e-05,
-            err_msg='dygraph result is {}\nstatic result is {}'.format(
-                dygraph_res, static_res
-            ),
+            err_msg=f'dygraph result is {dygraph_res}\nstatic result is {static_res}',
         )
 
 
@@ -247,9 +245,7 @@ def test_ast_to_func(self):
 
         self.assertTrue(
             (dygraph_result == static_result).all(),
-            msg="dygraph result: {}\nstatic result: {}".format(
-                dygraph_result, static_result
-            ),
+            msg=f"dygraph result: {dygraph_result}\nstatic result: {static_result}",
         )
 
 
diff --git a/test/dygraph_to_static/test_error.py b/test/dygraph_to_static/test_error.py
index 762859d2d38f1..8c6f74d75c4e0 100644
--- a/test/dygraph_to_static/test_error.py
+++ b/test/dygraph_to_static/test_error.py
@@ -257,9 +257,7 @@ def set_exception_type(self):
 
     def set_message(self):
         self.expected_message = [
-            'File "{}", line 37, in func_error_in_compile_time'.format(
-                self.filepath
-            ),
+            f'File "{self.filepath}", line 37, in func_error_in_compile_time',
             'inner_func()',
             f'File "{self.filepath}", line 30, in inner_func',
             'def inner_func():',
@@ -288,9 +286,7 @@ def set_exception_type(self):
 
     def set_message(self):
         self.expected_message = [
-            'File "{}", line 48, in func_error_in_compile_time_2'.format(
-                self.filepath
-            ),
+            f'File "{self.filepath}", line 48, in func_error_in_compile_time_2',
             'def func_error_in_compile_time_2(x):',
             'x = base.dygraph.to_variable(x)',
             'x = paddle.reshape(x, shape=[1, 2])',
@@ -338,9 +334,7 @@ def set_exception_type(self):
 
     def set_message(self):
         self.expected_message = [
-            'File "{}", line 56, in func_error_in_runtime'.format(
-                self.filepath
-            ),
+            f'File "{self.filepath}", line 56, in func_error_in_runtime',
             'x = base.dygraph.to_variable(x)',
             'two = paddle.tensor.fill_constant(shape=[1], value=2, dtype="int32")',
             'x = paddle.reshape(x, shape=[1, two])',
diff --git a/test/dygraph_to_static/test_fetch_feed.py b/test/dygraph_to_static/test_fetch_feed.py
index 5ddc1f3da24ee..0834f2ec4a315 100644
--- a/test/dygraph_to_static/test_fetch_feed.py
+++ b/test/dygraph_to_static/test_fetch_feed.py
@@ -94,9 +94,7 @@ def test_declarative(self):
             dygraph_res,
             static_res,
             rtol=1e-05,
-            err_msg='dygraph_res is {}\n static_res is \n{}'.format(
-                dygraph_res, static_res
-            ),
+            err_msg=f'dygraph_res is {dygraph_res}\n static_res is \n{static_res}',
         )
 
 
diff --git a/test/dygraph_to_static/test_lac.py b/test/dygraph_to_static/test_lac.py
index a650a25c25b07..522eb81cf5a7a 100644
--- a/test/dygraph_to_static/test_lac.py
+++ b/test/dygraph_to_static/test_lac.py
@@ -624,9 +624,7 @@ def test_train(self):
             dy_out,
             st_out,
             rtol=1e-05,
-            err_msg='dygraph output:\n{},\nstatic output:\n {}.'.format(
-                dy_out, st_out
-            ),
+            err_msg=f'dygraph output:\n{dy_out},\nstatic output:\n {st_out}.',
         )
         # Prediction needs trained models, so put `test_predict` at last of `test_train`
         # self.verify_predict()
diff --git a/test/dygraph_to_static/test_layer_hook.py b/test/dygraph_to_static/test_layer_hook.py
index 4d3fb03229859..bf679cf8dcc2e 100644
--- a/test/dygraph_to_static/test_layer_hook.py
+++ b/test/dygraph_to_static/test_layer_hook.py
@@ -93,9 +93,7 @@ def test_hook(self):
             st_out,
             dy_out,
             rtol=1e-05,
-            err_msg='dygraph_res is {}\nstatic_res is {}'.format(
-                dy_out, st_out
-            ),
+            err_msg=f'dygraph_res is {dy_out}\nstatic_res is {st_out}',
         )
         np.testing.assert_allclose(
             st_out,
diff --git a/test/dygraph_to_static/test_list.py b/test/dygraph_to_static/test_list.py
index b0febb2b0c9ee..9ad646de8818c 100644
--- a/test/dygraph_to_static/test_list.py
+++ b/test/dygraph_to_static/test_list.py
@@ -261,9 +261,7 @@ def test_transformed_static_result(self):
                     stat_res,
                     dy_res,
                     rtol=1e-05,
-                    err_msg='dygraph_res is {}\nstatic_res is {}'.format(
-                        dy_res, stat_res
-                    ),
+                    err_msg=f'dygraph_res is {dy_res}\nstatic_res is {stat_res}',
                 )
 
 
diff --git a/test/dygraph_to_static/test_logical.py b/test/dygraph_to_static/test_logical.py
index 3b00903bc478c..9e0f1d12bd9b4 100644
--- a/test/dygraph_to_static/test_logical.py
+++ b/test/dygraph_to_static/test_logical.py
@@ -206,9 +206,7 @@ def test_transformed_result(self):
             dygraph_res,
             static_res,
             rtol=1e-05,
-            err_msg='dygraph result is {}\nstatic_result is {}'.format(
-                dygraph_res, static_res
-            ),
+            err_msg=f'dygraph result is {dygraph_res}\nstatic_result is {static_res}',
         )
 
 
@@ -223,9 +221,7 @@ def test_transformed_result(self):
             dygraph_res,
             static_res,
             rtol=1e-05,
-            err_msg='dygraph result is {}\nstatic_result is {}'.format(
-                dygraph_res, static_res
-            ),
+            err_msg=f'dygraph result is {dygraph_res}\nstatic_result is {static_res}',
         )
 
 
diff --git a/test/dygraph_to_static/test_lstm.py b/test/dygraph_to_static/test_lstm.py
index 4dc5b5a0fba75..2e26a37705631 100644
--- a/test/dygraph_to_static/test_lstm.py
+++ b/test/dygraph_to_static/test_lstm.py
@@ -103,9 +103,7 @@ def test_save_in_eval(self, with_training=True):
             dygraph_out.numpy(),
             static_out.numpy(),
             rtol=1e-05,
-            err_msg='dygraph_out is {}\n static_out is \n{}'.format(
-                dygraph_out, static_out
-            ),
+            err_msg=f'dygraph_out is {dygraph_out}\n static_out is \n{static_out}',
         )
         # switch back into train mode.
         net.train()
@@ -114,9 +112,7 @@ def test_save_in_eval(self, with_training=True):
             dygraph_out.numpy(),
             train_out.numpy(),
             rtol=1e-05,
-            err_msg='dygraph_out is {}\n static_out is \n{}'.format(
-                dygraph_out, train_out
-            ),
+            err_msg=f'dygraph_out is {dygraph_out}\n static_out is \n{train_out}',
         )
 
     def test_save_without_training(self):
@@ -176,9 +172,7 @@ def test_save_in_eval(self):
             eval_out.numpy(),
             infer_out.numpy(),
             rtol=1e-05,
-            err_msg='eval_out is {}\n infer_out is \n{}'.format(
-                eval_out, infer_out
-            ),
+            err_msg=f'eval_out is {eval_out}\n infer_out is \n{infer_out}',
         )
 
 
diff --git a/test/dygraph_to_static/test_mnist.py b/test/dygraph_to_static/test_mnist.py
index d8f22e8bd1b17..9641a9225cee7 100644
--- a/test/dygraph_to_static/test_mnist.py
+++ b/test/dygraph_to_static/test_mnist.py
@@ -168,9 +168,7 @@ def test_mnist_to_static(self):
             dygraph_loss,
             static_loss,
             rtol=1e-05,
-            err_msg='dygraph is {}\n static_res is \n{}'.format(
-                dygraph_loss, static_loss
-            ),
+            err_msg=f'dygraph is {dygraph_loss}\n static_res is \n{static_loss}',
         )
 
     def test_mnist_declarative_cpu_vs_mkldnn(self):
diff --git a/test/dygraph_to_static/test_mnist_amp.py b/test/dygraph_to_static/test_mnist_amp.py
index e5e11062aad7b..3e4b9d1b11657 100644
--- a/test/dygraph_to_static/test_mnist_amp.py
+++ b/test/dygraph_to_static/test_mnist_amp.py
@@ -45,9 +45,7 @@ def test_mnist_to_static(self):
             static_loss,
             rtol=1e-05,
             atol=0.001,
-            err_msg='dygraph is {}\n static_res is \n{}'.format(
-                dygraph_loss, static_loss
-            ),
+            err_msg=f'dygraph is {dygraph_loss}\n static_res is \n{static_loss}',
         )
 
     def train(self, to_static=False):
diff --git a/test/dygraph_to_static/test_mnist_pure_fp16.py b/test/dygraph_to_static/test_mnist_pure_fp16.py
index 9d5ae58edbbd7..c1489cc6e9158 100644
--- a/test/dygraph_to_static/test_mnist_pure_fp16.py
+++ b/test/dygraph_to_static/test_mnist_pure_fp16.py
@@ -43,9 +43,7 @@ def test_mnist_to_static(self):
                 static_loss,
                 rtol=1e-05,
                 atol=0.001,
-                err_msg='dygraph is {}\n static_res is \n{}'.format(
-                    dygraph_loss, static_loss
-                ),
+                err_msg=f'dygraph is {dygraph_loss}\n static_res is \n{static_loss}',
             )
 
     def train(self, to_static=False):
diff --git a/test/dygraph_to_static/test_mobile_net.py b/test/dygraph_to_static/test_mobile_net.py
index 607d7644763de..5536a14e695c4 100644
--- a/test/dygraph_to_static/test_mobile_net.py
+++ b/test/dygraph_to_static/test_mobile_net.py
@@ -716,18 +716,14 @@ def assert_same_predict(self, model_name):
             dy_jit_pre,
             st_pre,
             rtol=1e-05,
-            err_msg='dy_jit_pre:\n {}\n, st_pre: \n{}.'.format(
-                dy_jit_pre, st_pre
-            ),
+            err_msg=f'dy_jit_pre:\n {dy_jit_pre}\n, st_pre: \n{st_pre}.',
         )
         np.testing.assert_allclose(
             predictor_pre,
             st_pre,
             rtol=1e-05,
             atol=1e-05,
-            err_msg='inference_pred_res:\n {}\n, st_pre: \n{}.'.format(
-                predictor_pre, st_pre
-            ),
+            err_msg=f'inference_pred_res:\n {predictor_pre}\n, st_pre: \n{st_pre}.',
         )
 
     @test_with_new_ir
diff --git a/test/dygraph_to_static/test_pylayer.py b/test/dygraph_to_static/test_pylayer.py
index 8191760c72a3f..88558e3d628fb 100644
--- a/test/dygraph_to_static/test_pylayer.py
+++ b/test/dygraph_to_static/test_pylayer.py
@@ -32,9 +32,7 @@ def compare_result(dygraph_res, static_res, rtol=1e-5, atol=0):
         static_res.detach().numpy(),
         rtol=rtol,
         atol=atol,
-        err_msg='dygraph result is {}\nstatic_result is {}'.format(
-            dygraph_res, static_res
-        ),
+        err_msg=f'dygraph result is {dygraph_res}\nstatic_result is {static_res}',
     )
 
 
diff --git a/test/dygraph_to_static/test_resnet.py b/test/dygraph_to_static/test_resnet.py
index 3d4585117c977..a99999c4e7447 100644
--- a/test/dygraph_to_static/test_resnet.py
+++ b/test/dygraph_to_static/test_resnet.py
@@ -143,9 +143,7 @@ def __init__(self, layers=50, class_dim=102):
         supported_layers = [50, 101, 152]
         assert (
             layers in supported_layers
-        ), "supported layers are {} but input layer is {}".format(
-            supported_layers, layers
-        )
+        ), f"supported layers are {supported_layers} but input layer is {layers}"
 
         if layers == 50:
             depth = [3, 4, 6, 3]
@@ -412,17 +410,13 @@ def verify_predict(self):
             dy_jit_pre,
             st_pre,
             rtol=1e-05,
-            err_msg='dy_jit_pre:\n {}\n, st_pre: \n{}.'.format(
-                dy_jit_pre, st_pre
-            ),
+            err_msg=f'dy_jit_pre:\n {dy_jit_pre}\n, st_pre: \n{st_pre}.',
         )
         np.testing.assert_allclose(
             predictor_pre,
             st_pre,
             rtol=1e-05,
-            err_msg='predictor_pre:\n {}\n, st_pre: \n{}.'.format(
-                predictor_pre, st_pre
-            ),
+            err_msg=f'predictor_pre:\n {predictor_pre}\n, st_pre: \n{st_pre}.',
         )
 
     @test_with_new_ir
@@ -433,9 +427,7 @@ def test_resnet_new_ir(self):
             static_loss,
             dygraph_loss,
             rtol=1e-05,
-            err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                static_loss, dygraph_loss
-            ),
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
         )
 
     def test_resnet(self):
@@ -445,9 +437,7 @@ def test_resnet(self):
             static_loss,
             dygraph_loss,
             rtol=1e-05,
-            err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                static_loss, dygraph_loss
-            ),
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
         )
         self.verify_predict()
 
@@ -460,9 +450,7 @@ def test_resnet_composite_forward_backward(self):
             static_loss,
             dygraph_loss,
             rtol=1e-02,
-            err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                static_loss, dygraph_loss
-            ),
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
         )
 
     def test_in_static_mode_mkldnn(self):
diff --git a/test/dygraph_to_static/test_resnet_amp.py b/test/dygraph_to_static/test_resnet_amp.py
index 40c60520fbcad..60a30db707be4 100644
--- a/test/dygraph_to_static/test_resnet_amp.py
+++ b/test/dygraph_to_static/test_resnet_amp.py
@@ -124,9 +124,7 @@ def test_resnet(self):
             static_loss,
             dygraph_loss,
             rtol=1e-05,
-            err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                static_loss, dygraph_loss
-            ),
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
         )
 
     def test_resnet_composite(self):
@@ -138,9 +136,7 @@ def test_resnet_composite(self):
             static_loss,
             dygraph_loss,
             rtol=1e-05,
-            err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                static_loss, dygraph_loss
-            ),
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
         )
 
 
diff --git a/test/dygraph_to_static/test_resnet_pure_fp16.py b/test/dygraph_to_static/test_resnet_pure_fp16.py
index c878372fbd406..1eb6a8ac9b3a5 100644
--- a/test/dygraph_to_static/test_resnet_pure_fp16.py
+++ b/test/dygraph_to_static/test_resnet_pure_fp16.py
@@ -132,9 +132,7 @@ def test_resnet(self):
                 dygraph_loss,
                 rtol=1e-05,
                 atol=0.001,
-                err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                    static_loss, dygraph_loss
-                ),
+                err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
             )
 
     def test_resnet_composite(self):
@@ -149,9 +147,7 @@ def test_resnet_composite(self):
                 dygraph_loss,
                 rtol=1e-05,
                 atol=0.001,
-                err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                    static_loss, dygraph_loss
-                ),
+                err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
             )
 
 
diff --git a/test/dygraph_to_static/test_resnet_v2.py b/test/dygraph_to_static/test_resnet_v2.py
index 2b4e9676c5f36..cf941effd2c28 100644
--- a/test/dygraph_to_static/test_resnet_v2.py
+++ b/test/dygraph_to_static/test_resnet_v2.py
@@ -148,9 +148,7 @@ def __init__(self, layers=50, class_dim=102):
         supported_layers = [50, 101, 152]
         assert (
             layers in supported_layers
-        ), "supported layers are {} but input layer is {}".format(
-            supported_layers, layers
-        )
+        ), f"supported layers are {supported_layers} but input layer is {layers}"
 
         if layers == 50:
             depth = [3, 4, 6, 3]
@@ -419,17 +417,13 @@ def verify_predict(self):
             dy_jit_pre,
             st_pre,
             rtol=1e-05,
-            err_msg='dy_jit_pre:\n {}\n, st_pre: \n{}.'.format(
-                dy_jit_pre, st_pre
-            ),
+            err_msg=f'dy_jit_pre:\n {dy_jit_pre}\n, st_pre: \n{st_pre}.',
         )
         np.testing.assert_allclose(
             predictor_pre,
             st_pre,
             rtol=1e-05,
-            err_msg='predictor_pre:\n {}\n, st_pre: \n{}.'.format(
-                predictor_pre, st_pre
-            ),
+            err_msg=f'predictor_pre:\n {predictor_pre}\n, st_pre: \n{st_pre}.',
         )
 
     @test_with_new_ir
@@ -440,9 +434,7 @@ def test_resnet_new_ir(self):
             static_loss,
             dygraph_loss,
             rtol=1e-05,
-            err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                static_loss, dygraph_loss
-            ),
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
         )
 
     def test_resnet(self):
@@ -452,9 +444,7 @@ def test_resnet(self):
             static_loss,
             dygraph_loss,
             rtol=1e-05,
-            err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                static_loss, dygraph_loss
-            ),
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
         )
         self.verify_predict()
 
@@ -468,9 +458,7 @@ def test_resnet_composite(self):
             static_loss,
             dygraph_loss,
             rtol=1e-05,
-            err_msg='static_loss: {} \n dygraph_loss: {}'.format(
-                static_loss, dygraph_loss
-            ),
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
         )
 
     def test_in_static_mode_mkldnn(self):
diff --git a/test/dygraph_to_static/test_se_resnet.py b/test/dygraph_to_static/test_se_resnet.py
index aef9b3a2f0b6a..c12990b53659d 100644
--- a/test/dygraph_to_static/test_se_resnet.py
+++ b/test/dygraph_to_static/test_se_resnet.py
@@ -224,9 +224,7 @@ def __init__(self, layers=50, class_dim=102):
         supported_layers = [50, 101, 152]
         assert (
             layers in supported_layers
-        ), "supported layers are {} but input layer is {}".format(
-            supported_layers, layers
-        )
+        ), f"supported layers are {supported_layers} but input layer is {layers}"
 
         if layers == 50:
             cardinality = 32
@@ -542,9 +540,7 @@ def verify_predict(self):
             dy_jit_pre,
             st_pre,
             rtol=1e-05,
-            err_msg='dy_jit_pre:\n {}\n, st_pre: \n{}.'.format(
-                dy_jit_pre, st_pre
-            ),
+            err_msg=f'dy_jit_pre:\n {dy_jit_pre}\n, st_pre: \n{st_pre}.',
         )
 
         flat_st_pre = st_pre.flatten()
diff --git a/test/dygraph_to_static/test_seq2seq.py b/test/dygraph_to_static/test_seq2seq.py
index fee69b74bfdfe..85de170c3f06c 100644
--- a/test/dygraph_to_static/test_seq2seq.py
+++ b/test/dygraph_to_static/test_seq2seq.py
@@ -211,9 +211,7 @@ def _test_train(self, attn_model=False):
         result = np.allclose(dygraph_loss, static_loss)
         self.assertTrue(
             result,
-            msg="\ndygraph_loss = {} \nstatic_loss = {}".format(
-                dygraph_loss, static_loss
-            ),
+            msg=f"\ndygraph_loss = {dygraph_loss} \nstatic_loss = {static_loss}",
         )
 
     def _test_predict(self, attn_model=False):
@@ -222,9 +220,7 @@ def _test_predict(self, attn_model=False):
         result = np.allclose(pred_static, pred_dygraph)
         self.assertTrue(
             result,
-            msg="\npred_dygraph = {} \npred_static = {}".format(
-                pred_dygraph, pred_static
-            ),
+            msg=f"\npred_dygraph = {pred_dygraph} \npred_static = {pred_static}",
         )
 
     def test_base_model(self):
diff --git a/test/dygraph_to_static/yolov3.py b/test/dygraph_to_static/yolov3.py
index f72ee8bb8f8ff..8712a49b44a99 100644
--- a/test/dygraph_to_static/yolov3.py
+++ b/test/dygraph_to_static/yolov3.py
@@ -133,9 +133,7 @@ class YoloDetectionBlock(paddle.nn.Layer):
     def __init__(self, ch_in, channel, is_test=True):
         super().__init__()
 
-        assert channel % 2 == 0, "channel {} cannot be divided by 2".format(
-            channel
-        )
+        assert channel % 2 == 0, f"channel {channel} cannot be divided by 2"
 
         self.conv0 = ConvBNLayer(
             ch_in=ch_in,
diff --git a/test/fft/spectral_op_np.py b/test/fft/spectral_op_np.py
index fadc3349213b9..361cd04ddac8c 100644
--- a/test/fft/spectral_op_np.py
+++ b/test/fft/spectral_op_np.py
@@ -35,9 +35,7 @@ def _get_norm_mode(norm, forward):
 
 
 def _get_inv_norm(n, norm_mode):
-    assert isinstance(norm_mode, NormMode), "invalid norm_type {}".format(
-        norm_mode
-    )
+    assert isinstance(norm_mode, NormMode), f"invalid norm_type {norm_mode}"
     if norm_mode == NormMode.none:
         return 1.0
     if norm_mode == NormMode.by_sqrt_n:
diff --git a/test/ir/inference/test_trt_convert_multiclass_nms.py b/test/ir/inference/test_trt_convert_multiclass_nms.py
index 0033bf8aa4bdd..578a3f0c74ced 100644
--- a/test/ir/inference/test_trt_convert_multiclass_nms.py
+++ b/test/ir/inference/test_trt_convert_multiclass_nms.py
@@ -217,9 +217,7 @@ def assert_tensors_near(
                 arr,
                 rtol=rtol,
                 atol=atol,
-                err_msg='Output has diff, Maximum absolute error: {}'.format(
-                    np.amax(diff)
-                ),
+                err_msg=f'Output has diff, Maximum absolute error: {np.amax(diff)}',
             )
 
     def assert_op_size(self, trt_engine_num, paddle_op_num):
diff --git a/test/ir/inference/test_trt_convert_multiclass_nms3.py b/test/ir/inference/test_trt_convert_multiclass_nms3.py
index 60f2b0a68a41a..f221e10f5339f 100644
--- a/test/ir/inference/test_trt_convert_multiclass_nms3.py
+++ b/test/ir/inference/test_trt_convert_multiclass_nms3.py
@@ -226,9 +226,7 @@ def assert_tensors_near(
                 arr,
                 rtol=rtol,
                 atol=atol,
-                err_msg='Output has diff, Maximum absolute error: {}'.format(
-                    np.amax(diff)
-                ),
+                err_msg=f'Output has diff, Maximum absolute error: {np.amax(diff)}',
             )
 
     def assert_op_size(self, trt_engine_num, paddle_op_num):
diff --git a/test/ir/inference/test_trt_pool3d_op.py b/test/ir/inference/test_trt_pool3d_op.py
index 66a05775b071d..462d481cd7d66 100644
--- a/test/ir/inference/test_trt_pool3d_op.py
+++ b/test/ir/inference/test_trt_pool3d_op.py
@@ -147,9 +147,7 @@ def test(self):
         ):
             is_dynamic = True if dynamic_shape_options is not None else False
             with self.subTest(
-                'Precision: {}, Serialize: {}, Dynamic: {}'.format(
-                    precision, serialize, is_dynamic
-                )
+                f'Precision: {precision}, Serialize: {serialize}, Dynamic: {is_dynamic}'
             ):
                 self.precision = precision
                 self.serialize = serialize
@@ -258,9 +256,7 @@ def test(self):
         ):
             is_dynamic = True if dynamic_shape_options is not None else False
             with self.subTest(
-                'Precision: {}, Serialize: {}, Dynamic: {}'.format(
-                    precision, serialize, is_dynamic
-                )
+                f'Precision: {precision}, Serialize: {serialize}, Dynamic: {is_dynamic}'
             ):
                 self.precision = precision
                 self.serialize = serialize
@@ -359,9 +355,7 @@ def test(self):
         ):
             is_dynamic = True if dynamic_shape_options is not None else False
             with self.subTest(
-                'Precision: {}, Serialize: {}, Dynamic: {}'.format(
-                    precision, serialize, is_dynamic
-                )
+                f'Precision: {precision}, Serialize: {serialize}, Dynamic: {is_dynamic}'
             ):
                 self.precision = precision
                 self.serialize = serialize
diff --git a/test/ir/inference/test_trt_pool_op.py b/test/ir/inference/test_trt_pool_op.py
index 37ffe6452e0f5..0515eef7150fb 100644
--- a/test/ir/inference/test_trt_pool_op.py
+++ b/test/ir/inference/test_trt_pool_op.py
@@ -132,9 +132,7 @@ def test(self):
         ):
             is_dynamic = True if dynamic_shape_options is not None else False
             with self.subTest(
-                'Precision: {}, Serialize: {}, Dynamic: {}'.format(
-                    precision, serialize, is_dynamic
-                )
+                f'Precision: {precision}, Serialize: {serialize}, Dynamic: {is_dynamic}'
             ):
                 self.precision = precision
                 self.serialize = serialize
diff --git a/test/legacy_test/auto_parallel_autoconvert.py b/test/legacy_test/auto_parallel_autoconvert.py
index 5e7b501ed623b..6b41ee9bab180 100644
--- a/test/legacy_test/auto_parallel_autoconvert.py
+++ b/test/legacy_test/auto_parallel_autoconvert.py
@@ -151,14 +151,10 @@ def setUp(self):
 
     def tearDown(self):
         os.remove(
-            "./model_state_rank{}.pdmodel".format(
-                str(paddle.distributed.get_rank())
-            )
+            f"./model_state_rank{str(paddle.distributed.get_rank())}.pdmodel"
         )
         os.remove(
-            "./dist_attr_rank{}.pdattr".format(
-                str(paddle.distributed.get_rank())
-            )
+            f"./dist_attr_rank{str(paddle.distributed.get_rank())}.pdattr"
         )
 
     def test_mlp_mp2pp(self):
@@ -250,14 +246,10 @@ def setUp(self):
 
     def tearDown(self):
         os.remove(
-            "./model_state_rank{}.pdmodel".format(
-                str(paddle.distributed.get_rank())
-            )
+            f"./model_state_rank{str(paddle.distributed.get_rank())}.pdmodel"
         )
         os.remove(
-            "./dist_attr_rank{}.pdattr".format(
-                str(paddle.distributed.get_rank())
-            )
+            f"./dist_attr_rank{str(paddle.distributed.get_rank())}.pdattr"
         )
 
     def test_mlp_pp2mp(self):
diff --git a/test/legacy_test/benchmark.py b/test/legacy_test/benchmark.py
index bc3f2ae7810fb..53964eb6a7b19 100644
--- a/test/legacy_test/benchmark.py
+++ b/test/legacy_test/benchmark.py
@@ -86,9 +86,7 @@ def timeit_output(self, iters=100):
             elapses.append(self.timeit_output_with_place(place, iters))
         for place, elapse in zip(places, elapses):
             print(
-                "One pass of ({}_op) at {} cost {}".format(
-                    self.op_type, str(place), elapse
-                )
+                f"One pass of ({self.op_type}_op) at {str(place)} cost {elapse}"
             )
 
     def timeit_grad_with_place(self, place, iters=100):
@@ -110,7 +108,5 @@ def timeit_grad(self, iters=100):
             elapses.append(self.timeit_grad_with_place(place, iters))
         for place, elapse in zip(places, elapses):
             print(
-                "One pass of ({}_grad_op) at {} cost {}".format(
-                    self.op_type, str(place), elapse
-                )
+                f"One pass of ({self.op_type}_grad_op) at {str(place)} cost {elapse}"
             )
diff --git a/test/legacy_test/dist_fleet_ctr.py b/test/legacy_test/dist_fleet_ctr.py
index 64c4f69a55654..0d57a7c73e13d 100644
--- a/test/legacy_test/dist_fleet_ctr.py
+++ b/test/legacy_test/dist_fleet_ctr.py
@@ -200,9 +200,7 @@ def do_distributed_testing(self, fleet):
                     fetch_list=[self.avg_cost.name],
                 )
                 loss_val = np.mean(loss_val)
-                message = "TEST ---> batch_idx: {} loss: {}\n".format(
-                    batch_idx, loss_val
-                )
+                message = f"TEST ---> batch_idx: {batch_idx} loss: {loss_val}\n"
                 fleet.util.print_on_rank(message, 0)
         except base.core.EOFException:
             self.test_reader.reset()
@@ -240,9 +238,7 @@ def do_pyreader_training(self, fleet):
                     #       np.array(loss_val), mode="sum")
                     #   loss_all_trainer = fleet.util.all_gather(float(loss_val))
                     #   loss_val = float(reduce_output) / len(loss_all_trainer)
-                    message = "TRAIN ---> pass: {} loss: {}\n".format(
-                        epoch_id, loss_val
-                    )
+                    message = f"TRAIN ---> pass: {epoch_id} loss: {loss_val}\n"
                     fleet.util.print_on_rank(message, 0)
 
                 pass_time = time.time() - pass_start
diff --git a/test/legacy_test/dist_fleet_ctr_ps_gpu.py b/test/legacy_test/dist_fleet_ctr_ps_gpu.py
index bf109e6a61306..093a93e9a2c85 100644
--- a/test/legacy_test/dist_fleet_ctr_ps_gpu.py
+++ b/test/legacy_test/dist_fleet_ctr_ps_gpu.py
@@ -80,9 +80,7 @@ def do_pyreader_training(self, fleet):
                     )
                     loss_all_trainer = fleet.util.all_gather(float(loss_val))
                     loss_val = float(reduce_output) / len(loss_all_trainer)
-                    message = "TRAIN ---> pass: {} loss: {}\n".format(
-                        epoch_id, loss_val
-                    )
+                    message = f"TRAIN ---> pass: {epoch_id} loss: {loss_val}\n"
                     fleet.util.print_on_rank(message, 0)
 
                 pass_time = time.time() - pass_start
diff --git a/test/legacy_test/dist_fleet_simnet_bow.py b/test/legacy_test/dist_fleet_simnet_bow.py
index ffbe371cc228a..0d6827b7d339e 100644
--- a/test/legacy_test/dist_fleet_simnet_bow.py
+++ b/test/legacy_test/dist_fleet_simnet_bow.py
@@ -263,9 +263,7 @@ def do_pyreader_training(self, fleet):
                         fetch_list=[self.avg_cost.name],
                     )
                     loss_val = np.mean(loss_val)
-                    message = "TRAIN ---> pass: {} loss: {}\n".format(
-                        epoch_id, loss_val
-                    )
+                    message = f"TRAIN ---> pass: {epoch_id} loss: {loss_val}\n"
                     fleet.util.print_on_rank(message, 0)
 
                 pass_time = time.time() - pass_start
diff --git a/test/legacy_test/dist_fleet_sparse_embedding_ctr.py b/test/legacy_test/dist_fleet_sparse_embedding_ctr.py
index 120b7e51305d7..e5991b22d8a77 100644
--- a/test/legacy_test/dist_fleet_sparse_embedding_ctr.py
+++ b/test/legacy_test/dist_fleet_sparse_embedding_ctr.py
@@ -180,11 +180,7 @@ def do_pyreader_training(self, fleet):
                         fetch_list=[self.avg_cost.name],
                     )
                     loss_val = np.mean(loss_val)
-                    print(
-                        "TRAIN ---> pass: {} loss: {}\n".format(
-                            epoch_id, loss_val
-                        )
-                    )
+                    print(f"TRAIN ---> pass: {epoch_id} loss: {loss_val}\n")
             except base.core.EOFException:
                 self.reader.reset()
 
diff --git a/test/legacy_test/dist_fleet_sync_batch_norm.py b/test/legacy_test/dist_fleet_sync_batch_norm.py
index 86e8e921adf10..c01267800893a 100644
--- a/test/legacy_test/dist_fleet_sync_batch_norm.py
+++ b/test/legacy_test/dist_fleet_sync_batch_norm.py
@@ -94,9 +94,7 @@ def train(args):
     rank = paddle.distributed.get_rank()
     filepath = os.path.join(
         args.data_dir,
-        'input_{}_{}_{}_{}.npy'.format(
-            rank, args.only_forward, str(args.dtype), args.layout
-        ),
+        f'input_{rank}_{args.only_forward}_{str(args.dtype)}_{args.layout}.npy',
     )
     data = np.load(filepath)
 
@@ -110,9 +108,7 @@ def train(args):
     for i in range(0, len(sync_bn_fetches)):
         file_path = os.path.join(
             args.data_dir,
-            'output_{}_{}_{}_{}.npy'.format(
-                rank, args.only_forward, str(args.dtype), i
-            ),
+            f'output_{rank}_{args.only_forward}_{str(args.dtype)}_{i}.npy',
         )
         np.save(file_path, sync_bn_fetches[i])
 
diff --git a/test/legacy_test/dist_se_resnext.py b/test/legacy_test/dist_se_resnext.py
index 98b6af3af08ee..672cec6075dcd 100644
--- a/test/legacy_test/dist_se_resnext.py
+++ b/test/legacy_test/dist_se_resnext.py
@@ -47,9 +47,7 @@ def net(self, input, class_dim=1000):
         supported_layers = [50, 101, 152]
         assert (
             layers in supported_layers
-        ), "supported layers are {} but input layer is {}".format(
-            supported_layers, layers
-        )
+        ), f"supported layers are {supported_layers} but input layer is {layers}"
         if layers == 50:
             cardinality = 32
             reduction_ratio = 16
diff --git a/test/legacy_test/fleet_meta_optimizer_base.py b/test/legacy_test/fleet_meta_optimizer_base.py
index c0f2e2fc2a32f..11c05d75ce04b 100755
--- a/test/legacy_test/fleet_meta_optimizer_base.py
+++ b/test/legacy_test/fleet_meta_optimizer_base.py
@@ -41,9 +41,7 @@ def debug_program(self, main_prog, startup_prog):
         startup_prog_op_types = [op.type for op in startup_prog_ops]
 
         print(
-            "=== debug program and ops in func [{}] ===".format(
-                inspect.stack()[1].function
-            )
+            f"=== debug program and ops in func [{inspect.stack()[1].function}] ==="
         )
         print(main_prog)
         print(main_prog_op_types)
diff --git a/test/legacy_test/gradient_checker.py b/test/legacy_test/gradient_checker.py
index d146c22f08cf3..67e18075e60a0 100644
--- a/test/legacy_test/gradient_checker.py
+++ b/test/legacy_test/gradient_checker.py
@@ -321,11 +321,9 @@ def fail_test(msg):
         n = numerical[x_idx][y_idx]
         if not np.allclose(a, n, rtol, atol):
             msg = (
-                'Jacobian mismatch for output {} '
-                'with respect to input {} on {},\n'
-                'numerical:{}\nanalytical:{}\n'.format(
-                    y[y_idx].name, x[x_idx].name, str(place), n, a
-                )
+                f'Jacobian mismatch for output {y[y_idx].name} '
+                f'with respect to input {x[x_idx].name} on {str(place)},\n'
+                f'numerical:{n}\nanalytical:{a}\n'
             )
             return fail_test(msg)
     return True
diff --git a/test/legacy_test/test_chunk_eval_op.py b/test/legacy_test/test_chunk_eval_op.py
index b059c04f8e0f4..b9db50079b4b3 100644
--- a/test/legacy_test/test_chunk_eval_op.py
+++ b/test/legacy_test/test_chunk_eval_op.py
@@ -25,11 +25,7 @@ def __init__(self, chunk_type, start_idx, end_idx):
         self.end_idx = end_idx
 
     def __str__(self):
-        return '(Segment: {}, {}, {})'.format(
-            self.chunk_type,
-            self.start_idx,
-            self.end_idx,
-        )
+        return f'(Segment: {self.chunk_type}, {self.start_idx}, {self.end_idx})'
 
     __repr__ = __str__
 
diff --git a/test/legacy_test/test_detach.py b/test/legacy_test/test_detach.py
index 5bb336866733a..53c252055bc68 100644
--- a/test/legacy_test/test_detach.py
+++ b/test/legacy_test/test_detach.py
@@ -214,9 +214,7 @@ def test_backward_error(self):
             loss = paddle.nn.functional.relu(var_c + var_d)
             with self.assertRaisesRegex(
                 RuntimeError,
-                "received tensor_version:{} != wrapper_version_snapshot:{}".format(
-                    1, 0
-                ),
+                f"received tensor_version:{1} != wrapper_version_snapshot:{0}",
             ):
                 loss.backward()
 
diff --git a/test/legacy_test/test_dist_base.py b/test/legacy_test/test_dist_base.py
index 2e6895b717579..db7d490e3a5af 100755
--- a/test/legacy_test/test_dist_base.py
+++ b/test/legacy_test/test_dist_base.py
@@ -1135,9 +1135,8 @@ def _run_local(
             envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
             cmd += " -m coverage run --branch -p"
 
-        cmd += " {} --role trainer --update_method local --lr {:f}".format(
-            model,
-            self._lr,
+        cmd += (
+            f" {model} --role trainer --update_method local --lr {self._lr:f}"
         )
 
         if batch_size != DEFAULT_BATCH_SIZE:
@@ -1522,9 +1521,7 @@ def _run_cluster_gloo(
             tr_env["GLOG_vmodule"] = 'gloo_context=4'
             tr_env["GLOG_v"] = '3'
             print(
-                "use_hallreduce:{} tr_cmd:{}, env: {}".format(
-                    self._use_hallreduce, tr_cmd, tr_env
-                )
+                f"use_hallreduce:{self._use_hallreduce} tr_cmd:{tr_cmd}, env: {tr_env}"
             )
 
             path = os.path.join(
@@ -1596,9 +1593,7 @@ def _run_cluster_nccl2(
             )
             tr_env.update(envs)
             print(
-                "use_hallreduce:{} tr_cmd:{}, env: {}".format(
-                    self._use_hallreduce, tr_cmd, tr_env
-                )
+                f"use_hallreduce:{self._use_hallreduce} tr_cmd:{tr_cmd}, env: {tr_env}"
             )
 
             path = os.path.join(
diff --git a/test/legacy_test/test_dist_fleet_base.py b/test/legacy_test/test_dist_fleet_base.py
index ad421c228b0e5..94d6f836750b0 100644
--- a/test/legacy_test/test_dist_fleet_base.py
+++ b/test/legacy_test/test_dist_fleet_base.py
@@ -423,18 +423,14 @@ def is_listen_failed(logx):
         def catlog(logx):
             basename = os.path.basename(logx)
             print(
-                "\n================== Error {} begin =====================".format(
-                    basename
-                )
+                f"\n================== Error {basename} begin ====================="
             )
 
             if not os.path.isfile(logx):
                 raise FileNotFoundError(f"{logx} is not a file")
             os.system(f"cat {logx}")
             print(
-                "================== Error {} end =====================\n".format(
-                    basename
-                )
+                f"================== Error {basename} end =====================\n"
             )
 
         if tr0_ret != 0 or tr1_ret != 0:
diff --git a/test/legacy_test/test_eager_deletion_delete_vars.py b/test/legacy_test/test_eager_deletion_delete_vars.py
index 7420e15981c27..e61cccc83b201 100644
--- a/test/legacy_test/test_eager_deletion_delete_vars.py
+++ b/test/legacy_test/test_eager_deletion_delete_vars.py
@@ -110,15 +110,9 @@ def assertScopeVar(self, scope, persitables, non_persistables):
             if t._is_initialized():
                 outline_np_vars.append(name)
 
+        print(f'Non-alive persistable vars {outline_p_vars} in {persitables}')
         print(
-            'Non-alive persistable vars {} in {}'.format(
-                outline_p_vars, persitables
-            )
-        )
-        print(
-            'Alive non-persistable vars {} in {}'.format(
-                outline_np_vars, non_persistables
-            )
+            f'Alive non-persistable vars {outline_np_vars} in {non_persistables}'
         )
         self.assertEqual(len(outline_p_vars), 0)
         self.assertEqual(len(outline_np_vars), 0)
diff --git a/test/legacy_test/test_fused_dropout_add_op.py b/test/legacy_test/test_fused_dropout_add_op.py
index 6466775f432da..699d44bb37891 100644
--- a/test/legacy_test/test_fused_dropout_add_op.py
+++ b/test/legacy_test/test_fused_dropout_add_op.py
@@ -101,8 +101,8 @@ def setUp(self):
             self.mode = mode
             self.seed = seed
 
-    cls_name = "{}_{}_{}_{}_{}_{}".format(
-        parent.__name__, dtype, mode, str(training), str(p), str(seed)
+    cls_name = (
+        f"{parent.__name__}_{dtype}_{mode}_{str(training)}_{str(p)}_{str(seed)}"
     )
     TestFusedDropoutAddCase.__name__ = cls_name
     globals()[cls_name] = TestFusedDropoutAddCase
diff --git a/test/legacy_test/test_generate_proposals_op.py b/test/legacy_test/test_generate_proposals_op.py
index 7ce1aa5e4666b..901d009effc5b 100644
--- a/test/legacy_test/test_generate_proposals_op.py
+++ b/test/legacy_test/test_generate_proposals_op.py
@@ -208,9 +208,7 @@ def clip_tiled_boxes(boxes, im_shape, pixel_offset=True):
     has shape (N, 4 * num_tiled_boxes)."""
     assert (
         boxes.shape[1] % 4 == 0
-    ), 'boxes.shape[1] is {:d}, but must be divisible by 4.'.format(
-        boxes.shape[1]
-    )
+    ), f'boxes.shape[1] is {boxes.shape[1]:d}, but must be divisible by 4.'
     offset = 1 if pixel_offset else 0
     # x1 >= 0
     boxes[:, 0::4] = np.maximum(
diff --git a/test/legacy_test/test_generator_dataloader.py b/test/legacy_test/test_generator_dataloader.py
index 7de57eb3eb5ca..9216e5a437970 100644
--- a/test/legacy_test/test_generator_dataloader.py
+++ b/test/legacy_test/test_generator_dataloader.py
@@ -134,9 +134,9 @@ def run_main(
                 for _ in range(EPOCH_NUM):
                     step = 0
                     for d in py_reader():
-                        assert len(d) == len(places), "{} != {}".format(
-                            len(d), len(places)
-                        )
+                        assert len(d) == len(
+                            places
+                        ), f"{len(d)} != {len(places)}"
                         for i, item in enumerate(d):
                             image = item['image']
                             label = item['label']
diff --git a/test/legacy_test/test_imperative_resnet.py b/test/legacy_test/test_imperative_resnet.py
index b91f840a57bf0..edcc1cf39cba0 100644
--- a/test/legacy_test/test_imperative_resnet.py
+++ b/test/legacy_test/test_imperative_resnet.py
@@ -171,9 +171,7 @@ def __init__(self, layers=50, class_dim=102, use_cudnn=True):
         supported_layers = [50, 101, 152]
         assert (
             layers in supported_layers
-        ), "supported layers are {} but input layer is {}".format(
-            supported_layers, layers
-        )
+        ), f"supported layers are {supported_layers} but input layer is {layers}"
 
         if layers == 50:
             depth = [3, 4, 6, 3]
diff --git a/test/legacy_test/test_imperative_se_resnext.py b/test/legacy_test/test_imperative_se_resnext.py
index 24bc9e56d0e1d..51ca82499b629 100644
--- a/test/legacy_test/test_imperative_se_resnext.py
+++ b/test/legacy_test/test_imperative_se_resnext.py
@@ -199,9 +199,7 @@ def __init__(self, layers=50, class_dim=102):
         supported_layers = [50, 101, 152]
         assert (
             layers in supported_layers
-        ), "supported layers are {} but input layer is {}".format(
-            supported_layers, layers
-        )
+        ), f"supported layers are {supported_layers} but input layer is {layers}"
 
         if layers == 50:
             cardinality = 32
diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py
index 5b53b55f5f96a..676977ba2ac48 100644
--- a/test/legacy_test/test_inplace.py
+++ b/test/legacy_test/test_inplace.py
@@ -56,9 +56,7 @@ def test_backward_error(self):
             loss = paddle.nn.functional.relu(var_c + var_d)
             with self.assertRaisesRegex(
                 RuntimeError,
-                "received tensor_version:{} != wrapper_version_snapshot:{}".format(
-                    1, 0
-                ),
+                f"received tensor_version:{1} != wrapper_version_snapshot:{0}",
             ):
                 loss.backward()
 
@@ -173,9 +171,7 @@ def test_backward_error(self):
             loss = paddle.nn.functional.relu(var_c)
             with self.assertRaisesRegex(
                 RuntimeError,
-                "received tensor_version:{} != wrapper_version_snapshot:{}".format(
-                    1, 0
-                ),
+                f"received tensor_version:{1} != wrapper_version_snapshot:{0}",
             ):
                 loss.backward()
 
@@ -890,9 +886,7 @@ def test_backward_error(self):
             loss = paddle.nn.functional.relu(var_c)
             with self.assertRaisesRegex(
                 RuntimeError,
-                "received tensor_version:{} != wrapper_version_snapshot:{}".format(
-                    3, 0
-                ),
+                f"received tensor_version:{3} != wrapper_version_snapshot:{0}",
             ):
                 loss.backward()
 
@@ -981,9 +975,7 @@ def test_backward_error(self):
             loss = paddle.nn.functional.relu(var_c)
             with self.assertRaisesRegex(
                 RuntimeError,
-                "received tensor_version:{} != wrapper_version_snapshot:{}".format(
-                    2, 0
-                ),
+                f"received tensor_version:{2} != wrapper_version_snapshot:{0}",
             ):
                 loss.backward()
 
@@ -1059,9 +1051,7 @@ def test_backward_error(self):
             loss = paddle.nn.functional.relu(var_c)
             with self.assertRaisesRegex(
                 RuntimeError,
-                "received tensor_version:{} != wrapper_version_snapshot:{}".format(
-                    2, 0
-                ),
+                f"received tensor_version:{2} != wrapper_version_snapshot:{0}",
             ):
                 loss.backward()
 
@@ -1357,9 +1347,7 @@ def test_backward_error(self):
             loss = paddle.nn.functional.relu(var_c)
             with self.assertRaisesRegex(
                 RuntimeError,
-                "received tensor_version:{} != wrapper_version_snapshot:{}".format(
-                    2, 0
-                ),
+                f"received tensor_version:{2} != wrapper_version_snapshot:{0}",
             ):
                 loss.backward()
 
@@ -1401,9 +1389,7 @@ def test_backward_error(self):
             loss = paddle.nn.functional.relu(var_c)
             with self.assertRaisesRegex(
                 RuntimeError,
-                "received tensor_version:{} != wrapper_version_snapshot:{}".format(
-                    2, 0
-                ),
+                f"received tensor_version:{2} != wrapper_version_snapshot:{0}",
             ):
                 loss.backward()
 
diff --git a/test/legacy_test/test_layers.py b/test/legacy_test/test_layers.py
index 1055a7d8695d4..5d8087e7138d3 100644
--- a/test/legacy_test/test_layers.py
+++ b/test/legacy_test/test_layers.py
@@ -1509,9 +1509,7 @@ def test_all_layers(self):
                     dy_result_value,
                     rtol=1e-05,
                     atol=0,
-                    err_msg='Result of function [{}] compare failed'.format(
-                        method.__name__
-                    ),
+                    err_msg=f'Result of function [{method.__name__}] compare failed',
                 )
                 continue
 
@@ -1519,9 +1517,7 @@ def test_all_layers(self):
                 np.testing.assert_array_equal(
                     static_result[0],
                     dy_result_value,
-                    err_msg='Result of function [{}] not equal'.format(
-                        method.__name__
-                    ),
+                    err_msg=f'Result of function [{method.__name__}] not equal',
                 )
 
     def _get_np_data(self, shape, dtype, append_batch_size=True):
diff --git a/test/legacy_test/test_lstm_cudnn_op.py b/test/legacy_test/test_lstm_cudnn_op.py
index e36d4b01a866f..2d61b7c8f9a2d 100644
--- a/test/legacy_test/test_lstm_cudnn_op.py
+++ b/test/legacy_test/test_lstm_cudnn_op.py
@@ -384,7 +384,7 @@ def __init__(
         else:
             raise ValueError(
                 "direction should be forward, backward or bidirectional, "
-                "received direction = {}".format(direction)
+                f"received direction = {direction}"
             )
 
         self.input_size = input_size
diff --git a/test/legacy_test/test_multi_dot_op.py b/test/legacy_test/test_multi_dot_op.py
index c18ee94fb01e6..28f8dde3c05c7 100644
--- a/test/legacy_test/test_multi_dot_op.py
+++ b/test/legacy_test/test_multi_dot_op.py
@@ -318,9 +318,7 @@ def test_out(self):
             expected_result,
             rtol=1e-05,
             atol=1e-05,
-            err_msg='two value is            {}\n{}, check diff!'.format(
-                np_res, expected_result
-            ),
+            err_msg=f'two value is            {np_res}\n{expected_result}, check diff!',
         )
 
     def test_dygraph_without_out(self):
diff --git a/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_static.py b/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_static.py
index cf7ff971ca711..845f4b0e80582 100644
--- a/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_static.py
+++ b/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_static.py
@@ -137,9 +137,7 @@ def run_main(self, num_workers, places, persistent_workers):
             for i in range(EPOCH_NUM):
                 step = 0
                 for d in dataloader:
-                    assert len(d) == len(places), "{} != {}".format(
-                        len(d), len(places)
-                    )
+                    assert len(d) == len(places), f"{len(d)} != {len(places)}"
                     for i, item in enumerate(d):
                         image = item['image']
                         label = item['label']
@@ -239,9 +237,7 @@ def run_main(self, num_workers, places, persistent_workers):
             for i in range(EPOCH_NUM):
                 step = 0
                 for d in dataloader:
-                    assert len(d) == len(places), "{} != {}".format(
-                        len(d), len(places)
-                    )
+                    assert len(d) == len(places), f"{len(d)} != {len(places)}"
                     for i, item in enumerate(d):
                         image = item['image']
                         label = item['label']
diff --git a/test/legacy_test/test_multiprocess_dataloader_static.py b/test/legacy_test/test_multiprocess_dataloader_static.py
index fed4534c52a02..2d465fd818df0 100644
--- a/test/legacy_test/test_multiprocess_dataloader_static.py
+++ b/test/legacy_test/test_multiprocess_dataloader_static.py
@@ -137,9 +137,7 @@ def run_main(self, num_workers, places, persistent_workers):
             for _ in range(EPOCH_NUM):
                 step = 0
                 for d in dataloader:
-                    assert len(d) == len(places), "{} != {}".format(
-                        len(d), len(places)
-                    )
+                    assert len(d) == len(places), f"{len(d)} != {len(places)}"
                     for i, item in enumerate(d):
                         image = item['image']
                         label = item['label']
@@ -298,9 +296,7 @@ def run_main(self, num_workers, places, persistent_workers):
             for _ in range(EPOCH_NUM):
                 step = 0
                 for d in dataloader:
-                    assert len(d) == len(places), "{} != {}".format(
-                        len(d), len(places)
-                    )
+                    assert len(d) == len(places), f"{len(d)} != {len(places)}"
                     for i, item in enumerate(d):
                         image = item['image']
                         label = item['label']
diff --git a/test/legacy_test/test_ops_nms.py b/test/legacy_test/test_ops_nms.py
index 8b95329eb56d3..0e6a5d9545543 100644
--- a/test/legacy_test/test_ops_nms.py
+++ b/test/legacy_test/test_ops_nms.py
@@ -227,9 +227,7 @@ def fun(x):
                 np.testing.assert_array_equal(
                     origin,
                     res,
-                    err_msg='origin out: {}\n inference model out: {}\n'.format(
-                        origin, res
-                    ),
+                    err_msg=f'origin out: {origin}\n inference model out: {res}\n',
                 )
 
     def test_matrix_nms_dynamic(self):
diff --git a/test/legacy_test/test_pylayer_op.py b/test/legacy_test/test_pylayer_op.py
index 3f8b14f06d1f0..70bf6d947b37b 100644
--- a/test/legacy_test/test_pylayer_op.py
+++ b/test/legacy_test/test_pylayer_op.py
@@ -467,9 +467,7 @@ def forward(self, data):
         z = layer(data)
         with self.assertRaisesRegex(
             RuntimeError,
-            "received tensor_version:{} != wrapper_version_snapshot:{}".format(
-                1, 0
-            ),
+            f"received tensor_version:{1} != wrapper_version_snapshot:{0}",
         ):
             z.backward()
 
diff --git a/test/legacy_test/test_run.py b/test/legacy_test/test_run.py
index d870e346cd269..e0ec7c9657fb5 100644
--- a/test/legacy_test/test_run.py
+++ b/test/legacy_test/test_run.py
@@ -89,9 +89,7 @@ def test_collective_1(self):
 
     def test_collective_2(self):
         log_dir = tempfile.TemporaryDirectory()
-        args = "--job_id test2 --devices 0,1,2 --log_dir {}".format(
-            log_dir.name
-        )
+        args = f"--job_id test2 --devices 0,1,2 --log_dir {log_dir.name}"
         p = self.pdrun(args)
         p.wait()
         self.assertTrue(p.poll() == 0)
@@ -166,11 +164,7 @@ def test_ps_1(self):
 
     def test_ps_2(self):
         log_dir = tempfile.TemporaryDirectory()
-        args = (
-            "--job_id ps2 --server_num=2 --trainer_num=2 --log_dir {}".format(
-                log_dir.name
-            )
-        )
+        args = f"--job_id ps2 --server_num=2 --trainer_num=2 --log_dir {log_dir.name}"
         p = self.pdrun(args)
         p.wait()
         self.assertTrue(p.poll() == 0)
diff --git a/test/legacy_test/test_sample_logits_op.py b/test/legacy_test/test_sample_logits_op.py
index 4f29b62ae1744..64c70b5a8a07c 100644
--- a/test/legacy_test/test_sample_logits_op.py
+++ b/test/legacy_test/test_sample_logits_op.py
@@ -76,19 +76,13 @@ def test_check_output(self):
             ), f"Samples dtype is {Samples.dtype}, not int64"
             assert (
                 Probabilities.dtype == np.float64
-            ), "Probabilities dtype is {}, not float64".format(
-                Probabilities.dtype
-            )
+            ), f"Probabilities dtype is {Probabilities.dtype}, not float64"
             assert (
                 SampledLogits.dtype == np.float64
-            ), "SampledLogits dtype is {}, not float64".format(
-                SampledLogits.dtype
-            )
+            ), f"SampledLogits dtype is {SampledLogits.dtype}, not float64"
             assert (
                 SampledLabels.dtype == np.int64
-            ), "SampledLabels dtype is {}, not int64".format(
-                SampledLabels.dtype
-            )
+            ), f"SampledLabels dtype is {SampledLabels.dtype}, not int64"
 
             assert Samples.shape == (self.bs, self.NT + self.S)
             assert Probabilities.shape == (self.bs, self.NT + self.S)
diff --git a/test/legacy_test/test_signal.py b/test/legacy_test/test_signal.py
index 013ea22fe6f51..1d86f15f51095 100644
--- a/test/legacy_test/test_signal.py
+++ b/test/legacy_test/test_signal.py
@@ -73,9 +73,7 @@ def normalize(S, norm=np.inf, axis=0, threshold=None, fill=None):
         threshold = tiny(S)
 
     elif threshold <= 0:
-        raise Exception(
-            "threshold={} must be strictly " "positive".format(threshold)
-        )
+        raise Exception(f"threshold={threshold} must be strictly " "positive")
 
     if fill not in [None, False, True]:
         raise Exception(f"fill={fill} must be None or boolean")
@@ -213,14 +211,13 @@ def dtype_r2c(d, default=np.complex64):
 def frame(x, frame_length, hop_length, axis=-1):
     if not isinstance(x, np.ndarray):
         raise Exception(
-            "Input must be of type numpy.ndarray, "
-            "given type(x)={}".format(type(x))
+            "Input must be of type numpy.ndarray, " f"given type(x)={type(x)}"
         )
 
     if x.shape[axis] < frame_length:
         raise Exception(
-            "Input is too short (n={:d})"
-            " for frame_length={:d}".format(x.shape[axis], frame_length)
+            f"Input is too short (n={x.shape[axis]:d})"
+            f" for frame_length={frame_length:d}"
         )
 
     if hop_length < 1:
@@ -228,18 +225,14 @@ def frame(x, frame_length, hop_length, axis=-1):
 
     if axis == -1 and not x.flags["F_CONTIGUOUS"]:
         print(
-            "librosa.util.frame called with axis={} "
-            "on a non-contiguous input. This will result in a copy.".format(
-                axis
-            )
+            f"librosa.util.frame called with axis={axis} "
+            "on a non-contiguous input. This will result in a copy."
         )
         x = np.asfortranarray(x)
     elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
         print(
-            "librosa.util.frame called with axis={} "
-            "on a non-contiguous input. This will result in a copy.".format(
-                axis
-            )
+            f"librosa.util.frame called with axis={axis} "
+            "on a non-contiguous input. This will result in a copy."
         )
         x = np.ascontiguousarray(x)
 
@@ -274,9 +267,7 @@ def pad_center(data, size, axis=-1, **kwargs):
 
     if lpad < 0:
         raise Exception(
-            ("Target size ({:d}) must be " "at least input size ({:d})").format(
-                size, n
-            )
+            f"Target size ({size:d}) must be " f"at least input size ({n:d})"
         )
 
     return np.pad(data, lengths, **kwargs)
@@ -295,9 +286,7 @@ def get_window(window, Nx, fftbins=True):
         if len(window) == Nx:
             return np.asarray(window)
 
-        raise Exception(
-            "Window size mismatch: " "{:d} != {:d}".format(len(window), Nx)
-        )
+        raise Exception("Window size mismatch: " f"{len(window):d} != {Nx:d}")
     else:
         raise Exception(f"Invalid window specification: {window}")
 
@@ -350,18 +339,14 @@ def stft(
     if center:
         if n_fft > y.shape[-1]:
             print(
-                "n_fft={} is too small for input signal of length={}".format(
-                    n_fft, y.shape[-1]
-                )
+                f"n_fft={n_fft} is too small for input signal of length={y.shape[-1]}"
             )
 
         y = np.pad(y, int(n_fft // 2), mode=pad_mode)
 
     elif n_fft > y.shape[-1]:
         raise Exception(
-            "n_fft={} is too large for input signal of length={}".format(
-                n_fft, y.shape[-1]
-            )
+            f"n_fft={n_fft} is too large for input signal of length={y.shape[-1]}"
         )
 
     # Window the time series.
diff --git a/test/legacy_test/test_static_save_load.py b/test/legacy_test/test_static_save_load.py
index 09e204e62191e..b46f49dbfefc9 100644
--- a/test/legacy_test/test_static_save_load.py
+++ b/test/legacy_test/test_static_save_load.py
@@ -926,9 +926,7 @@ def set_var(var, ndarray):
         for v in parameter_list:
             assert (
                 v.name in load_dict
-            ), "Can not find [{}] in model file [{}]".format(
-                v.name, parameter_file_name
-            )
+            ), f"Can not find [{v.name}] in model file [{parameter_file_name}]"
             new_v = new_scope.find_var(v.name)
             set_var(new_v, load_dict[v.name])
 
@@ -949,9 +947,7 @@ def set_var(var, ndarray):
         for v in opt_list:
             assert (
                 v.name in load_dict
-            ), "Can not find [{}] in model file [{}]".format(
-                v.name, opt_file_name
-            )
+            ), f"Can not find [{v.name}] in model file [{opt_file_name}]"
 
             new_v = new_scope.find_var(v.name)
             set_var(new_v, load_dict[v.name])
diff --git a/test/legacy_test/test_sync_batch_norm_op.py b/test/legacy_test/test_sync_batch_norm_op.py
index 68cb93c31d91e..0375ee7c52776 100644
--- a/test/legacy_test/test_sync_batch_norm_op.py
+++ b/test/legacy_test/test_sync_batch_norm_op.py
@@ -216,9 +216,7 @@ def _compare_impl(self, place, layout, only_forward):
         for id in range(core.get_cuda_device_count()):
             filepath = os.path.join(
                 self.data_dir.name,
-                'input_{}_{}_{}_{}.npy'.format(
-                    id, only_forward, str(self.dtype.__name__), layout
-                ),
+                f'input_{id}_{only_forward}_{str(self.dtype.__name__)}_{layout}.npy',
             )
             np.save(filepath, data[id * stride : (id + 1) * stride])
         data = create_or_get_tensor(
@@ -282,9 +280,7 @@ def _compare_impl(self, place, layout, only_forward):
             bn_val = bn_fetches[i]
             file_path = os.path.join(
                 self.data_dir.name,
-                'output_{}_{}_{}_{}.npy'.format(
-                    0, only_forward, self.dtype.__name__, i
-                ),
+                f'output_{0}_{only_forward}_{self.dtype.__name__}_{i}.npy',
             )
             sync_bn_val = np.load(file_path)
             if sync_bn_val.shape != bn_val.shape:
diff --git a/test/legacy_test/test_translated_layer.py b/test/legacy_test/test_translated_layer.py
index 6e87f24bb13e9..8d8a9d919f366 100644
--- a/test/legacy_test/test_translated_layer.py
+++ b/test/legacy_test/test_translated_layer.py
@@ -72,9 +72,7 @@ def train(layer, loader, loss_fn, opt):
             opt.step()
             opt.clear_grad()
             print(
-                "Epoch {} batch {}: loss = {}".format(
-                    epoch_id, batch_id, np.mean(loss.numpy())
-                )
+                f"Epoch {epoch_id} batch {batch_id}: loss = {np.mean(loss.numpy())}"
             )
     return loss
 
@@ -158,9 +156,7 @@ def load_and_fine_tuning(self):
         np.testing.assert_array_equal(
             orig_loss.numpy(),
             loss.numpy(),
-            err_msg='original loss:\n{}\nnew loss:\n{}\n'.format(
-                orig_loss.numpy(), loss.numpy()
-            ),
+            err_msg=f'original loss:\n{orig_loss.numpy()}\nnew loss:\n{loss.numpy()}\n',
         )
 
     def test_get_program(self):
diff --git a/test/legacy_test/test_tril_triu_op.py b/test/legacy_test/test_tril_triu_op.py
index fee9b6f95023a..a3add39f00f3f 100644
--- a/test/legacy_test/test_tril_triu_op.py
+++ b/test/legacy_test/test_tril_triu_op.py
@@ -100,16 +100,12 @@ def case_generator(op_type, Xshape, diagonal, expected, dtype):
     If arg`expercted` is 'success', it will register an Optest case and expect to pass.
     Otherwise, it will register an API case and check the expect failure.
     """
-    cls_name = "{}_{}_shape_{}_diag_{}_dtype_{}".format(
-        expected, op_type, Xshape, diagonal, dtype
+    cls_name = (
+        f"{expected}_{op_type}_shape_{Xshape}_diag_{diagonal}_dtype_{dtype}"
     )
     errmsg = {
-        "diagonal: TypeError": "diagonal in {} must be a python Int".format(
-            op_type
-        ),
-        "input: ValueError": "x shape in {} must be at least 2-D".format(
-            op_type
-        ),
+        "diagonal: TypeError": f"diagonal in {op_type} must be a python Int",
+        "input: ValueError": f"x shape in {op_type} must be at least 2-D",
     }
 
     class FailureCase(unittest.TestCase):
diff --git a/test/legacy_test/test_variable.py b/test/legacy_test/test_variable.py
index 024527f1332c4..2daf7017bbcae 100644
--- a/test/legacy_test/test_variable.py
+++ b/test/legacy_test/test_variable.py
@@ -1046,16 +1046,12 @@ def test_static_graph_tensor_index_setitem_muti_dim(self):
                 np.testing.assert_array_equal(
                     array2,
                     setitem_pp[0],
-                    err_msg='\n numpy:{},\n paddle:{}'.format(
-                        array2, setitem_pp[0]
-                    ),
+                    err_msg=f'\n numpy:{array2},\n paddle:{setitem_pp[0]}',
                 )
                 np.testing.assert_array_equal(
                     array3,
                     setitem_pp[1],
-                    err_msg='\n numpy:{},\n paddle:{}'.format(
-                        array3, setitem_pp[1]
-                    ),
+                    err_msg=f'\n numpy:{array3},\n paddle:{setitem_pp[1]}',
                 )
             array = array[0]
             index1 = index1[0]
@@ -1122,31 +1118,23 @@ def test_static_graph_array_index_muti_dim(self):
                 np.testing.assert_array_equal(
                     array2,
                     setitem_pp[0],
-                    err_msg='\n numpy:{},\n paddle:{}'.format(
-                        array2, setitem_pp[0]
-                    ),
+                    err_msg=f'\n numpy:{array2},\n paddle:{setitem_pp[0]}',
                 )
                 np.testing.assert_array_equal(
                     array3,
                     setitem_pp[1],
-                    err_msg='\n numpy:{},\n paddle:{}'.format(
-                        array3, setitem_pp[1]
-                    ),
+                    err_msg=f'\n numpy:{array3},\n paddle:{setitem_pp[1]}',
                 )
 
                 np.testing.assert_array_equal(
                     y_np1,
                     setitem_pp[2],
-                    err_msg='\n numpy:{},\n paddle:{}'.format(
-                        y_np1, setitem_pp[2]
-                    ),
+                    err_msg=f'\n numpy:{y_np1},\n paddle:{setitem_pp[2]}',
                 )
                 np.testing.assert_array_equal(
                     y_np2,
                     setitem_pp[3],
-                    err_msg='\n numpy:{},\n paddle:{}'.format(
-                        y_np2, setitem_pp[3]
-                    ),
+                    err_msg=f'\n numpy:{y_np2},\n paddle:{setitem_pp[3]}',
                 )
             array = array[0]
             index1 = index1[0]
@@ -1205,9 +1193,7 @@ def test_dygraph_array_index_muti_dim(self):
             np.testing.assert_array_equal(
                 tensor1.numpy(),
                 array1,
-                err_msg='\n numpy:{},\n paddle:{}'.format(
-                    array1, tensor1.numpy()
-                ),
+                err_msg=f'\n numpy:{array1},\n paddle:{tensor1.numpy()}',
             )
             # 1 dim setitem
             array2 = array.copy()
@@ -1219,9 +1205,7 @@ def test_dygraph_array_index_muti_dim(self):
             np.testing.assert_array_equal(
                 tensor2.numpy(),
                 array2,
-                err_msg='\n numpy:{},\n paddle:{}'.format(
-                    array2, tensor2.numpy()
-                ),
+                err_msg=f'\n numpy:{array2},\n paddle:{tensor2.numpy()}',
             )
 
             array = array[0]
diff --git a/test/legacy_test/test_view_op_reuse_allocation.py b/test/legacy_test/test_view_op_reuse_allocation.py
index ea48c9addb5b3..0b99f42a6bab4 100644
--- a/test/legacy_test/test_view_op_reuse_allocation.py
+++ b/test/legacy_test/test_view_op_reuse_allocation.py
@@ -80,9 +80,7 @@ def test_backward_error(self):
             loss = paddle.nn.functional.relu(var_c)
             with self.assertRaisesRegex(
                 RuntimeError,
-                "received tensor_version:{} != wrapper_version_snapshot:{}".format(
-                    1, 0
-                ),
+                f"received tensor_version:{1} != wrapper_version_snapshot:{0}",
             ):
                 loss.backward()
 
diff --git a/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py b/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py
index 71e484f87569b..dec8a27bcd394 100644
--- a/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py
+++ b/test/mkldnn/test_onnx_format_quantization_mobilenetv1.py
@@ -164,8 +164,8 @@ def tearDown(self):
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
-                target_folder, zip_path
+            cmd = (
+                f'mkdir {target_folder} && tar xf {zip_path} -C {target_folder}'
             )
             os.system(cmd)
 
diff --git a/test/ps/static_gpubox_trainer.py b/test/ps/static_gpubox_trainer.py
index 5940e3942cac1..9b4d07e9ef70d 100755
--- a/test/ps/static_gpubox_trainer.py
+++ b/test/ps/static_gpubox_trainer.py
@@ -173,9 +173,7 @@ def dataset_train_loop(self, epoch):
         start_time = time.time()
         self.reader.load_into_memory()
         print(
-            "self.reader.load_into_memory cost :{} seconds".format(
-                time.time() - start_time
-            )
+            f"self.reader.load_into_memory cost :{time.time() - start_time} seconds"
         )
 
         begin_pass_time = time.time()
diff --git a/test/quantization/quant2_int8_image_classification_comparison.py b/test/quantization/quant2_int8_image_classification_comparison.py
index 5c9954df91118..71505e7f84ee6 100644
--- a/test/quantization/quant2_int8_image_classification_comparison.py
+++ b/test/quantization/quant2_int8_image_classification_comparison.py
@@ -113,9 +113,7 @@ def reader():
                 while step < num:
                     fp.seek(imgs_offset + img_size * step)
                     img = fp.read(img_size)
-                    img = struct.unpack_from(
-                        '{}f'.format(img_ch * img_w * img_h), img
-                    )
+                    img = struct.unpack_from(f'{img_ch * img_w * img_h}f', img)
                     img = np.array(img)
                     img.shape = (img_ch, img_w, img_h)
                     fp.seek(labels_offset + label_size * step)
@@ -310,17 +308,11 @@ def _predict(
             return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
 
     def _print_performance(self, title, fps, lat):
-        _logger.info(
-            '{}: avg fps: {:.2f}, avg latency: {:.4f} ms'.format(
-                title, fps, lat
-            )
-        )
+        _logger.info(f'{title}: avg fps: {fps:.2f}, avg latency: {lat:.4f} ms')
 
     def _print_accuracy(self, title, acc1, acc5):
         _logger.info(
-            '{}: avg top1 accuracy: {:.4f}, avg top5 accuracy: {:.4f}'.format(
-                title, acc1, acc5
-            )
+            f'{title}: avg top1 accuracy: {acc1:.4f}, avg top5 accuracy: {acc5:.4f}'
         )
 
     def _summarize_performance(self, int8_fps, int8_lat, fp32_fps, fp32_lat):
diff --git a/test/quantization/quant2_int8_lstm_model.py b/test/quantization/quant2_int8_lstm_model.py
index 5bbb378e9c35e..8cfa3ab04666e 100644
--- a/test/quantization/quant2_int8_lstm_model.py
+++ b/test/quantization/quant2_int8_lstm_model.py
@@ -251,21 +251,15 @@ def test_lstm_model(self):
         )
 
         print(
-            "FP32: fps {}, hx_acc {}, ctc_acc {}".format(
-                fp32_fps, fp32_hx_acc, fp32_ctc_acc
-            )
+            f"FP32: fps {fp32_fps}, hx_acc {fp32_hx_acc}, ctc_acc {fp32_ctc_acc}"
         )
 
         print(
-            "PTQ_INT8: fps {}, hx_acc {}, ctc_acc {}".format(
-                int8_fps, int8_hx_acc, int8_ctc_acc
-            )
+            f"PTQ_INT8: fps {int8_fps}, hx_acc {int8_hx_acc}, ctc_acc {int8_ctc_acc}"
         )
 
         print(
-            "QAT: fps {}, hx_acc {}, ctc_acc {}".format(
-                quant_fps, quant_hx_acc, quant_ctc_acc
-            )
+            f"QAT: fps {quant_fps}, hx_acc {quant_hx_acc}, ctc_acc {quant_ctc_acc}"
         )
 
         sys.stdout.flush()
diff --git a/test/quantization/quant_int8_image_classification_comparison.py b/test/quantization/quant_int8_image_classification_comparison.py
index cc0a6ad32ffc2..7d04939ee3731 100644
--- a/test/quantization/quant_int8_image_classification_comparison.py
+++ b/test/quantization/quant_int8_image_classification_comparison.py
@@ -92,9 +92,7 @@ def reader():
                 while step < num:
                     fp.seek(imgs_offset + img_size * step)
                     img = fp.read(img_size)
-                    img = struct.unpack_from(
-                        '{}f'.format(img_ch * img_w * img_h), img
-                    )
+                    img = struct.unpack_from(f'{img_ch * img_w * img_h}f', img)
                     img = np.array(img)
                     img.shape = (img_ch, img_w, img_h)
                     fp.seek(labels_offset + label_size * step)
@@ -261,14 +259,10 @@ def _predict(
     def _summarize_performance(self, fp32_fps, fp32_lat, int8_fps, int8_lat):
         _logger.info('--- Performance summary ---')
         _logger.info(
-            'FP32: avg fps: {:.2f}, avg latency: {:.4f} ms'.format(
-                fp32_fps, fp32_lat
-            )
+            f'FP32: avg fps: {fp32_fps:.2f}, avg latency: {fp32_lat:.4f} ms'
         )
         _logger.info(
-            'INT8: avg fps: {:.2f}, avg latency: {:.4f} ms'.format(
-                int8_fps, int8_lat
-            )
+            f'INT8: avg fps: {int8_fps:.2f}, avg latency: {int8_lat:.4f} ms'
         )
 
     def _compare_accuracy(
diff --git a/test/quantization/test_imperative_ptq.py b/test/quantization/test_imperative_ptq.py
index 8ad6fa2832e93..189be58754c9f 100644
--- a/test/quantization/test_imperative_ptq.py
+++ b/test/quantization/test_imperative_ptq.py
@@ -91,8 +91,8 @@ def setUpClass(cls):
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
-                target_folder, zip_path
+            cmd = (
+                f'mkdir {target_folder} && tar xf {zip_path} -C {target_folder}'
             )
             os.system(cmd)
 
diff --git a/test/quantization/test_imperative_qat_amp.py b/test/quantization/test_imperative_qat_amp.py
index 4dba9c5421df4..29b7b28df5815 100644
--- a/test/quantization/test_imperative_qat_amp.py
+++ b/test/quantization/test_imperative_qat_amp.py
@@ -68,8 +68,8 @@ def tearDownClass(cls):
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
-                target_folder, zip_path
+            cmd = (
+                f'mkdir {target_folder} && tar xf {zip_path} -C {target_folder}'
             )
             os.system(cmd)
 
diff --git a/test/quantization/test_post_training_quantization_lstm_model.py b/test/quantization/test_post_training_quantization_lstm_model.py
index 81f68fd2b3986..76f861eb8bd7e 100644
--- a/test/quantization/test_post_training_quantization_lstm_model.py
+++ b/test/quantization/test_post_training_quantization_lstm_model.py
@@ -45,11 +45,7 @@ def setUp(self):
         try:
             os.system("mkdir -p " + self.int8_model_path)
         except Exception as e:
-            print(
-                "Failed to create {} due to {}".format(
-                    self.int8_model_path, str(e)
-                )
-            )
+            print(f"Failed to create {self.int8_model_path} due to {str(e)}")
             sys.exit(-1)
 
     def tearDown(self):
@@ -57,8 +53,8 @@ def tearDown(self):
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
-                target_folder, zip_path
+            cmd = (
+                f'mkdir {target_folder} && tar xf {zip_path} -C {target_folder}'
             )
             os.system(cmd)
 
@@ -253,9 +249,7 @@ def run_test(
         data_path = os.path.join(data_path, data_name)
 
         print(
-            "Start FP32 inference for {} on {} samples ...".format(
-                model_name, infer_iterations
-            )
+            f"Start FP32 inference for {model_name} on {infer_iterations} samples ..."
         )
         (fp32_latency, fp32_acc) = self.run_program(
             fp32_model_path,
@@ -287,9 +281,7 @@ def run_test(
         )
 
         print(
-            "Start INT8 inference for {} on {} samples ...".format(
-                model_name, infer_iterations
-            )
+            f"Start INT8 inference for {model_name} on {infer_iterations} samples ..."
         )
         (int8_latency, int8_acc) = self.run_program(
             self.int8_model_path,
diff --git a/test/quantization/test_post_training_quantization_mnist.py b/test/quantization/test_post_training_quantization_mnist.py
index cef001a8a60ef..2ff3f4e29ab68 100644
--- a/test/quantization/test_post_training_quantization_mnist.py
+++ b/test/quantization/test_post_training_quantization_mnist.py
@@ -61,11 +61,7 @@ def setUp(self):
             os.system("mkdir -p " + self.int8_model_path)
             os.system("mkdir -p " + self.cache_folder)
         except Exception as e:
-            print(
-                "Failed to create {} due to {}".format(
-                    self.int8_model_path, str(e)
-                )
-            )
+            print(f"Failed to create {self.int8_model_path} due to {str(e)}")
             sys.exit(-1)
 
     def tearDown(self):
@@ -73,8 +69,8 @@ def tearDown(self):
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
-                target_folder, zip_path
+            cmd = (
+                f'mkdir {target_folder} && tar xf {zip_path} -C {target_folder}'
             )
             os.system(cmd)
 
@@ -99,9 +95,7 @@ def download(self, url, dirname, md5sum, save_name=None):
                 retry += 1
             else:
                 raise RuntimeError(
-                    "Cannot download {} within retry limit {}".format(
-                        url, retry_limit
-                    )
+                    f"Cannot download {url} within retry limit {retry_limit}"
                 )
             sys.stderr.write(
                 f"Cache file {filename} not found, downloading {url} \n"
@@ -146,9 +140,7 @@ def download_model(self, data_url, data_md5, folder_name):
         file_name = data_url.split('/')[-1]
         zip_path = os.path.join(self.cache_folder, file_name)
         print(
-            'Data is downloaded at {}. File exists: {}'.format(
-                zip_path, os.path.exists(zip_path)
-            )
+            f'Data is downloaded at {zip_path}. File exists: {os.path.exists(zip_path)}'
         )
 
         data_cache_folder = os.path.join(self.cache_folder, folder_name)
@@ -164,9 +156,7 @@ def run_program(
         infer_iterations,
     ):
         print(
-            "test model path: {}. File exists: {}".format(
-                model_path, os.path.exists(model_path)
-            )
+            f"test model path: {model_path}. File exists: {os.path.exists(model_path)}"
         )
         place = paddle.CPUPlace()
         exe = paddle.static.Executor(place)
diff --git a/test/quantization/test_post_training_quantization_mobilenetv1.py b/test/quantization/test_post_training_quantization_mobilenetv1.py
index 75caf3d9c908d..4500f61ca13dc 100644
--- a/test/quantization/test_post_training_quantization_mobilenetv1.py
+++ b/test/quantization/test_post_training_quantization_mobilenetv1.py
@@ -168,8 +168,8 @@ def tearDown(self):
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
-                target_folder, zip_path
+            cmd = (
+                f'mkdir {target_folder} && tar xf {zip_path} -C {target_folder}'
             )
             os.system(cmd)
 
diff --git a/test/quantization/test_post_training_quantization_while.py b/test/quantization/test_post_training_quantization_while.py
index d515fc59cd4f9..9a169b27c513a 100644
--- a/test/quantization/test_post_training_quantization_while.py
+++ b/test/quantization/test_post_training_quantization_while.py
@@ -59,22 +59,14 @@ def setUp(self):
         try:
             os.system("mkdir -p " + self.int8_model_path)
         except Exception as e:
-            print(
-                "Failed to create {} due to {}".format(
-                    self.int8_model_path, str(e)
-                )
-            )
+            print(f"Failed to create {self.int8_model_path} due to {str(e)}")
             sys.exit(-1)
 
     def tearDown(self):
         try:
             os.system(f"rm -rf {self.int8_model_path}")
         except Exception as e:
-            print(
-                "Failed to delete {} due to {}".format(
-                    self.int8_model_path, str(e)
-                )
-            )
+            print(f"Failed to delete {self.int8_model_path} due to {str(e)}")
 
     def cache_unzipping(self, target_folder, zip_path):
         cmd = f'tar xf {zip_path} -C {target_folder}'
diff --git a/test/quantization/test_quant_post_quant_aware.py b/test/quantization/test_quant_post_quant_aware.py
index a387f8bd230ee..0fe582306fbd7 100644
--- a/test/quantization/test_quant_post_quant_aware.py
+++ b/test/quantization/test_quant_post_quant_aware.py
@@ -109,9 +109,7 @@ def train(program):
                 iter += 1
                 if iter % 100 == 0:
                     logging.info(
-                        'train iter={}, avg loss {}, acc_top1 {}'.format(
-                            iter, cost, top1
-                        )
+                        f'train iter={iter}, avg loss {cost}, acc_top1 {top1}'
                     )
 
         def test(program):
@@ -124,16 +122,12 @@ def test(program):
                 iter += 1
                 if iter % 100 == 0:
                     logging.info(
-                        'eval iter={}, avg loss {}, acc_top1 {}'.format(
-                            iter, cost, top1
-                        )
+                        f'eval iter={iter}, avg loss {cost}, acc_top1 {top1}'
                     )
                 result[0].append(cost)
                 result[1].append(top1)
             logging.info(
-                ' avg loss {}, acc_top1 {}'.format(
-                    np.mean(result[0]), np.mean(result[1])
-                )
+                f' avg loss {np.mean(result[0])}, acc_top1 {np.mean(result[1])}'
             )
             return np.mean(result[1])
 
diff --git a/test/quantization/test_weight_quantization_mobilenetv1.py b/test/quantization/test_weight_quantization_mobilenetv1.py
index 0f8001a20edff..5ccee4b6c14bd 100644
--- a/test/quantization/test_weight_quantization_mobilenetv1.py
+++ b/test/quantization/test_weight_quantization_mobilenetv1.py
@@ -67,8 +67,8 @@ def download_model(self, model_name, data_url, data_md5):
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
-            cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(
-                target_folder, zip_path
+            cmd = (
+                f'mkdir {target_folder} && tar xf {zip_path} -C {target_folder}'
             )
             os.system(cmd)
 
diff --git a/test/rnn/rnn_numpy.py b/test/rnn/rnn_numpy.py
index dc115d0734ba5..d1a7ccbf02ecb 100644
--- a/test/rnn/rnn_numpy.py
+++ b/test/rnn/rnn_numpy.py
@@ -482,7 +482,7 @@ def __init__(
         else:
             raise ValueError(
                 "direction should be forward, backward or bidirectional, "
-                "received direction = {}".format(direction)
+                f"received direction = {direction}"
             )
 
         self.input_size = input_size
@@ -526,7 +526,7 @@ def __init__(
         else:
             raise ValueError(
                 "direction should be forward, backward or bidirectional, "
-                "received direction = {}".format(direction)
+                f"received direction = {direction}"
             )
 
         self.input_size = input_size
@@ -570,7 +570,7 @@ def __init__(
         else:
             raise ValueError(
                 "direction should be forward, backward or bidirectional, "
-                "received direction = {}".format(direction)
+                f"received direction = {direction}"
             )
 
         self.input_size = input_size
diff --git a/test/tokenizer/bert_tokenizer.py b/test/tokenizer/bert_tokenizer.py
index ac122e8c709bf..f8c7f4c55293a 100755
--- a/test/tokenizer/bert_tokenizer.py
+++ b/test/tokenizer/bert_tokenizer.py
@@ -327,11 +327,9 @@ def __init__(
     ):
         if not os.path.isfile(vocab_file):
             raise ValueError(
-                "Can't find a vocabulary file at path '{}'. To load the "
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the "
                 "vocabulary from a pretrained model please use "
-                "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    vocab_file
-                )
+                "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
             )
         self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token)
         self.do_lower_case = do_lower_case
diff --git a/test/tokenizer/tokenizer_utils.py b/test/tokenizer/tokenizer_utils.py
index 467d5e7cf0ced..5ce07e29b6d72 100644
--- a/test/tokenizer/tokenizer_utils.py
+++ b/test/tokenizer/tokenizer_utils.py
@@ -564,9 +564,7 @@ def save_pretrained(self, save_directory):
         """
         assert not os.path.isfile(
             save_directory
-        ), "Saving directory ({}) should be a directory, not a file".format(
-            save_directory
-        )
+        ), f"Saving directory ({save_directory}) should be a directory, not a file"
         os.makedirs(save_directory, exist_ok=True)
 
         tokenizer_config_file = os.path.join(
@@ -632,9 +630,7 @@ def __getattr__(self, name):
         elif name.endswith('_token_id'):
             return self.vocab[self.special_tokens_map[name[:-3]]]
         raise AttributeError(
-            "'{}' object has no attribute '{}'".format(
-                type(self).__name__, name
-            )
+            f"'{type(self).__name__}' object has no attribute '{name}'"
         )
 
     def truncate_sequences(
diff --git a/test/xpu/test_generate_proposals_v2_op_xpu.py b/test/xpu/test_generate_proposals_v2_op_xpu.py
index 024d09603b7d9..dca37a4cd2e73 100644
--- a/test/xpu/test_generate_proposals_v2_op_xpu.py
+++ b/test/xpu/test_generate_proposals_v2_op_xpu.py
@@ -105,9 +105,7 @@ def clip_tiled_boxes(boxes, im_shape, pixel_offset=True):
     has shape (N, 4 * num_tiled_boxes)."""
     assert (
         boxes.shape[1] % 4 == 0
-    ), 'boxes.shape[1] is {:d}, but must be divisible by 4.'.format(
-        boxes.shape[1]
-    )
+    ), f'boxes.shape[1] is {boxes.shape[1]:d}, but must be divisible by 4.'
     offset = 1 if pixel_offset else 0
     # x1 >= 0
     boxes[:, 0::4] = np.maximum(
diff --git a/test/xpu/test_tril_triu_op_xpu.py b/test/xpu/test_tril_triu_op_xpu.py
index 15371d894fa8d..c9279a391be7e 100644
--- a/test/xpu/test_tril_triu_op_xpu.py
+++ b/test/xpu/test_tril_triu_op_xpu.py
@@ -134,9 +134,7 @@ def test_errors1(self):
         data = paddle.static.data(shape=(20, 22), dtype='float32', name="data1")
         op_type = np.random.choice(['triu', 'tril'])
         errmsg = {
-            "diagonal: TypeError": "diagonal in {} must be a python Int".format(
-                op_type
-            ),
+            "diagonal: TypeError": f"diagonal in {op_type} must be a python Int",
         }
         expected = list(errmsg.keys())[0]
         with self.assertRaisesRegex(
@@ -149,9 +147,7 @@ def test_errors2(self):
         data = paddle.static.data(shape=(200,), dtype='float32', name="data2")
         op_type = np.random.choice(['triu', 'tril'])
         errmsg = {
-            "input: ValueError": "x shape in {} must be at least 2-D".format(
-                op_type
-            ),
+            "input: ValueError": f"x shape in {op_type} must be at least 2-D",
         }
         expected = list(errmsg.keys())[0]
         with self.assertRaisesRegex(
diff --git a/tools/analysisPyXml.py b/tools/analysisPyXml.py
index f6c86619998aa..2f2d8b472c566 100644
--- a/tools/analysisPyXml.py
+++ b/tools/analysisPyXml.py
@@ -22,16 +22,8 @@
 
 def analysisPyXml(rootPath, ut):
     xml_path = f'{rootPath}/build/pytest/{ut}/python-coverage.xml'
-    related_ut_map_file = '{}/build/ut_map/{}/related_{}.txt'.format(
-        rootPath,
-        ut,
-        ut,
-    )
-    notrelated_ut_map_file = '{}/build/ut_map/{}/notrelated_{}.txt'.format(
-        rootPath,
-        ut,
-        ut,
-    )
+    related_ut_map_file = f'{rootPath}/build/ut_map/{ut}/related_{ut}.txt'
+    notrelated_ut_map_file = f'{rootPath}/build/ut_map/{ut}/notrelated_{ut}.txt'
     tree = ElementTree.parse(xml_path)
     root = tree.getroot()
     error_files = []
diff --git a/tools/check_op_benchmark_result.py b/tools/check_op_benchmark_result.py
index 8db2e850ae290..335f7715489b8 100644
--- a/tools/check_op_benchmark_result.py
+++ b/tools/check_op_benchmark_result.py
@@ -72,7 +72,7 @@ def check_speed_result(case_name, develop_data, pr_data, pr_result):
     develop_gpu_time = develop_data.get("gpu_time")
     if develop_gpu_time != 0.0:
         gpu_time_diff = (pr_gpu_time - develop_gpu_time) / develop_gpu_time
-        gpu_time_diff_str = "{:.5f}".format(gpu_time_diff * 100)
+        gpu_time_diff_str = f"{gpu_time_diff * 100:.5f}"
     else:
         gpu_time_diff = 0
         gpu_time_diff_str = ""
diff --git a/tools/check_op_desc.py b/tools/check_op_desc.py
index 89a5e87af0b45..2eb8df32cc7c0 100644
--- a/tools/check_op_desc.py
+++ b/tools/check_op_desc.py
@@ -343,16 +343,12 @@ def print_desc_error_message(error_message):
 
         for name in Inputs_error.get(QUANT, {}):
             print(
-                " * The added Input '{}' is `quant`, need slim to review.".format(
-                    name
-                )
+                f" * The added Input '{name}' is `quant`, need slim to review."
             )
 
         for name in Inputs_error.get(DEF, {}):
             print(
-                " * The added Input '{}' is `def`, need inference to review.".format(
-                    name
-                )
+                f" * The added Input '{name}' is `def`, need inference to review."
             )
 
         # 2. print outputs error message
@@ -375,16 +371,12 @@ def print_desc_error_message(error_message):
 
         for name in Outputs_error.get(QUANT, {}):
             print(
-                " * The added Output '{}' is `quant`, need slim to review.".format(
-                    name
-                )
+                f" * The added Output '{name}' is `quant`, need slim to review."
             )
 
         for name in Outputs_error.get(DEF, {}):
             print(
-                " * The added Output '{}' is `def`, need inference to review.".format(
-                    name
-                )
+                f" * The added Output '{name}' is `def`, need inference to review."
             )
 
         # 3. print attrs error message
@@ -408,17 +400,13 @@ def print_desc_error_message(error_message):
         for name in attrs_error.get(QUANT, {}):
             # TODO(Wilber):
             print(
-                " * The added attr '{}' is `quant`, need slim to review.".format(
-                    name
-                )
+                f" * The added attr '{name}' is `quant`, need slim to review."
             )
 
         for name in attrs_error.get(DEF, {}):
             # TODO(Wilber):
             print(
-                " * The added attr '{}' is `def`, need inference to review.".format(
-                    name
-                )
+                f" * The added attr '{name}' is `def`, need inference to review."
             )
 
 
@@ -435,22 +423,14 @@ def print_version_error_message(error_message):
         error_list = inputs_error.get(ADD, [])
         if error_list:
             for tup in error_list:
-                print(
-                    " * The added input '{}' is not yet registered.".format(
-                        tup[1]
-                    )
-                )
+                print(f" * The added input '{tup[1]}' is not yet registered.")
 
         # 2. print outputs error message
         outputs_error = error_message.get(op_name, {}).get(OUTPUTS, {})
         error_list = outputs_error.get(ADD, [])
         if error_list:
             for tup in error_list:
-                print(
-                    " * The added output '{}' is not yet registered.".format(
-                        tup[1]
-                    )
-                )
+                print(f" * The added output '{tup[1]}' is not yet registered.")
 
         # 3. print attrs error message
         attrs_error = error_message.get(op_name, {}).get(ATTRS, {})
@@ -458,19 +438,13 @@ def print_version_error_message(error_message):
         if error_list:
             for tup in error_list:
                 print(
-                    " * The added attribute '{}' is not yet registered.".format(
-                        tup[1]
-                    )
+                    f" * The added attribute '{tup[1]}' is not yet registered."
                 )
         error_dic = (
             error_message.get(op_name, {}).get(ATTRS, {}).get(CHANGE, {})
         )
         for key, val in error_dic.items():
-            print(
-                " * The change of attribute '{}' is not yet registered.".format(
-                    key
-                )
-            )
+            print(f" * The change of attribute '{key}' is not yet registered.")
 
 
 def print_repeat_process():
diff --git a/tools/count_api_without_core_ops.py b/tools/count_api_without_core_ops.py
index 90d5e48f5167c..b90bdf0ca0e9b 100644
--- a/tools/count_api_without_core_ops.py
+++ b/tools/count_api_without_core_ops.py
@@ -38,9 +38,7 @@ def md5(doc):
     except UnicodeDecodeError as e:
         md5sum = None
         print(
-            "Error({}) occurred when `md5({})`, discard it.".format(
-                str(e), doc
-            ),
+            f"Error({str(e)}) occurred when `md5({doc})`, discard it.",
             file=sys.stderr,
         )
     return md5sum
@@ -111,9 +109,7 @@ def visit_member(parent_name, member, func):
         return
     else:
         raise RuntimeError(
-            "Unsupported generate signature of member, type {}".format(
-                str(type(member))
-            )
+            f"Unsupported generate signature of member, type {str(type(member))}"
         )
 
 
diff --git a/tools/coverage/gcda_clean.py b/tools/coverage/gcda_clean.py
index 96f5b54a854ee..2abba39636d07 100644
--- a/tools/coverage/gcda_clean.py
+++ b/tools/coverage/gcda_clean.py
@@ -39,11 +39,7 @@ def get_pull(pull_id):
             repo = github.get_repo('PaddlePaddle/Paddle')
         except Exception as e:
             print(e)
-            print(
-                "get_repo error, retry {} times after {} secs.".format(
-                    idx, idx * 10
-                )
-            )
+            print(f"get_repo error, retry {idx} times after {idx * 10} secs.")
         else:
             break
         idx += 1
diff --git a/tools/coverage/python_coverage.py b/tools/coverage/python_coverage.py
index 7132b119b4c0e..b28e3ef08b9d1 100644
--- a/tools/coverage/python_coverage.py
+++ b/tools/coverage/python_coverage.py
@@ -61,20 +61,12 @@
             taken = int(taken)
 
             for _ in range(taken):
-                print(
-                    'BRDA:{},{},{},{}'.format(
-                        line_number, 0, branch_index, line_hits
-                    )
-                )
+                print(f'BRDA:{line_number},{0},{branch_index},{line_hits}')
                 branch_index += 1
 
             if line_missing_branches:
                 for missing_branch in line_missing_branches.split(','):
-                    print(
-                        'BRDA:{},{},{},{}'.format(
-                            line_number, 0, branch_index, 0
-                        )
-                    )
+                    print(f'BRDA:{line_number},{0},{branch_index},{0}')
                     branch_index += 1
 
         print(f'DA:{line_number},{line_hits}')
diff --git a/tools/externalError/spider.py b/tools/externalError/spider.py
index 7f57f81b584e5..a078cd2debbe8 100644
--- a/tools/externalError/spider.py
+++ b/tools/externalError/spider.py
@@ -361,10 +361,7 @@ def handle_data(self, data):
                     status, code, desc = re.split('=|//', line.strip())
                     _Messages = allMessageDesc.messages.add()
                     _Messages.code = int(code.strip(' ,'))
-                    _Messages.message = "'{}'. {}".format(
-                        status.strip(),
-                        desc.strip(),
-                    )
+                    _Messages.message = f"'{status.strip()}'. {desc.strip()}"
 
     CUFFTHTMLParser().feed(html)
 
diff --git a/tools/get_single_test_cov.py b/tools/get_single_test_cov.py
index a111ea61c6c89..a710e7792e4a5 100644
--- a/tools/get_single_test_cov.py
+++ b/tools/get_single_test_cov.py
@@ -67,15 +67,9 @@ def getFNDAFile(rootPath, test):
 
 
 def analysisFNDAFile(rootPath, test):
-    related_ut_map_file = '{}/build/ut_map/{}/related_{}.txt'.format(
-        rootPath,
-        test,
-        test,
-    )
-    notrelated_ut_map_file = '{}/build/ut_map/{}/notrelated_{}.txt'.format(
-        rootPath,
-        test,
-        test,
+    related_ut_map_file = f'{rootPath}/build/ut_map/{test}/related_{test}.txt'
+    notrelated_ut_map_file = (
+        f'{rootPath}/build/ut_map/{test}/notrelated_{test}.txt'
     )
     os.system('touch %s' % related_ut_map_file)
     os.system('touch %s' % notrelated_ut_map_file)
diff --git a/tools/parse_kernel_info.py b/tools/parse_kernel_info.py
index 23106ab0d2ebb..19a70bbb22e33 100644
--- a/tools/parse_kernel_info.py
+++ b/tools/parse_kernel_info.py
@@ -80,7 +80,7 @@ def __str__(self):
                     percent = float(self.num_ops_for_dtypes[dtype]) / float(
                         num_floats
                     )
-                    res += "({:.2f}%)".format(percent * 100)
+                    res += f"({percent * 100:.2f}%)"
                 else:
                     res += f"({0:.2f}%)"
             res += " "
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index cdae91ece7023..ff03a33dc2e85 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -54,9 +54,7 @@ def md5(doc):
     except UnicodeDecodeError as e:
         md5sum = None
         print(
-            "Error({}) occurred when `md5({})`, discard it.".format(
-                str(e), doc
-            ),
+            f"Error({str(e)}) occurred when `md5({doc})`, discard it.",
             file=sys.stderr,
         )
 
@@ -319,9 +317,7 @@ def check_public_api():
             cur_name = module + '.' + member_name
             instance = eval(cur_name)
             doc_md5 = md5(instance.__doc__)
-            member_dict[cur_name] = "({}, ('document', '{}'))".format(
-                cur_name, doc_md5
-            )
+            member_dict[cur_name] = f"({cur_name}, ('document', '{doc_md5}'))"
 
 
 def check_allmodule_callable():
diff --git a/tools/sampcd_processor_utils.py b/tools/sampcd_processor_utils.py
index 922d20ff5c8fb..a9ac35c0a7336 100644
--- a/tools/sampcd_processor_utils.py
+++ b/tools/sampcd_processor_utils.py
@@ -187,7 +187,7 @@ def __init__(self, **kwargs) -> None:
         for name, value in kwargs.items():
             # check attr name
             if not (hasattr(self, name) or name in MetaResult.cls_map()):
-                raise KeyError('`{}` is not a valid result type.'.format(name))
+                raise KeyError(f'`{name}` is not a valid result type.')
 
             setattr(self, name, value)
 
@@ -207,7 +207,7 @@ def state(self) -> Result:
         return self.__unique_state
 
     def __str__(self) -> str:
-        return '{}, running time: {:.3f}s'.format(self.name, self.time)
+        return f'{self.name}, running time: {self.time:.3f}s'
 
 
 class DocTester:
@@ -654,9 +654,7 @@ def check_old_style(docstrings_to_test: typing.Dict[str, str]):
                 codeblock_name = codeblock['name']
                 codeblock_id = codeblock['id']
 
-                docstring_name = '{}:{}'.format(
-                    api_name, codeblock_name or codeblock_id
-                )
+                docstring_name = f'{api_name}:{codeblock_name or codeblock_id}'
 
                 old_style_apis.append(docstring_name)
 
@@ -738,9 +736,7 @@ def get_test_results(
                 docstring = doctester.ensemble_docstring(
                     codeblock=codeblock['codes']
                 )
-                docstring_name = '{}:{}'.format(
-                    api_name, codeblock_name or codeblock_id
-                )
+                docstring_name = f'{api_name}:{codeblock_name or codeblock_id}'
 
                 docstrings_extracted.append(
                     {'name': docstring_name, 'docstring': docstring}

From 4377a448a3d657f9ab0a4ae606085fc16af15e45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?=
 <83450930+Liyulingyue@users.noreply.github.com>
Date: Fri, 22 Sep 2023 10:16:06 +0800
Subject: [PATCH 050/115] [CodeStyle][task 1] enable Ruff UP032 rule in
 `python/paddle/base` (#57408)

* base up032

* update up032

* Apply suggestions from code review
---
 pyproject.toml                                |   1 -
 python/paddle/base/__init__.py                |   4 +-
 python/paddle/base/backward.py                |  12 +-
 python/paddle/base/core.py                    |   2 +-
 python/paddle/base/dygraph/base.py            |   8 +-
 python/paddle/base/executor.py                |  14 +-
 python/paddle/base/framework.py               | 139 +++++++-----------
 .../incubate/checkpoint/auto_checkpoint.py    |  68 ++++-----
 .../incubate/checkpoint/checkpoint_saver.py   |  22 +--
 python/paddle/base/layer_helper.py            |   2 +-
 python/paddle/base/layer_helper_base.py       |  12 +-
 .../base/layers/layer_function_generator.py   |  26 ++--
 python/paddle/base/layers/math_op_patch.py    |  16 +-
 python/paddle/base/param_attr.py              |   2 +-
 python/paddle/base/reader.py                  |   8 +-
 python/paddle/base/trainer_factory.py         |   8 +-
 python/paddle/base/variable_index.py          |  32 ++--
 17 files changed, 135 insertions(+), 241 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 9b247f4a738a9..372895a3f02a1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -105,7 +105,6 @@ ignore = [
 
 # Temporarily ignored
 "python/paddle/base/**" = [
-    "UP032",
     "UP031",
     "C408",
     "UP030",
diff --git a/python/paddle/base/__init__.py b/python/paddle/base/__init__.py
index acc6f9f51ae2f..857d6d58e4718 100644
--- a/python/paddle/base/__init__.py
+++ b/python/paddle/base/__init__.py
@@ -188,10 +188,10 @@ def __bootstrap__():
 
     if num_threads > 1:
         print(
-            'WARNING: OMP_NUM_THREADS set to {0}, not 1. The computation '
+            f'WARNING: OMP_NUM_THREADS set to {num_threads}, not 1. The computation '
             'speed will not be optimized if you use data parallel. It will '
             'fail if this PaddlePaddle binary is compiled with OpenBlas since'
-            ' OpenBlas does not support multi-threads.'.format(num_threads),
+            ' OpenBlas does not support multi-threads.',
             file=sys.stderr,
         )
         print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr)
diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py
index 1f3f67a98b640..a6786848ddb85 100755
--- a/python/paddle/base/backward.py
+++ b/python/paddle/base/backward.py
@@ -995,9 +995,7 @@ def _append_backward_ops_with_checkpoints_(
                 segments.append([min_idx, max_idx + 1])
             else:
                 _logger.info(
-                    "Could not recompute op range [{}] - [{}] ".format(
-                        min_idx, max_idx + 1
-                    )
+                    f"Could not recompute op range [{min_idx}] - [{max_idx + 1}] "
                 )
 
             start_idx += 1
@@ -1008,7 +1006,7 @@ def _append_backward_ops_with_checkpoints_(
         recompute_segments = segments
 
     for i, (idx1, idx2) in enumerate(recompute_segments):
-        _logger.info("recompute segment[{}]".format(i))
+        _logger.info(f"recompute segment[{i}]")
         _logger.info(
             "segment start op: [{}]: [{}]".format(
                 ops[idx1].desc.type(), ops[idx1].desc.input_arg_names()
@@ -1019,7 +1017,7 @@ def _append_backward_ops_with_checkpoints_(
                 ops[idx2 - 1].desc.type(), ops[idx2 - 1].desc.input_arg_names()
             )
         )
-        _logger.info("recompute segment[{}]".format(i))
+        _logger.info(f"recompute segment[{i}]")
         _logger.info(
             "segment start op: [{}]: [{}]".format(
                 ops[idx1].desc.type(), ops[idx1].desc.input_arg_names()
@@ -2193,9 +2191,7 @@ def append_backward(
         grad_block = grad_info[1]
         if not grad_block.has_var(grad_info[0]):
             raise ValueError(
-                "grad block[{0}] did not have grad var {1}".format(
-                    grad_info[1], grad_info[0]
-                )
+                f"grad block[{grad_info[1]}] did not have grad var {grad_info[0]}"
             )
         # Get the param var from the global block
         param_var = program.global_block().var(param)
diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py
index 285a9f1b1a61b..158e556cd1afe 100644
--- a/python/paddle/base/core.py
+++ b/python/paddle/base/core.py
@@ -210,7 +210,7 @@ def load_dso(dso_absolute_path):
 
             cdll.LoadLibrary(dso_absolute_path)
         except:
-            warnings.warn("Load {} failed".format(dso_absolute_path))
+            warnings.warn(f"Load {dso_absolute_path} failed")
 
 
 def pre_load(dso_name):
diff --git a/python/paddle/base/dygraph/base.py b/python/paddle/base/dygraph/base.py
index d85fc8ca25bf7..22a63ff3c0190 100644
--- a/python/paddle/base/dygraph/base.py
+++ b/python/paddle/base/dygraph/base.py
@@ -747,19 +747,19 @@ def test_dygraph_grad(grad_outputs=None):
         return gradients(outputs, inputs, grad_outputs, no_grad_vars)
 
     def check_in_out(in_out_list, name):
-        assert in_out_list is not None, "{} should not be None".format(name)
+        assert in_out_list is not None, f"{name} should not be None"
 
         if isinstance(in_out_list, (list, tuple)):
-            assert len(in_out_list) > 0, "{} cannot be empty".format(name)
+            assert len(in_out_list) > 0, f"{name} cannot be empty"
             for each_var in in_out_list:
                 assert isinstance(
                     each_var, core.eager.Tensor
-                ), "Elements of {} must be Tensor".format(name)
+                ), f"Elements of {name} must be Tensor"
             return in_out_list
         else:
             assert isinstance(
                 in_out_list, core.eager.Tensor
-            ), "{} must be Tensor or list of Tensor".format(name)
+            ), f"{name} must be Tensor or list of Tensor"
             return [in_out_list]
 
     outputs = check_in_out(outputs, 'outputs')
diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py
index 0921d7b79d14b..9fd421f54cb4c 100755
--- a/python/paddle/base/executor.py
+++ b/python/paddle/base/executor.py
@@ -340,9 +340,7 @@ def has_feed_operators(block, feed_targets, feed_holder_name):
             feed_target_name = op.desc.output('Out')[0]
             if feed_target_name not in feed_targets:
                 raise Exception(
-                    "'feed_targets' does not have {} variable".format(
-                        feed_target_name
-                    )
+                    f"'feed_targets' does not have {feed_target_name} variable"
                 )
         else:
             break
@@ -387,9 +385,7 @@ def has_fetch_operators(
                 var.desc.name() for var in fetch_targets
             ]:
                 raise Exception(
-                    "'fetch_targets' does not have {} variable".format(
-                        fetch_target_name
-                    )
+                    f"'fetch_targets' does not have {fetch_target_name} variable"
                 )
             idx = op.desc.attr('col')
             assert fetch_target_name == fetch_targets[idx].desc.name()
@@ -710,9 +706,7 @@ def _as_lodtensor(data, place, dtype=None):
             data = data.astype(dtype)
         else:
             raise TypeError(
-                "Convert data of type {} to Tensor is not supported".format(
-                    type(data)
-                )
+                f"Convert data of type {type(data)} to Tensor is not supported"
             )
 
     # convert numpy.ndarray to tensor
@@ -752,7 +746,7 @@ def __init__(self, var_dict=None, period_secs=60):
     def handler(self, res_dict):
         for key in res_dict:
             if type(res_dict[key]) is np.ndarray:
-                sys.stdout.write("{}[0]: {} ".format(key, res_dict[key][0]))
+                sys.stdout.write(f"{key}[0]: {res_dict[key][0]} ")
         sys.stdout.write("\n")
 
     @staticmethod
diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py
index 83f3ee734b8f2..3c99297c20875 100644
--- a/python/paddle/base/framework.py
+++ b/python/paddle/base/framework.py
@@ -1235,9 +1235,7 @@ def _debug_string_(proto, throw_on_error=True):
     error_fields = list()
     if not proto.IsInitialized(error_fields) and throw_on_error:
         raise ValueError(
-            "{0} are not initialized.\nThe message is {1}:\n".format(
-                error_fields, proto
-            )
+            f"{error_fields} are not initialized.\nThe message is {proto}:\n"
         )
     return proto.__str__()
 
@@ -1296,7 +1294,7 @@ def wrap_as_scalar(number):
         # it is a numpy scalar
         return core.Scalar(number.item())
     else:
-        raise TypeError("Cannot wrap {} as core.Scalar".format(number))
+        raise TypeError(f"Cannot wrap {number} as core.Scalar")
 
 
 def wrap_as_scalars(array):
@@ -1486,9 +1484,9 @@ def __init__(
             self.desc.set_type(type)
         elif self.desc.type() != type:
             raise ValueError(
-                "Variable '{0}' has been created before. The "
-                "previous type is {1}, the new type is {2}. They"
-                " are not matched".format(self.name, self.desc.type(), type)
+                f"Variable '{self.name}' has been created before. The "
+                f"previous type is {self.desc.type()}, the new type is {type}. They"
+                " are not matched"
             )
 
         if shape is not None:
@@ -1499,9 +1497,9 @@ def __init__(
                 shape = tuple(shape)
                 if shape != old_shape:
                     raise ValueError(
-                        "Variable '{0}' has been created before. The previous "
-                        "shape is {1}, the new shape is {2}. They are not "
-                        "matched.".format(self.name, old_shape, shape)
+                        f"Variable '{self.name}' has been created before. The previous "
+                        f"shape is {old_shape}, the new shape is {shape}. They are not "
+                        "matched."
                     )
         if dtype is not None:
             if is_new_var:
@@ -1510,10 +1508,10 @@ def __init__(
                 old_dtype = self.dtype
                 if dtype != old_dtype:
                     raise ValueError(
-                        "Variable '{0}' has been created before. "
-                        "The previous data type is {1}, the new "
-                        "data type is {2}. They are not "
-                        "matched.".format(self.name, old_dtype, dtype)
+                        f"Variable '{self.name}' has been created before. "
+                        f"The previous data type is {old_dtype}, the new "
+                        f"data type is {dtype}. They are not "
+                        "matched."
                     )
 
         if lod_level is not None:
@@ -1522,10 +1520,10 @@ def __init__(
             else:
                 if lod_level != self.lod_level:
                     raise ValueError(
-                        "Variable '{0}' has been created before. "
-                        "The previous lod_level is {1}, the new "
-                        "lod_level is {2}. They are not "
-                        "matched".format(self.name, self.lod_level, lod_level)
+                        f"Variable '{self.name}' has been created before. "
+                        f"The previous lod_level is {self.lod_level}, the new "
+                        f"lod_level is {lod_level}. They are not "
+                        "matched"
                     )
         if persistable is not None:
             if is_new_var:
@@ -1533,11 +1531,9 @@ def __init__(
             else:
                 if persistable != self.persistable:
                     raise ValueError(
-                        "Variable '{0}' has been created before."
-                        "The previous persistable is {1}, the new "
-                        "persistable is {2}. They are not matched".format(
-                            self.name, self.persistable, persistable
-                        )
+                        f"Variable '{self.name}' has been created before."
+                        f"The previous persistable is {self.persistable}, the new "
+                        f"persistable is {persistable}. They are not matched"
                     )
 
         if need_check_feed and is_new_var:
@@ -1832,7 +1828,7 @@ def _to_readable_code(self):
                 stop_gradient=self.stop_gradient,
             )
         else:
-            var_str = "{name} : {type})".format(name=self.name, type=type_str)
+            var_str = f"{self.name} : {type_str})"
 
         if self.is_parameter:
             if self.trainable:
@@ -2549,7 +2545,7 @@ def get_value(self, scope=None):
         var_temp = scope.find_var(self.name)
         if var_temp is None:
             raise ValueError(
-                "Can not find Variable '{}' in the Scope.".format(self.name)
+                f"Can not find Variable '{self.name}' in the Scope."
             )
         t = var_temp.get_tensor()
         return t
@@ -2624,7 +2620,7 @@ def set_value(self, value, scope=None):
         var_temp = scope.find_var(self.name)
         if var_temp is None:
             raise ValueError(
-                "Can not find Variable '{}' in the Scope.".format(self.name)
+                f"Can not find Variable '{self.name}' in the Scope."
             )
 
         t = var_temp.get_tensor()
@@ -2982,13 +2978,9 @@ def __init__(
                 op_attrs[callstack_var_name] = []
                 for frame in traceback.extract_stack():
                     op_attrs[callstack_var_name].append(
-                        '  File "{}", line {}, in {}'.format(
-                            frame[0], frame[1], frame[2]
-                        )
-                    )
-                    op_attrs[callstack_var_name].append(
-                        '    {}'.format(frame[3])
+                        f'  File "{frame[0]}", line {frame[1]}, in {frame[2]}'
                     )
+                    op_attrs[callstack_var_name].append(f'    {frame[3]}')
 
             self.desc.set_type(type)
             proto = OpProtoHolder.instance().get_op_proto(type)
@@ -3036,7 +3028,7 @@ def find_name(var_list, name):
                     found = find_name(inputs, in_proto.name)
                     assert (
                         found or in_proto.dispensable
-                    ), "Input {} not found".format(in_proto.name)
+                    ), f"Input {in_proto.name} not found"
                     if found:
                         in_args = inputs[in_proto.name]
                         if not isinstance(in_args, (list, tuple)):
@@ -3249,18 +3241,18 @@ def _to_readable_code(self, skip_op_callstack=True):
         )
         outputs_str = "{"
         for i in range(0, len(self.output_names)):
-            outputs_str += "{name}=".format(name=self.output_names[i])
+            outputs_str += f"{self.output_names[i]}="
             o = self.output(self.output_names[i])
-            outputs_str += "{value}".format(value=o)
+            outputs_str += f"{o}"
             if i != len(self.output_names) - 1:
                 outputs_str += ", "
         outputs_str += "}"
 
         inputs_str = "{"
         for i in range(0, len(self.input_names)):
-            inputs_str += "{name}=".format(name=self.input_names[i])
+            inputs_str += f"{self.input_names[i]}="
             o = self.input(self.input_names[i])
-            inputs_str += "{value}".format(value=o)
+            inputs_str += f"{o}"
 
             if i != len(self.input_names) - 1:
                 inputs_str += ", "
@@ -3276,9 +3268,7 @@ def _to_readable_code(self, skip_op_callstack=True):
             attr_type = self.desc.attr_type(name, True)
             if attr_type == core.AttrType.VAR:
                 attr_var_name = self.desc.attr(name, True).name()
-                a = "{name} = Var['{value}']".format(
-                    name=name, value=attr_var_name
-                )
+                a = f"{name} = Var['{attr_var_name}']"
                 attrs_str += a
                 if i != len(attr_names) - 1:
                     attrs_str += ", "
@@ -3297,18 +3287,14 @@ def _to_readable_code(self, skip_op_callstack=True):
                 continue
 
             if attr_type == core.AttrType.BLOCK:
-                a = "{name} = block[{value}]".format(
-                    name=name, value=self._block_attr_id(name)
-                )
+                a = f"{name} = block[{self._block_attr_id(name)}]"
                 attrs_str += a
                 if i != len(attr_names) - 1:
                     attrs_str += ", "
                 continue
 
             if attr_type == core.AttrType.BLOCKS:
-                a = "{name} = blocks{value}".format(
-                    name=name, value=self._blocks_attr_ids(name)
-                )
+                a = f"{name} = blocks{self._blocks_attr_ids(name)}"
                 attrs_str += a
                 if i != len(attr_names) - 1:
                     attrs_str += ", "
@@ -3331,7 +3317,7 @@ def _to_readable_code(self, skip_op_callstack=True):
             else:
                 value = self.desc.attr(name)
 
-            a = "{name} = {value}".format(name=name, value=value)
+            a = f"{name} = {value}"
 
             attrs_str += a
             if i != len(attr_names) - 1:
@@ -3349,16 +3335,11 @@ def _to_readable_code(self, skip_op_callstack=True):
             )
 
         if outputs_str != "{}":
-            op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".format(
-                outputs=outputs_str,
-                op_type=self.type,
-                inputs=inputs_str,
-                attrs=attrs_str,
+            op_str = (
+                f"{outputs_str} = {self.type}(inputs={inputs_str}, {attrs_str})"
             )
         else:
-            op_str = "{op_type}(inputs={inputs}, {attrs})".format(
-                op_type=self.type, inputs=inputs_str, attrs=attrs_str
-            )
+            op_str = f"{self.type}(inputs={inputs_str}, {attrs_str})"
         return op_str
 
     def __str__(self):
@@ -3641,9 +3622,7 @@ def _var_attr(self, name):
         attr_type = self.desc.attr_type(name, True)
         assert (
             attr_type == core.AttrType.VAR
-        ), "Required type attr({}) is Variable, but received {}".format(
-            name, attr_type
-        )
+        ), f"Required type attr({name}) is Variable, but received {attr_type}"
         attr_var_name = self.desc.attr(name, True).name()
         return self.block._var_recursive(attr_var_name)
 
@@ -3660,9 +3639,7 @@ def _vars_attr(self, name):
         attr_type = self.desc.attr_type(name, True)
         assert (
             attr_type == core.AttrType.VARS
-        ), "Required type attr({}) is list[Variable], but received {}".format(
-            name, attr_type
-        )
+        ), f"Required type attr({name}) is list[Variable], but received {attr_type}"
         attr_vars = [
             self.block._var_recursive(var.name())
             for var in self.desc.attr(name, True)
@@ -4033,14 +4010,12 @@ def _to_readable_code(self, skip_op_callstack=True):
             type(skip_op_callstack)
         )
         block_str = "{ // block "
-        block_str += "{}\n".format(self.idx)
+        block_str += f"{self.idx}\n"
         for var in list(self.vars.values()):
-            block_str += "    {}\n".format(var._to_readable_code())
+            block_str += f"    {var._to_readable_code()}\n"
         block_str += "\n"
         for op in self.ops:
-            block_str += "    {}\n".format(
-                op._to_readable_code(skip_op_callstack)
-            )
+            block_str += f"    {op._to_readable_code(skip_op_callstack)}\n"
         block_str += "}"
         return block_str
 
@@ -4194,7 +4169,7 @@ def _var_recursive(self, name):
         if var:
             return var
         else:
-            raise ValueError("Var {0} is not found recursively".format(name))
+            raise ValueError(f"Var {name} is not found recursively")
 
     def all_parameters(self):
         return list(self.iter_parameters())
@@ -5547,9 +5522,7 @@ def _convert_to_pdf(dot_file_path):
             )
             if exited_code != 0:
                 print('The dot command is needed for creating pdf files.')
-                print(
-                    'The {} is saved as the dot filetype.'.format(dot_file_path)
-                )
+                print(f'The {dot_file_path} is saved as the dot filetype.')
 
         remove_ctr_vars = set()
         if remove_ctr_var:
@@ -5557,7 +5530,7 @@ def _convert_to_pdf(dot_file_path):
                 if node.is_ctrl_var():
                     remove_ctr_vars.add(node)
             self.safe_remove_nodes(remove_ctr_vars)
-        print('Total ops num = {}.'.format(len(self.all_op_nodes())))
+        print(f'Total ops num = {len(self.all_op_nodes())}.')
 
         if marked_nodes is not None:
             if not isinstance(marked_nodes, set):
@@ -7124,9 +7097,7 @@ def state_dict(self, mode='all', scope=None):
 
         if not isinstance(mode, str):
             raise TypeError(
-                "Type of `mode` should be string, but received {}.".format(
-                    type(mode)
-                )
+                f"Type of `mode` should be string, but received {type(mode)}."
             )
 
         def is_parameter(var):
@@ -7219,9 +7190,7 @@ def set_state_dict(self, state_dict, scope=None):
 
         if not isinstance(state_dict, dict):
             raise TypeError(
-                "Type of `state_dict` should be dict, but received {}.".format(
-                    type(state_dict)
-                )
+                f"Type of `state_dict` should be dict, but received {type(state_dict)}."
             )
 
         vars_dict = {var.name: var for var in self.list_vars()}
@@ -7238,18 +7207,12 @@ def set_state_dict(self, state_dict, scope=None):
                 try:
                     vars_dict[name].set_value(value, scope)
                 except ValueError as err:
-                    warnings.warn(
-                        "Skip loading for '{}'. ".format(name) + str(err)
-                    )
+                    warnings.warn(f"Skip loading for '{name}'. " + str(err))
                 except TypeError as err:
-                    warnings.warn(
-                        "Skip loading for '{}'. ".format(name) + str(err)
-                    )
+                    warnings.warn(f"Skip loading for '{name}'. " + str(err))
             else:
                 warnings.warn(
-                    "Skip loading for '{0}'. Because '{0}' not in the program.".format(
-                        name
-                    )
+                    f"Skip loading for '{name}'. Because '{name}' not in the program."
                 )
 
 
@@ -7512,9 +7475,7 @@ def __str__(self):
                  [-0.70368278,  0.52986908, -0.68742192],
                  [-0.54217887,  0.48439729,  0.34082305]])
         """
-        return "Parameter containing:\n{tensor}".format(
-            tensor=super().__str__()
-        )
+        return f"Parameter containing:\n{super().__str__()}"
 
     def __deepcopy__(self, memo):
         """
diff --git a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py
index e8f75f3a4ed55..9bf737fb055dc 100644
--- a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py
+++ b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py
@@ -115,21 +115,17 @@ def __init__(self):
                 ), "hdfs environ must set"
 
         except Exception as e:
-            logger.fatal("exception:{}".format(e))
+            logger.fatal(f"exception:{e}")
             sys.exit(1)
 
     def get_range_checkpoint_path(self, name):
-        return "{}/{}/range/{}".format(
-            self.hdfs_checkpoint_path, self.job_id, name
-        )
+        return f"{self.hdfs_checkpoint_path}/{self.job_id}/range/{name}"
 
     def get_exe_checkpoint_path(self, name):
-        return "{}/{}/exe/{}".format(
-            self.hdfs_checkpoint_path, self.job_id, name
-        )
+        return f"{self.hdfs_checkpoint_path}/{self.job_id}/exe/{name}"
 
     def get_job_path(self):
-        return "{}/{}".format(self.hdfs_checkpoint_path, self.job_id)
+        return f"{self.hdfs_checkpoint_path}/{self.job_id}"
 
     @property
     def save_checkpoint_inter(self):
@@ -235,7 +231,7 @@ def __ne__(self, t):
         return not self == t
 
     def serialize(self, path):
-        file_name = "{}/{}".format(path, self._file_name)
+        file_name = f"{path}/{self._file_name}"
         with open(file_name, 'w') as f:
             s = self._serialize()
             f.write(s)
@@ -248,7 +244,7 @@ def _serialize(self, pop_keys=["restored_from"]):
 
     def deserialize(self, path):
         d = None
-        file_name = "{}/{}".format(path, self._file_name)
+        file_name = f"{path}/{self._file_name}"
         with open(file_name, 'r') as f:
             s = f.read()
             self._deserialize(s)
@@ -297,7 +293,7 @@ def __init__(
             self._save_checkpoint_inter = self._checker.save_checkpoint_inter
         assert (
             self._save_checkpoint_inter >= 0
-        ), "checkpointer:{} must >=0".format(self._save_checkpoint_inter)
+        ), f"checkpointer:{self._save_checkpoint_inter} must >=0"
         self._last_checkpoint_time = time.time()
 
         self._load_cp_nos = None
@@ -344,7 +340,7 @@ def _look_for_valid(self, cp_nos):
                 local_cache_path=self._checker._fs_cache,
             )
             cps.append(t)
-            logger.debug("look for valid:{} t:{}".format(i, t._serialize()))
+            logger.debug(f"look for valid:{i} t:{t._serialize()}")
             if epoch_no < 0:
                 epoch_no = t._epoch_no
             else:
@@ -354,7 +350,7 @@ def _look_for_valid(self, cp_nos):
 
     def _get_last_valid_checkpoint(self):
         self._load_cp_nos = self._cper.get_checkpoint_no(self._checkpoint_path)
-        logger.info("find checkpoint nos:{}".format(self._load_cp_nos))
+        logger.info(f"find checkpoint nos:{self._load_cp_nos}")
 
         if len(self._load_cp_nos) < 1:
             self._restored_from = CONST_MEMORYINIT
@@ -371,9 +367,7 @@ def _get_last_valid_checkpoint(self):
             self._restored_from = CONST_CHECKPOINT
             self._checkpoint_epoch_no = self._epoch_no
 
-            logger.info(
-                "load tain_epoch_range checkpoint:{}".format(self._serialize())
-            )
+            logger.info(f"load tain_epoch_range checkpoint:{self._serialize()}")
 
         elif g_acp_type == CONST_DACP_TYPE:
             t, i = self._look_for_valid(self._load_cp_nos)
@@ -391,11 +385,9 @@ def _get_last_valid_checkpoint(self):
 
             self._restored_from = CONST_CHECKPOINT
             self._checkpoint_epoch_no = self._epoch_no
-            logger.info(
-                "load tain_epoch_range checkpoint:{}".format(self._serialize())
-            )
+            logger.info(f"load tain_epoch_range checkpoint:{self._serialize()}")
         else:
-            raise AssertionError("not supported acp_type:{}".format(g_acp_type))
+            raise AssertionError(f"not supported acp_type:{g_acp_type}")
 
     def _to_dict(self):
         d = {
@@ -416,7 +408,7 @@ def name(self):
         return self._name
 
     def serialize(self, path):
-        file_name = "{}/{}".format(path, self._file_name)
+        file_name = f"{path}/{self._file_name}"
         with open(file_name, 'w') as f:
             s = self._serialize()
             f.write(s)
@@ -440,7 +432,7 @@ def restored_from(self):
 
     def deserialize(self, path):
         d = None
-        file_name = "{}/{}".format(path, self._file_name)
+        file_name = f"{path}/{self._file_name}"
         with open(file_name, 'r') as f:
             d = json.load(f)
 
@@ -463,16 +455,14 @@ def next(self):
         if self._max_epoch_num < 0:
             self._max_epoch_num = sys.maxint
 
-        assert self._epoch_no >= -1, "self._epoch_no:{} must >=-1".format(
-            self._epoch_no
-        )
+        assert (
+            self._epoch_no >= -1
+        ), f"self._epoch_no:{self._epoch_no} must >=-1"
 
         self._last_checkpoint_time = time.time()
         start = self._epoch_no + 1
         logger.info(
-            "started epoch_no:{} max_epoch_num:{}".format(
-                start, self._max_epoch_num
-            )
+            f"started epoch_no:{start} max_epoch_num:{self._max_epoch_num}"
         )
 
         for i in range(start, self._max_epoch_num):
@@ -501,9 +491,7 @@ def save_checkpoint(self):
                 elif g_acp_type == CONST_DACP_TYPE:
                     self._save_checkpoint()
                 else:
-                    raise AssertionError(
-                        "not supported acp_type:{}".format(g_acp_type)
-                    )
+                    raise AssertionError("not supported acp_type:{g_acp_type}")
             self._last_checkpoint_time = time.time()
 
     def _save_checkpoint(self):
@@ -531,7 +519,7 @@ def _save_checkpoint(self):
 
             e[t._key] = t
 
-            logger.debug("save executor checkpoint:{}".format(t._serialize()))
+            logger.debug(f"save executor checkpoint:{t._serialize()}")
 
         if len(self._exe_status) > 0:
             self._cper.save_checkpoint(
@@ -540,7 +528,7 @@ def _save_checkpoint(self):
                 local_cache_path=self._checker._fs_cache,
             )
             logger.info(
-                "save train_epoch_range checkpoint:{}".format(self._serialize())
+                f"save train_epoch_range checkpoint:{self._serialize()}"
             )
 
             self._generate_flag()
@@ -607,9 +595,7 @@ def _can_auto_checkpoint(prog):
         g_program_attr[program._auto_checkpoint_name] = ret
         if not ret:
             logger.debug(
-                "program {} need't to auto checkpoint".format(
-                    program._auto_checkpoint_name
-                )
+                f"program {program._auto_checkpoint_name} need't to auto checkpoint"
             )
             return False
 
@@ -617,7 +603,7 @@ def _can_auto_checkpoint(prog):
 
 
 def _get_running_key(exe_name, program_name):
-    return "{}_{}".format(exe_name, program_name)
+    return f"{exe_name}_{program_name}"
 
 
 def _get_checker():
@@ -653,7 +639,7 @@ def train_epoch_range(max_epoch_num, save_checkpoint_inter=None):
         return
 
     g_acp_type = CONST_ACP_TYPE
-    logger.info("acp_type:{}".format(g_acp_type))
+    logger.info(f"acp_type:{g_acp_type}")
 
     global g_train_epoch_range
     try:
@@ -694,9 +680,7 @@ def _auto_checkpoint(exe, prog):
     if g_train_epoch_range.restored_from == CONST_CHECKPOINT:
         assert (
             key in exe_status
-        ), "when restored key:{} must be in train_epoch_range:{}".format(
-            key, g_train_epoch_range
-        )
+        ), f"when restored key:{key} must be in train_epoch_range:{g_train_epoch_range}"
 
     t = None
     if key in exe_status:
@@ -712,7 +696,7 @@ def _auto_checkpoint(exe, prog):
                 local_cache_path=g_checker._fs_cache,
             )
             t._restored_from = CONST_CHECKPOINT
-            logger.info("load executor checkpoint {}".format(t))
+            logger.info(f"load executor checkpoint {t}")
         t._exe = exe
         t._program = program
         t._epoch_no = g_train_epoch_range.get()
diff --git a/python/paddle/base/incubate/checkpoint/checkpoint_saver.py b/python/paddle/base/incubate/checkpoint/checkpoint_saver.py
index 0b113c2b87fc8..b597cf9c37f2f 100644
--- a/python/paddle/base/incubate/checkpoint/checkpoint_saver.py
+++ b/python/paddle/base/incubate/checkpoint/checkpoint_saver.py
@@ -69,17 +69,15 @@ def save_checkpoint(
         if not self._fs.is_exist(path):
             self._fs.mkdirs(path)
         else:
-            assert self._fs.is_dir(path), "path:{} must be a directory".format(
-                path
-            )
+            assert self._fs.is_dir(path), f"path:{path} must be a directory"
 
         max_no = self._get_last_checkpoint_no(path)
         if max_no < 0:
             max_no = -1
         max_no += 1
 
-        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no)
-        tmp_path = "{}.tmp".format(real_path)
+        real_path = f"{path}/{self._checkpoint_prefix}.{max_no}"
+        tmp_path = f"{real_path}.tmp"
         saved_path = tmp_path
 
         from paddle.distributed.fleet.utils.fs import LocalFS
@@ -93,14 +91,14 @@ def save_checkpoint(
             )
 
             if trainer_id is not None:
-                cache_path = "{}.{}".format(cache_path, trainer_id)
+                cache_path = f"{cache_path}.{trainer_id}"
 
             if not local_fs.is_exist(cache_path):
                 local_fs.mkdirs(cache_path)
             else:
                 assert local_fs.is_dir(
                     cache_path
-                ), "cache path:{} must be a directory".format(cache_path)
+                ), f"cache path:{cache_path} must be a directory"
 
             saved_path = cache_path
 
@@ -151,16 +149,14 @@ def load_checkpoint(
             )
 
             if trainer_id is not None:
-                cache_path = "{}.{}".format(cache_path, trainer_id)
+                cache_path = f"{cache_path}.{trainer_id}"
 
             if not local_fs.is_exist(local_cache_path):
                 local_fs.mkdirs(local_cache_path)
             if local_fs.is_exist(cache_path):
                 local_fs.delete(cache_path)
 
-        real_path = "{}/{}.{}".format(
-            path, self._checkpoint_prefix, checkpoint_no
-        )
+        real_path = f"{path}/{self._checkpoint_prefix}.{checkpoint_no}"
         load_path = real_path
         if self._fs.need_upload_download():
             self._fs.download(real_path, cache_path)
@@ -225,9 +221,7 @@ def clean_redundant_checkpoints(self, root_path, reserved=[]):
             try:
                 n = int(g[1])
                 if n not in s:
-                    path = "{}/{}.{}".format(
-                        root_path, self._checkpoint_prefix, n
-                    )
+                    path = f"{root_path}/{self._checkpoint_prefix}.{n}"
                     self._fs.delete(path)
             except Exception as e:
                 print(e)
diff --git a/python/paddle/base/layer_helper.py b/python/paddle/base/layer_helper.py
index 312eaf67a3320..333b176337a95 100644
--- a/python/paddle/base/layer_helper.py
+++ b/python/paddle/base/layer_helper.py
@@ -56,7 +56,7 @@ def multiple_input(self, input_param_name='input'):
     def input(self, input_param_name='input'):
         inputs = self.multiple_input(input_param_name)
         if len(inputs) != 1:
-            raise "{0} layer only takes one input".format(self.layer_type)
+            raise f"{self.layer_type} layer only takes one input"
         return inputs[0]
 
     @property
diff --git a/python/paddle/base/layer_helper_base.py b/python/paddle/base/layer_helper_base.py
index 6c047c08766fe..51680a1abbc4e 100644
--- a/python/paddle/base/layer_helper_base.py
+++ b/python/paddle/base/layer_helper_base.py
@@ -291,12 +291,12 @@ def __weight_normalize(g, v, dim):
         g_param = self.startup_program.global_block().create_parameter(
             dtype=dtype,
             shape=g_param_shape,
-            **g_param_attr._to_kwargs(with_initializer=False)
+            **g_param_attr._to_kwargs(with_initializer=False),
         )
         v_param = self.startup_program.global_block().create_parameter(
             dtype=dtype,
             shape=v_param_shape,
-            **v_param_attr._to_kwargs(with_initializer=True)
+            **v_param_attr._to_kwargs(with_initializer=True),
         )
         __norm_except_dim(
             x=v_param,
@@ -354,7 +354,7 @@ def create_parameter(
         for i, size in enumerate(shape):
             assert size > 0, (
                 "Expected every dim's size to be larger than 0, "
-                "but the size of the {}-th dim is {}".format(i, size)
+                f"but the size of the {i}-th dim is {size}"
             )
         # set global dtype
         if not dtype:
@@ -430,20 +430,20 @@ def create_parameter(
                 shape=shape,
                 type=type,
                 stop_gradient=stop_gradient,
-                **attr._to_kwargs(with_initializer=True)
+                **attr._to_kwargs(with_initializer=True),
             )
         else:
             if in_pir_mode():
                 return paddle.ir.core.create_parameter(
                     dtype=dtype,
                     shape=shape,
-                    **attr._to_kwargs(with_initializer=True)
+                    **attr._to_kwargs(with_initializer=True),
                 )
             self.startup_program.global_block().create_parameter(
                 dtype=dtype,
                 shape=shape,
                 type=type,
-                **attr._to_kwargs(with_initializer=True)
+                **attr._to_kwargs(with_initializer=True),
             )
             return self.main_program.global_block().create_parameter(
                 dtype=dtype, shape=shape, type=type, **attr._to_kwargs()
diff --git a/python/paddle/base/layers/layer_function_generator.py b/python/paddle/base/layers/layer_function_generator.py
index bd11a412ffc5b..82db72f7c4ce5 100644
--- a/python/paddle/base/layers/layer_function_generator.py
+++ b/python/paddle/base/layers/layer_function_generator.py
@@ -87,7 +87,7 @@ def _generate_doc_string_(
     buf.write(escape_math(op_proto.comment))
     buf.write('\nArgs:\n')
     for each_input in op_proto.inputs:
-        line_begin = '    {0}'.format(_convert_(each_input.name))
+        line_begin = f'    {_convert_(each_input.name)}'
         buf.write(line_begin)
         buf.write(" (Tensor): ")
         buf.write(escape_math(each_input.comment))
@@ -156,7 +156,7 @@ def generate_layer_fn(op_type):
     if len(not_intermediate_outputs) != 1:
         raise ValueError(
             "Only one non intermediate output operator can be",
-            "automatically generated. {0}".format(op_type),
+            f"automatically generated. {op_type}",
         )
 
     if not_intermediate_outputs[0].duplicable:
@@ -193,9 +193,7 @@ def infer_and_check_dtype(op_proto, *args, **kwargs):
 
             for each in val:
                 if not isinstance(each, Variable):
-                    raise ValueError(
-                        "input of {0} must be variable".format(op_type)
-                    )
+                    raise ValueError(f"input of {op_type} must be variable")
 
                 if dtype is None:
                     dtype = each.dtype
@@ -402,23 +400,17 @@ def __impl__(func):
         args = {"comment": trim_ending_dot(comment)}
         for each_input in op_proto.inputs:
             input_name = _convert_(each_input.name)
-            args["{0}_comment".format(input_name)] = trim_ending_dot(
-                each_input.comment
-            )
-            args["{0}_type".format(input_name)] = "Variable"
+            args[f"{input_name}_comment"] = trim_ending_dot(each_input.comment)
+            args[f"{input_name}_type"] = "Variable"
         for each_attr in op_proto.attrs:
             input_name = _convert_(each_attr.name)
-            args["{0}_comment".format(input_name)] = trim_ending_dot(
-                each_attr.comment
-            )
-            args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type)
+            args[f"{input_name}_comment"] = trim_ending_dot(each_attr.comment)
+            args[f"{input_name}_type"] = _type_to_str_(each_attr.type)
 
         for each_opt in op_proto.outputs:
             output_name = _convert_(each_opt.name)
-            args["{0}_comment".format(output_name)] = trim_ending_dot(
-                each_opt.comment
-            )
-            args["{0}_type".format(output_name)] = "Variable"
+            args[f"{output_name}_comment"] = trim_ending_dot(each_opt.comment)
+            args[f"{output_name}_type"] = "Variable"
         func.__doc__ = tmpl.substitute(args)
         return func
 
diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py
index 53f35939b1f3a..cba6b9a3b55de 100644
--- a/python/paddle/base/layers/math_op_patch.py
+++ b/python/paddle/base/layers/math_op_patch.py
@@ -314,9 +314,7 @@ def append(self, var):
                 var = to_tensor(var)
             else:
                 raise TypeError(
-                    "Required input var should be Variable, but received {}".format(
-                        type(var)
-                    )
+                    f"Required input var should be Variable, but received {type(var)}"
                 )
         if self.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
             raise TypeError(
@@ -336,9 +334,7 @@ def _item(self):
         """
         if len(self.shape) > 1:
             raise TypeError(
-                "Required input var should be 1-D Variable, but received {}".format(
-                    self.shape
-                )
+                f"Required input var should be 1-D Variable, but received {self.shape}"
             )
         return self
 
@@ -575,17 +571,15 @@ def __impl__(self, other_var):
 
         comment = OpProtoHolder.instance().get_op_proto(op_type).comment
 
-        __impl__.__doc__ = """
-        {0}
+        __impl__.__doc__ = f"""
+        {comment}
         Args:
             self(Variable): left hand variable
             other_var(Variable|float|int): right hand variable
 
         Returns:
             Variable
-        """.format(
-            comment
-        )
+        """
         __impl__.__name__ = method_name
         return __impl__
 
diff --git a/python/paddle/base/param_attr.py b/python/paddle/base/param_attr.py
index 674c4ad4328c5..75064a449db38 100644
--- a/python/paddle/base/param_attr.py
+++ b/python/paddle/base/param_attr.py
@@ -187,7 +187,7 @@ def _to_attr(arg):
         elif isinstance(arg, bool):
             return ParamAttr._to_attr(None) if arg else False
         else:
-            raise TypeError("{0} cast to ParamAttr".format(type(arg)))
+            raise TypeError(f"{type(arg)} cast to ParamAttr")
 
     def _to_kwargs(self, with_initializer=False):
         """
diff --git a/python/paddle/base/reader.py b/python/paddle/base/reader.py
index c3a65721db275..4ec5d3c4a9607 100644
--- a/python/paddle/base/reader.py
+++ b/python/paddle/base/reader.py
@@ -1623,9 +1623,7 @@ def __init__(self, dataset, places, drop_last):
 
         if dataset.thread_num != 0 and dataset.thread_num != thread_num:
             logging.warn(
-                'thread_num {} which is set in Dataset is ignored'.format(
-                    dataset.thread_num
-                )
+                f'thread_num {dataset.thread_num} which is set in Dataset is ignored'
             )
 
         dataset._set_thread(thread_num)
@@ -1637,9 +1635,7 @@ def __init__(self, dataset, places, drop_last):
             and dataset.queue_num > thread_num
         ):
             logging.warn(
-                "queue_num {} which is set in Dataset is ignored".format(
-                    dataset.queue_num
-                )
+                f"queue_num {dataset.queue_num} which is set in Dataset is ignored"
             )
             dataset._set_queue_num(thread_num)
 
diff --git a/python/paddle/base/trainer_factory.py b/python/paddle/base/trainer_factory.py
index 75351872d73d6..e5c5fa48b7155 100644
--- a/python/paddle/base/trainer_factory.py
+++ b/python/paddle/base/trainer_factory.py
@@ -181,9 +181,7 @@ def handler_launch_func(self, scope, handler):
             if isinstance(fetch_instance.var_dict[key], Variable):
                 var_name_to_key[fetch_instance.var_dict[key].name] = key
             else:
-                local_logger.warning(
-                    "the value of {} is not a Variable".format(key)
-                )
+                local_logger.warning(f"the value of {key} is not a Variable")
                 var_name_to_key["None.var"] = key
         elapsed_secs = 0
         while True:
@@ -202,9 +200,7 @@ def handler_launch_func(self, scope, handler):
                     fetch_dict[key] = var
                     if var is None:
                         local_logger.warning(
-                            "{} value currently not available".format(
-                                var_name_to_key[key]
-                            )
+                            f"{var_name_to_key[key]} value currently not available"
                         )
                 res_dict = {}
                 for key in fetch_dict:
diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py
index dcc87b74ea658..6d034b80c8d9c 100644
--- a/python/paddle/base/variable_index.py
+++ b/python/paddle/base/variable_index.py
@@ -97,9 +97,7 @@ def update(self, index):
                 self.pre_shape = self.indexes[-1].shape
         else:
             raise ValueError(
-                "Index should be list/tuple of int or Tensor, but received {}.".format(
-                    index
-                )
+                f"Index should be list/tuple of int or Tensor, but received {index}."
             )
 
     def shape_stride(self, shape):
@@ -116,9 +114,7 @@ def get_offset_stride(self, tensor_shape):
         for index in self.indexes:
             if not isinstance(index, paddle.base.Variable):
                 raise ValueError(
-                    "only support list/tensor index, but received {}.".format(
-                        type(index)
-                    )
+                    f"only support list/tensor index, but received {type(index)}."
                 )
 
         if len(self.indexes) <= len(tensor_shape) or len(self.indexes) == 1:
@@ -182,9 +178,7 @@ def set_item(self, tensor_origin, value):
                 or value_dims_bd[i] == 1
             ):
                 raise ValueError(
-                    "{} can not broadcast into {}".format(
-                        value.shape, gather_tensor_shape
-                    )
+                    f"{value.shape} can not broadcast into {gather_tensor_shape}"
                 )
 
         value_broadcast = paddle.broadcast_to(value, gather_tensor_shape)
@@ -324,7 +318,7 @@ def get_value_for_bool_tensor(var, item):
         raise IndexError(
             "The dims of bool index doesn't match indexed array, "
             "the dims of bool index except to be equal or less "
-            "than {}, but received {}.".format(len(var.shape), len(item.shape))
+            f"than {len(var.shape)}, but received {len(item.shape)}."
         )
     i = 0
     item_shape = item.shape
@@ -433,7 +427,7 @@ def _setitem_impl_(var, item, value):
             if not isinstance(step, Variable) and step == 0:
                 raise ValueError(
                     "When assign a value to a paddle.Tensor, step can not be 0, "
-                    "but received step is {}.".format(step)
+                    f"but received step is {step}."
                 )
 
             if isinstance(step, Variable) and (start is None or end is None):
@@ -454,9 +448,7 @@ def _setitem_impl_(var, item, value):
 
             for i in slice_item:
                 if not isinstance(i, bool):
-                    raise TypeError(
-                        "Doesn't support {} in index list.".format(type(i))
-                    )
+                    raise TypeError(f"Doesn't support {type(i)} in index list.")
 
             if len(item) != 1:
                 raise IndexError(
@@ -543,9 +535,7 @@ def _setitem_impl_(var, item, value):
     else:
         raise TypeError(
             "Only support to assign an integer, float, numpy.ndarray or "
-            "paddle.Tensor to a paddle.Tensor, but received {}".format(
-                type(value)
-            )
+            f"paddle.Tensor to a paddle.Tensor, but received {type(value)}"
         )
 
     if paddle.in_dynamic_mode():
@@ -587,7 +577,7 @@ def set_value_for_bool_tensor(var, item, value):
         raise IndexError(
             "The dims of bool index doesn't match indexed array, "
             "the dims of bool index except to be equal or less "
-            "than {}, but received {}.".format(len(var.shape), len(item.shape))
+            f"than {len(var.shape)}, but received {len(item.shape)}."
         )
     for i, dim_len in enumerate(item.shape):
         if dim_len != -1 and var.shape[i] != -1 and dim_len != var.shape[i]:
@@ -895,9 +885,7 @@ def _setitem_static(x, indices, values):
         else:
             raise TypeError(
                 "Only support to assign an integer, float, numpy.ndarray or "
-                "paddle.Tensor to a paddle.Tensor, but received {}".format(
-                    type(values)
-                )
+                f"paddle.Tensor to a paddle.Tensor, but received {type(values)}"
             )
 
         # step3.1: Only basic indexing, use OP set_value to set value.
@@ -908,7 +896,7 @@ def _setitem_static(x, indices, values):
                 StartsTensorList,
                 EndsTensorList,
                 StepsTensorList,
-                *itertools.chain.from_iterable(attrs.items())
+                *itertools.chain.from_iterable(attrs.items()),
             )
         else:
             helper = paddle.base.layer_helper.LayerHelper(

From e6ebffe88c45289eb1e5062cf5e4ff37edd952d5 Mon Sep 17 00:00:00 2001
From: Android zhang <53324261+zade23@users.noreply.github.com>
Date: Fri, 22 Sep 2023 10:16:49 +0800
Subject: [PATCH 051/115] [CodeStyle][task 23] enable Ruff PLR0402 rule in
 python/paddle/base (#57607)

---
 pyproject.toml                                     | 1 -
 python/paddle/base/dygraph/tensor_patch_methods.py | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 372895a3f02a1..b62b503c06e96 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -110,7 +110,6 @@ ignore = [
     "UP030",
     "C405",
     "C417",
-    "PLR0402",
     "B004",
     "B009",
     "B016",
diff --git a/python/paddle/base/dygraph/tensor_patch_methods.py b/python/paddle/base/dygraph/tensor_patch_methods.py
index 4f1b138abaae4..506b8edfe88b1 100644
--- a/python/paddle/base/dygraph/tensor_patch_methods.py
+++ b/python/paddle/base/dygraph/tensor_patch_methods.py
@@ -19,14 +19,13 @@
 import numpy as np
 
 import paddle
-import paddle.profiler as profiler
-import paddle.utils.deprecated as deprecated
-from paddle import _C_ops
+from paddle import _C_ops, profiler
 from paddle.base.data_feeder import (
     _PADDLE_DTYPE_2_NUMPY_DTYPE,
     convert_uint16_to_float,
 )
 from paddle.profiler.utils import in_profiler_mode
+from paddle.utils import deprecated
 
 from .. import core, framework, unique_name
 from ..framework import (

From e5ee1dae6219a12fdcfd2ca4b467f5de9928a188 Mon Sep 17 00:00:00 2001
From: LoneRanger <836253168@qq.com>
Date: Fri, 22 Sep 2023 10:18:14 +0800
Subject: [PATCH 052/115] [xdoctest] reformat example code with google style in
 211,281,308,323 (#57301)

* fix sample codes

* fix code-style

* fix bug

* fix bug
---
 paddle/fluid/pybind/imperative.cc             | 103 ++-
 .../distributed/communication/stream/send.py  |   2 +-
 python/paddle/distributed/io.py               | 202 ++---
 python/paddle/tensor/random.py                | 751 +++++++++++-------
 4 files changed, 600 insertions(+), 458 deletions(-)

diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 295dffd53af42..80e6de0791961 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -1027,15 +1027,15 @@ void BindImperative(py::module *m_ptr) {
                                  shape with the input numpy array.
 
   Examples:
-      .. code-block:: python
+        .. code-block:: python
 
-        # required: gpu
-        import numpy as np
-        import paddle
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import numpy as np
+            >>> import paddle
+            >>> paddle.device.set_device('gpu')
 
-        data = np.random.randint(10, size=(3, 4))
-        tensor = paddle.base.core.to_uva_tensor(data)
-        print(tensor)
+            >>> data = np.random.randint(10, size=(3, 4))
+            >>> tensor = paddle.base.core.to_uva_tensor(data)
 )DOC");
 
 #endif
@@ -1161,29 +1161,29 @@ void BindImperative(py::module *m_ptr) {
                     should be one-dimensinal.
 
   Examples:
-      .. code-block:: python
-
-          import numpy as np
-          import paddle
-          from paddle.base import core
-          from paddle.device import cuda
-
-          if core.is_compiled_with_cuda():
-              src = paddle.rand(shape=[100, 50, 50])
-              dst = paddle.emtpy(shape=[200, 50, 50]).pin_memory()
-              offset = paddle.to_tensor(
-                  np.array([0, 60], dtype="int64"), place=paddle.CPUPlace())
-              count = paddle.to_tensor(
-                  np.array([40, 60], dtype="int64"), place=paddle.CPUPlace())
-
-              stream = cuda.Stream()
-              with cuda.stream_guard(stream):
-                  core.async_write(src, dst, offset, count)
-
-              offset_a = paddle.gather(dst, paddle.to_tensor(np.arange(0, 40)))
-              offset_b = paddle.gather(dst, paddle.to_tensor(np.arange(60, 120)))
-              offset_array = paddle.concat([offset_a, offset_b], axis=0)
-              print(np.allclose(src.numpy(), offset_array.numpy())) # True
+        .. code-block:: python
+
+            >>> import numpy as np
+            >>> import paddle
+            >>> from paddle.base import core
+            >>> from paddle.device import cuda
+            >>> if core.is_compiled_with_cuda():
+            ...     src = paddle.rand(shape=[100, 50, 50])
+            ...     dst = paddle.empty(shape=[200, 50, 50]).pin_memory()
+            ...     offset = paddle.to_tensor(
+            ...         np.array([0, 60], dtype="int64"), place=paddle.CPUPlace())
+            ...     count = paddle.to_tensor(
+            ...         np.array([40, 60], dtype="int64"), place=paddle.CPUPlace())
+            ...
+            ...     stream = cuda.Stream()
+            ...     with cuda.stream_guard(stream):
+            ...         core.eager.async_write(src, dst, offset, count)
+            ...
+            ...     offset_a = paddle.gather(dst, paddle.to_tensor(np.arange(0, 40)))
+            ...     offset_b = paddle.gather(dst, paddle.to_tensor(np.arange(60, 120)))
+            ...     offset_array = paddle.concat([offset_a, offset_b], axis=0)
+            ...     print(np.allclose(src.numpy(), offset_array.numpy()))
+            True
 )DOC");
 
   m.def(
@@ -1393,28 +1393,27 @@ void BindImperative(py::module *m_ptr) {
                     should be one-dimensinal.
 
   Examples:
-      .. code-block:: python
-
-          import numpy as np
-          import paddle
-          from paddle.base import core
-          from paddle.device import cuda
-
-          if core.is_compiled_with_cuda():
-              src = paddle.rand(shape=[100, 50, 50], dtype="float32").pin_memory()
-              dst = paddle.empty(shape=[100, 50, 50], dtype="float32")
-              offset = paddle.to_tensor(
-                  np.array([0, 60], dtype="int64"), place=paddle.CPUPlace())
-              count = paddle.to_tensor(
-                  np.array([40, 60], dtype="int64"), place=paddle.CPUPlace())
-              buffer = paddle.empty(shape=[50, 50, 50], dtype="float32").pin_memory()
-              index = paddle.to_tensor(
-                  np.array([1, 3, 5, 7, 9], dtype="int64")).cpu()
-
-              stream = cuda.Stream()
-              with cuda.stream_guard(stream):
-                  core.async_read(src, dst, index, buffer, offset, count)
-
+        .. code-block:: python
+
+            >>> import numpy as np
+            >>> import paddle
+            >>> from paddle.base import core
+            >>> from paddle.device import cuda
+            ...
+            >>> if core.is_compiled_with_cuda():
+            ...     src = paddle.rand(shape=[100, 50, 50], dtype="float32").pin_memory()
+            ...     dst = paddle.empty(shape=[100, 50, 50], dtype="float32")
+            ...     offset = paddle.to_tensor(
+            ...         np.array([0, 60], dtype="int64"), place=paddle.CPUPlace())
+            ...     count = paddle.to_tensor(
+            ...         np.array([40, 60], dtype="int64"), place=paddle.CPUPlace())
+            ...     buffer = paddle.empty(shape=[50, 50, 50], dtype="float32").pin_memory()
+            ...     index = paddle.to_tensor(
+            ...         np.array([1, 3, 5, 7, 9], dtype="int64")).cpu()
+            ...
+            ...     stream = cuda.Stream()
+            ...     with cuda.stream_guard(stream):
+            ...         core.eager.async_read(src, dst, index, buffer, offset, count)
 )DOC");
 #endif
 }
diff --git a/python/paddle/distributed/communication/stream/send.py b/python/paddle/distributed/communication/stream/send.py
index c04ba6b5736b7..2013c619f278f 100644
--- a/python/paddle/distributed/communication/stream/send.py
+++ b/python/paddle/distributed/communication/stream/send.py
@@ -94,7 +94,7 @@ def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False):
             >>> task.wait()
             >>> out = data.numpy()
             >>> print(out)
-            >>> # [[4, 5, 6], [4, 5, 6]] (2 GPUs)
+            [[4, 5, 6], [4, 5, 6]]
     """
     if _warn_cur_rank_not_in_group(group):
         return
diff --git a/python/paddle/distributed/io.py b/python/paddle/distributed/io.py
index 69f6e42bf1954..8b104bd770a96 100644
--- a/python/paddle/distributed/io.py
+++ b/python/paddle/distributed/io.py
@@ -37,16 +37,17 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.base as base
-
-            paddle.enable_static()
-            exe = base.Executor(base.CPUPlace())
-            param_path = "./my_paddle_model"
-            t = paddle.distributed.transpiler.DistributeTranspiler()
-            t.transpile(...)
-            pserver_prog = t.get_pserver_program(...)
-            _load_distributed_persistables(executor=exe, dirname=param_path, main_program=pserver_prog)
+            >>> # doctest: +REQUIRES(env: DISTRIBUTED)
+            >>> import paddle
+            >>> import paddle.base as base
+
+            >>> paddle.enable_static()
+            >>> exe = base.Executor(base.CPUPlace())
+            >>> param_path = "./my_paddle_model"
+            >>> t = paddle.distributed.transpiler.DistributeTranspiler()
+            >>> t.transpile(...)
+            >>> pserver_prog = t.get_pserver_program(...)
+            >>> _load_distributed_persistables(executor=exe, dirname=param_path, main_program=pserver_prog)
     """
 
     def __is_distributed_part_var(varname):
@@ -160,15 +161,15 @@ def load_persistables(executor, dirname, main_program=None, filename=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.base as base
+            >>> import paddle
+            >>> import paddle.base as base
 
-            paddle.enable_static()
-            exe = base.Executor(base.CPUPlace())
-            param_path = "./my_paddle_model"
-            prog = base.default_main_program()
-            paddle.distributed.io.load_persistables(executor=exe, dirname=param_path,
-                                       main_program=None)
+            >>> paddle.enable_static()
+            >>> exe = base.Executor(base.CPUPlace())
+            >>> param_path = "./my_paddle_model"
+            >>> prog = base.default_main_program()
+            >>> paddle.distributed.io.load_persistables(executor=exe, dirname=param_path,
+            ...                             main_program=None)
     """
 
     if main_program and main_program._is_distributed:
@@ -207,16 +208,17 @@ def _save_distributed_persistables(executor, dirname, main_program):
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle
-
-            paddle.enable_static()
-            exe = paddle.static.Executor(paddle.CPUPlace())
-            param_path = "./my_paddle_model"
-            t = paddle.distributed.transpiler.DistributeTranspiler()
-            t.transpile(...)
-            train_program = t.get_trainer_program()
-            _save_distributed_persistables(executor=exe, dirname=param_path, main_program=train_program)
+            >>> # doctest: +REQUIRES(env: DISTRIBUTED)
+            >>> import paddle
+            >>> import paddle
+
+            >>> paddle.enable_static()
+            >>> exe = paddle.static.Executor(paddle.CPUPlace())
+            >>> param_path = "./my_paddle_model"
+            >>> t = paddle.distributed.transpiler.DistributeTranspiler()
+            >>> t.transpile(...)
+            >>> train_program = t.get_trainer_program()
+            >>> _save_distributed_persistables(executor=exe, dirname=param_path, main_program=train_program)
     """
 
     def __save_remote_params(executor, dirname, remote_params_map):
@@ -366,12 +368,16 @@ def is_persistable(var):
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.base as base
 
-            paddle.enable_static()
-            param = base.default_main_program().global_block().var('fc.b')
-            res = base.io.is_persistable(param)
+            >>> import paddle
+            >>> paddle.enable_static()
+            >>> image = paddle.static.data(
+            ...     name='image', shape=[None, 28], dtype='float32')
+            >>> bias_attr = paddle.ParamAttr('fc.b')
+            >>> fc = paddle.static.nn.fc(image, size=10, bias_attr=bias_attr)
+            >>> param = paddle.static.default_main_program().global_block().var('fc.b')
+            >>> res = paddle.distributed.io.is_persistable(param)
+
     """
     if (
         var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH
@@ -420,24 +426,24 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            paddle.enable_static()
-            dir_path = "./my_paddle_model"
-            file_name = "persistables"
-            image = paddle.static..data(name='img', shape=[None, 28, 28], dtype='float32')
-            label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
-            feeder = paddle.static.DataFeeder(feed_list=[image, label], place=paddle.CPUPlace())
-
-            predict = paddle.static.nn.fc(x=image, size=10, activation='softmax')
-            loss = paddle.nn.functional.cross_entropy(input=predict, label=label)
-            avg_loss = paddle.mean(loss)
-            exe = paddle.static.Executor(paddle.CPUPlace())
-            exe.run(paddle.static.default_startup_program())
-            paddle.distributed.io.save_persistables(executor=exe, dirname=dir_path, filename=file_name)
-            # The persistables variables weights and bias in the fc layer of the network
-            # are going to be saved in the same file named "persistables" in the path
-            # "./my_paddle_model"
+            >>> import paddle
+
+            >>> paddle.enable_static()
+            >>> dir_path = "./my_paddle_model"
+            >>> file_name = "persistables"
+            >>> image = paddle.static.data(name='img', shape=[None, 28, 28], dtype='float32')
+            >>> label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
+            >>> feeder = paddle.base.DataFeeder(feed_list=[image, label], place=paddle.CPUPlace())
+
+            >>> predict = paddle.static.nn.fc(x=image, size=10, activation='softmax')
+            >>> loss = paddle.nn.functional.cross_entropy(input=predict, label=label)
+            >>> avg_loss = paddle.mean(loss)
+            >>> exe = paddle.static.Executor(paddle.CPUPlace())
+            >>> exe.run(paddle.static.default_startup_program())
+            >>> paddle.distributed.io.save_persistables(executor=exe, dirname=dir_path, filename=file_name)
+            >>> # The persistables variables weights and bias in the fc layer of the network
+            >>> # are going to be saved in the same file named "persistables" in the path
+            >>> # "./my_paddle_model"
     """
     if main_program and main_program._is_distributed:
         return _save_distributed_persistables(
@@ -504,53 +510,53 @@ def load_inference_model_distributed(
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.base as base
-            import numpy as np
-
-            paddle.enable_static()
-            # Build the model
-            main_prog = base.Program()
-            startup_prog = base.Program()
-            with base.program_guard(main_prog, startup_prog):
-                data = base.layers.data(name="img", shape=[64, 784], append_batch_size=False)
-                w = paddle.create_parameter(shape=[784, 200], dtype='float32')
-                b = paddle.create_parameter(shape=[200], dtype='float32')
-                hidden_w = paddle.matmul(x=data, y=w)
-                hidden_b = base.layers.elementwise_add(hidden_w, b)
-            place = base.CPUPlace()
-            exe = base.Executor(place)
-            exe.run(startup_prog)
-
-            # Save the inference model
-            path = "./infer_model"
-            base.io.save_inference_model(dirname=path, feeded_var_names=['img'],
-                         target_vars=[hidden_b], executor=exe, main_program=main_prog)
-
-            # Demo one. Not need to set the distributed look up table, because the
-            # training doesn't use a distributed look up table.
-            [inference_program, feed_target_names, fetch_targets] = (
-                paddle.distributed.io.load_inference_model_distributed(dirname=path, executor=exe))
-            tensor_img = np.array(np.random.random((1, 64, 784)), dtype=np.float32)
-            results = exe.run(inference_program,
-                          feed={feed_target_names[0]: tensor_img},
-                          fetch_list=fetch_targets)
-
-            # Demo two. If the training uses a distributed look up table, the pserver
-            # endpoints list should be supported when loading the inference model.
-            # The below is just an example.
-            endpoints = ["127.0.0.1:2023","127.0.0.1:2024"]
-            [dist_inference_program, dist_feed_target_names, dist_fetch_targets] = (
-                paddle.distributed.io.load_inference_model_distributed(dirname=path,
-                                              executor=exe,
-                                              pserver_endpoints=endpoints))
-
-            # In this example, the inference program was saved in the file
-            # "./infer_model/__model__" and parameters were saved in
-            # separate files under the directory "./infer_model".
-            # By the inference program, feed_target_names and
-            # fetch_targets, we can use an executor to run the inference
-            # program for getting the inference result.
+            >>> import paddle
+            >>> import paddle.base as base
+            >>> import numpy as np
+
+            >>> paddle.enable_static()
+            >>> # Build the model
+            >>> main_prog = paddle.static.Program()
+            >>> startup_prog = paddle.static.Program()
+            >>> with paddle.static.program_guard(main_prog, startup_prog):
+            ...     data = paddle.static.data(name="img", shape=[64, 784], append_batch_size=False)
+            ...     w = paddle.create_parameter(shape=[784, 200], dtype='float32')
+            ...     b = paddle.create_parameter(shape=[200], dtype='float32')
+            ...     hidden_w = paddle.matmul(x=data, y=w)
+            ...     hidden_b = base.layers.elementwise_add(hidden_w, b)
+            >>> place = base.CPUPlace()
+            >>> exe = base.Executor(place)
+            >>> exe.run(startup_prog)
+
+            >>> # Save the inference model
+            >>> path = "./infer_model"
+            >>> base.io.save_inference_model(dirname=path, feeded_var_names=['img'],
+            ...                 target_vars=[hidden_b], executor=exe, main_program=main_prog)
+            ...
+            >>> # Demo one. Not need to set the distributed look up table, because the
+            >>> # training doesn't use a distributed look up table.
+            >>> [inference_program, feed_target_names, fetch_targets] = (
+            ...     paddle.distributed.io.load_inference_model_distributed(dirname=path, executor=exe))
+            >>> tensor_img = np.array(np.random.random((1, 64, 784)), dtype=np.float32)
+            >>> results = exe.run(inference_program,
+            ...                 feed={feed_target_names[0]: tensor_img},
+            ...                 fetch_list=fetch_targets)
+            ...
+            >>> # Demo two. If the training uses a distributed look up table, the pserver
+            >>> # endpoints list should be supported when loading the inference model.
+            >>> # The below is just an example.
+            >>> endpoints = ["127.0.0.1:2023","127.0.0.1:2024"]
+            >>> [dist_inference_program, dist_feed_target_names, dist_fetch_targets] = (
+            ...     paddle.distributed.io.load_inference_model_distributed(dirname=path,
+            ...                                     executor=exe,
+            ...                                     pserver_endpoints=endpoints))
+            ...
+            >>> # In this example, the inference program was saved in the file
+            >>> # "./infer_model/__model__" and parameters were saved in
+            >>> # separate files under the directory "./infer_model".
+            >>> # By the inference program, feed_target_names and
+            >>> # fetch_targets, we can use an executor to run the inference
+            >>> # program for getting the inference result.
     """
     load_from_memory = False
     if dirname is not None:
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 4e16d8b022887..f32978ca50706 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -57,20 +57,26 @@ def bernoulli(x, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            paddle.set_device('cpu')  # on CPU device
-            paddle.seed(100)
-
-            x = paddle.rand([2,3])
-            print(x)
-            # [[0.55355281, 0.20714243, 0.01162981],
-            #  [0.51577556, 0.36369765, 0.26091650]]
-
-            out = paddle.bernoulli(x)
-            print(out)
-            # [[1., 0., 1.],
-            #  [0., 1., 0.]]
+            >>> import paddle
+
+            >>> paddle.set_device('cpu')  # on CPU device
+            >>> paddle.seed(100)
+
+            >>> x = paddle.rand([2,3])
+            >>> print(x)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[0.55355281, 0.20714243, 0.01162981],
+             [0.51577556, 0.36369765, 0.26091650]])
+            >>> # doctest: -SKIP
+
+            >>> out = paddle.bernoulli(x)
+            >>> print(out)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[1., 0., 1.],
+             [0., 1., 0.]])
+            >>> # doctest: -SKIP
 
     """
 
@@ -112,15 +118,18 @@ def poisson(x, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-            paddle.set_device('cpu')
-            paddle.seed(100)
-
-            x = paddle.uniform([2,3], min=1.0, max=5.0)
-            out = paddle.poisson(x)
-            #[[2., 5., 0.],
-            # [5., 1., 3.]]
-
+            >>> import paddle
+            >>> paddle.set_device('cpu')
+            >>> paddle.seed(100)
+
+            >>> x = paddle.uniform([2,3], min=1.0, max=5.0)
+            >>> out = paddle.poisson(x)
+            >>> print(out)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[2., 5., 0.],
+             [5., 1., 3.]])
+            >>> # doctest: -SKIP
     """
     if in_dynamic_mode():
         return _C_ops.poisson(x)
@@ -157,29 +166,38 @@ def multinomial(x, num_samples=1, replacement=False, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            paddle.seed(100) # on CPU device
-            x = paddle.rand([2,4])
-            print(x)
-            # [[0.5535528  0.20714243 0.01162981 0.51577556]
-            # [0.36369765 0.2609165  0.18905126 0.5621971 ]]
-
-            paddle.seed(200) # on CPU device
-            out1 = paddle.multinomial(x, num_samples=5, replacement=True)
-            print(out1)
-            # [[3 3 0 0 0]
-            # [3 3 3 1 0]]
-
-            # out2 = paddle.multinomial(x, num_samples=5)
-            # InvalidArgumentError: When replacement is False, number of samples
-            #  should be less than non-zero categories
-
-            paddle.seed(300) # on CPU device
-            out3 = paddle.multinomial(x, num_samples=3)
-            print(out3)
-            # [[3 0 1]
-            # [3 1 0]]
+            >>> import paddle
+            >>> paddle.seed(100) # on CPU device
+
+            >>> x = paddle.rand([2,4])
+            >>> print(x)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[0.55355281, 0.20714243, 0.01162981, 0.51577556],
+             [0.36369765, 0.26091650, 0.18905126, 0.56219709]])
+            >>> # doctest: -SKIP
+
+            >>> paddle.seed(200) # on CPU device
+            >>> out1 = paddle.multinomial(x, num_samples=5, replacement=True)
+            >>> print(out1)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 5], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [[3, 3, 0, 0, 0],
+             [3, 3, 3, 1, 0]])
+            >>> # doctest: -SKIP
+
+            >>> # out2 = paddle.multinomial(x, num_samples=5)
+            >>> # InvalidArgumentError: When replacement is False, number of samples
+            >>> #  should be less than non-zero categories
+
+            >>> paddle.seed(300) # on CPU device
+            >>> out3 = paddle.multinomial(x, num_samples=3)
+            >>> print(out3)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [[3, 0, 1],
+             [3, 1, 0]])
+            >>> # doctest: -SKIP
 
     """
 
@@ -254,15 +272,21 @@ def uniform_random_batch_size_like(
         Variable: A Tensor of the specified shape filled with uniform_random values. The shape of the Tensor is determined by the shape parameter and the specified dimension of the input Tensor.
     Examples:
         .. code-block:: python
-            import paddle
-            import paddle.base as base
-            from paddle.tensor import random
-            paddle.enable_static()
-            # example 1:
-            input = paddle.static.data(name="input", shape=[1, 3], dtype='float32')
-            out_1 = random.uniform_random_batch_size_like(input, [2, 4]) # out_1.shape=[1, 4]
-            # example 2:
-            out_2 = random.uniform_random_batch_size_like(input, [2, 4], input_dim_idx=1, output_dim_idx=1) # out_2.shape=[2, 3]
+
+            >>> import paddle
+            >>> import paddle.base as base
+            >>> from paddle.tensor import random
+            >>> paddle.enable_static()
+            >>> # example 1:
+            >>> input = paddle.static.data(name="input", shape=[1, 3], dtype='float32')
+            >>> out_1 = random.uniform_random_batch_size_like(input, [2, 4])
+            >>> print(out_1.shape)
+            [1, 4]
+
+            >>> # example 2:
+            >>> out_2 = random.uniform_random_batch_size_like(input, [2, 4], input_dim_idx=1, output_dim_idx=1)
+            >>> print(out_2.shape)
+            [2, 3]
     """
     check_variable_and_dtype(
         input,
@@ -395,29 +419,42 @@ def standard_normal(shape, dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            # example 1: attr shape is a list which doesn't contain Tensor.
-            out1 = paddle.standard_normal(shape=[2, 3])
-            # [[-2.923464  ,  0.11934398, -0.51249987],  # random
-            #  [ 0.39632758,  0.08177969,  0.2692008 ]]  # random
-
-            # example 2: attr shape is a list which contains Tensor.
-            dim1 = paddle.to_tensor(2, 'int64')
-            dim2 = paddle.to_tensor(3, 'int32')
-            out2 = paddle.standard_normal(shape=[dim1, dim2, 2])
-            # [[[-2.8852394 , -0.25898588],  # random
-            #   [-0.47420555,  0.17683524],  # random
-            #   [-0.7989969 ,  0.00754541]],  # random
-            #  [[ 0.85201347,  0.32320443],  # random
-            #   [ 1.1399018 ,  0.48336947],  # random
-            #   [ 0.8086993 ,  0.6868893 ]]]  # random
-
-            # example 3: attr shape is a Tensor, the data type must be int64 or int32.
-            shape_tensor = paddle.to_tensor([2, 3])
-            out3 = paddle.standard_normal(shape_tensor)
-            # [[-2.878077 ,  0.17099959,  0.05111201]  # random
-            #  [-0.3761474, -1.044801  ,  1.1870178 ]]  # random
+            >>> import paddle
+
+            >>> # doctest: +SKIP("Random output")
+            >>> # example 1: attr shape is a list which doesn't contain Tensor.
+            >>> out1 = paddle.standard_normal(shape=[2, 3])
+            >>> print(out1)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.33719197, -0.25688133, -0.42868865],
+             [-0.27804616, -0.25058213, -0.28209466]])
+            >>> # doctest: -SKIP
+
+            >>> # example 2: attr shape is a list which contains Tensor.
+            >>> dim1 = paddle.to_tensor(2, 'int64')
+            >>> dim2 = paddle.to_tensor(3, 'int32')
+            >>> out2 = paddle.standard_normal(shape=[dim1, dim2, 2])
+            >>> print(out2)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[ 0.81888396, -0.64831746],
+              [ 1.28911388, -1.88154876],
+              [-0.03271919, -0.32410008]],
+             [[-0.20224631,  0.46683890],
+              [ 1.91947734,  0.71657443],
+              [ 0.33410960, -0.64256823]]])
+            >>> # doctest: -SKIP
+
+            >>> # example 3: attr shape is a Tensor, the data type must be int64 or int32.
+            >>> shape_tensor = paddle.to_tensor([2, 3])
+            >>> out3 = paddle.standard_normal(shape_tensor)
+            >>> print(out3)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[ 0.01182475, -0.44895259, -1.79227340],
+             [ 1.52022707, -0.83830303,  0.05261501]])
+            >>> # doctest: -SKIP
 
     """
     return gaussian(shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name)
@@ -448,29 +485,41 @@ def randn(shape, dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            # example 1: attr shape is a list which doesn't contain Tensor.
-            out1 = paddle.randn(shape=[2, 3])
-            # [[-2.923464  ,  0.11934398, -0.51249987],  # random
-            #  [ 0.39632758,  0.08177969,  0.2692008 ]]  # random
-
-            # example 2: attr shape is a list which contains Tensor.
-            dim1 = paddle.to_tensor(2, 'int64')
-            dim2 = paddle.to_tensor(3, 'int32')
-            out2 = paddle.randn(shape=[dim1, dim2, 2])
-            # [[[-2.8852394 , -0.25898588],  # random
-            #   [-0.47420555,  0.17683524],  # random
-            #   [-0.7989969 ,  0.00754541]],  # random
-            #  [[ 0.85201347,  0.32320443],  # random
-            #   [ 1.1399018 ,  0.48336947],  # random
-            #   [ 0.8086993 ,  0.6868893 ]]]  # random
-
-            # example 3: attr shape is a Tensor, the data type must be int64 or int32.
-            shape_tensor = paddle.to_tensor([2, 3])
-            out3 = paddle.randn(shape_tensor)
-            # [[-2.878077 ,  0.17099959,  0.05111201]  # random
-            #  [-0.3761474, -1.044801  ,  1.1870178 ]]  # random
+            >>> import paddle
+
+            >>> # example 1: attr shape is a list which doesn't contain Tensor.
+            >>> out1 = paddle.randn(shape=[2, 3])
+            >>> print(out1)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.29270014, -0.02925120, -1.07807338],
+             [ 1.19966674, -0.46673676, -0.18050613]])
+            >>> # doctest: -SKIP
+
+            >>> # example 2: attr shape is a list which contains Tensor.
+            >>> dim1 = paddle.to_tensor(2, 'int64')
+            >>> dim2 = paddle.to_tensor(3, 'int32')
+            >>> out2 = paddle.randn(shape=[dim1, dim2, 2])
+            >>> print(out2)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[-0.26019713,  0.54994684],
+              [ 0.46403214, -1.41178775],
+              [-0.15682915, -0.26639181]],
+             [[ 0.01364388, -2.81676364],
+              [ 0.86996621,  0.07524570],
+              [ 0.21443737,  0.90938759]]])
+            >>> # doctest: -SKIP
+
+            >>> # example 3: attr shape is a Tensor, the data type must be int64 or int32.
+            >>> shape_tensor = paddle.to_tensor([2, 3])
+            >>> out3 = paddle.randn(shape_tensor)
+            >>> print(out3)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[ 0.57575506, -1.60349274, -0.27124876],
+             [ 1.08381045,  0.81270242, -0.26763600]])
+            >>> # doctest: -SKIP
     """
     return standard_normal(shape, dtype, name)
 
@@ -509,20 +558,31 @@ def normal(mean=0.0, std=1.0, shape=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            out1 = paddle.normal(shape=[2, 3])
-            # [[ 0.17501129  0.32364586  1.561118  ]  # random
-            #  [-1.7232178   1.1545963  -0.76156676]]  # random
-
-            mean_tensor = paddle.to_tensor([1.0, 2.0, 3.0])
-            out2 = paddle.normal(mean=mean_tensor)
-            # [ 0.18644847 -1.19434458  3.93694787]  # random
-
-            std_tensor = paddle.to_tensor([1.0, 2.0, 3.0])
-            out3 = paddle.normal(mean=mean_tensor, std=std_tensor)
-            # [1.00780561 3.78457445 5.81058198]  # random
-
+            >>> import paddle
+
+            >>> out1 = paddle.normal(shape=[2, 3])
+            >>> print(out1)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.85107994, -0.85490644, -1.35941815],
+             [-0.55500370,  0.20964541,  2.24193954]])
+            >>> # doctest: -SKIP
+
+            >>> mean_tensor = paddle.to_tensor([1.0, 2.0, 3.0])
+            >>> out2 = paddle.normal(mean=mean_tensor)
+            >>> print(out2)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [1.05411839, 3.71514320, 3.42665267])
+            >>> # doctest: -SKIP
+
+            >>> std_tensor = paddle.to_tensor([1.0, 2.0, 3.0])
+            >>> out3 = paddle.normal(mean=mean_tensor, std=std_tensor)
+            >>> print(out3)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [0.48646951, 0.00815189, 3.74022293])
+            >>> # doctest: -SKIP
     """
     if not in_dynamic_mode():
         check_type(mean, 'mean', (int, float, Variable), 'normal')
@@ -606,31 +666,43 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None):
 
     Examples:
         .. code-block:: python
-          :name: code-example1
-
-            import paddle
-
-            # example 1:
-            # attr shape is a list which doesn't contain Tensor.
-            out1 = paddle.uniform(shape=[3, 4])
-            # [[ 0.84524226,  0.6921872,   0.56528175,  0.71690357], # random
-            #  [-0.34646994, -0.45116323, -0.09902662, -0.11397249], # random
-            #  [ 0.433519,    0.39483607, -0.8660099,   0.83664286]] # random
-
-            # example 2:
-            # attr shape is a list which contains Tensor.
-            dim1 = paddle.to_tensor(2, 'int64')
-            dim2 = paddle.to_tensor(3, 'int32')
-            out2 = paddle.uniform(shape=[dim1, dim2])
-            # [[-0.9951253,   0.30757582, 0.9899647 ], # random
-            #  [ 0.5864527,   0.6607096,  -0.8886161]] # random
-
-            # example 3:
-            # attr shape is a Tensor, the data type must be int64 or int32.
-            shape_tensor = paddle.to_tensor([2, 3])
-            out3 = paddle.uniform(shape_tensor)
-            # [[-0.8517412,  -0.4006908,   0.2551912 ], # random
-            #  [ 0.3364414,   0.36278176, -0.16085452]] # random
+            :name: code-example1
+
+            >>> import paddle
+
+            >>> # example 1:
+            >>> # attr shape is a list which doesn't contain Tensor.
+            >>> out1 = paddle.uniform(shape=[3, 4])
+            >>> print(out1)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[ 0.38170254, -0.47945309,  0.39794648, -0.94233936],
+             [-0.85296679, -0.76094693,  0.10565400,  0.59155810],
+             [ 0.11681318, -0.42144555, -0.81596589,  0.62113667]])
+            >>> # doctest: -SKIP
+
+            >>> # example 2:
+            >>> # attr shape is a list which contains Tensor.
+            >>> dim1 = paddle.to_tensor(2, 'int64')
+            >>> dim2 = paddle.to_tensor(3, 'int32')
+            >>> out2 = paddle.uniform(shape=[dim1, dim2])
+            >>> print(out2)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.00294012, -0.07210171, -0.44236207],
+             [ 0.70089281,  0.21500075, -0.22084606]])
+            >>> # doctest: -SKIP
+
+            >>> # example 3:
+            >>> # attr shape is a Tensor, the data type must be int64 or int32.
+            >>> shape_tensor = paddle.to_tensor([2, 3])
+            >>> out3 = paddle.uniform(shape_tensor)
+            >>> print(out3)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.60801756,  0.32448411,  0.90269291],
+             [-0.66421294, -0.95218551, -0.51022208]])
+            >>> # doctest: -SKIP
     """
     supported_dtypes = ['float32', 'float64', 'float16', 'uint16']
     if dtype is None:
@@ -705,14 +777,17 @@ def uniform_(x, min=-1.0, max=1.0, seed=0, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-            # example:
-            x = paddle.ones(shape=[3, 4])
-            x.uniform_()
-            print(x)
-            # [[ 0.84524226,  0.6921872,   0.56528175,  0.71690357], # random
-            #  [-0.34646994, -0.45116323, -0.09902662, -0.11397249], # random
-            #  [ 0.433519,    0.39483607, -0.8660099,   0.83664286]] # random
+            >>> import paddle
+
+            >>> # example:
+            >>> x = paddle.ones(shape=[3, 4])
+            >>> x.uniform_()
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[-0.50484276,  0.49580324,  0.33357990, -0.93924278],
+             [ 0.39779735,  0.87677515, -0.24377221,  0.06212139],
+             [-0.92499518, -0.96244860,  0.79210341, -0.78228098]])
+            >>> # doctest: -SKIP
     """
     return _C_ops.uniform_inplace_(x, min, max, seed, 0, 0, 1.0)
 
@@ -747,38 +822,59 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            # example 1:
-            # attr shape is a list which doesn't contain Tensor.
-            out1 = paddle.randint(low=-5, high=5, shape=[2, 3])
-            # [0, -3, 2]  # random
-
-            # example 2:
-            # attr shape is a list which contains Tensor.
-            dim1 = paddle.to_tensor(2, 'int64')
-            dim2 = paddle.to_tensor(3, 'int32')
-            out2 = paddle.randint(low=-5, high=5, shape=[dim1, dim2])
-            # [[0, -1, -3],  # random
-            #  [4, -2,  0]]  # random
-
-            # example 3:
-            # attr shape is a Tensor
-            shape_tensor = paddle.to_tensor([2, 3])
-            out3 = paddle.randint(low=-5, high=5, shape=shape_tensor)
-            # [[ 2, -3, -1],    # random
-            #  [-3, -2,  1]])   # random
-
-            # example 4:
-            # data type is int32
-            out4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32')
-            # [-5, 4, -4]  # random
-
-            # example 5:
-            # Input only one parameter
-            # low=0, high=10, shape=[1], dtype='int64'
-            out5 = paddle.randint(10)
-            # [7]  # random
+            >>> import paddle
+
+            >>> # example 1:
+            >>> # attr shape is a list which doesn't contain Tensor.
+            >>> out1 = paddle.randint(low=-5, high=5, shape=[2, 3])
+            >>> print(out1)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [[-1,  4,  4],
+             [-2, -5, -2]])
+            >>> # doctest: -SKIP
+
+            >>> # example 2:
+            >>> # attr shape is a list which contains Tensor.
+            >>> dim1 = paddle.to_tensor(2, 'int64')
+            >>> dim2 = paddle.to_tensor(3, 'int32')
+            >>> out2 = paddle.randint(low=-5, high=5, shape=[dim1, dim2])
+            >>> print(out2)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [[-4, -4,  2],
+             [-3, -1, -5]])
+            >>> # doctest: -SKIP
+
+            >>> # example 3:
+            >>> # attr shape is a Tensor
+            >>> shape_tensor = paddle.to_tensor([2, 3])
+            >>> out3 = paddle.randint(low=-5, high=5, shape=shape_tensor)
+            >>> print(out3)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [[-1,  4, -3],
+             [ 1,  2, -1]])
+            >>> # doctest: -SKIP
+
+            >>> # example 4:
+            >>> # data type is int32
+            >>> out4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32')
+            >>> print(out4)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[3], dtype=int32, place=Place(cpu), stop_gradient=True,
+            [4, 4, 0])
+            >>> # doctest: -SKIP
+
+            >>> # example 5:
+            >>> # Input only one parameter
+            >>> # low=0, high=10, shape=[1], dtype='int64'
+            >>> out5 = paddle.randint(10)
+            >>> print(out5)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [7])
+            >>> # doctest: -SKIP
 
     """
     if high is None:
@@ -854,97 +950,115 @@ def randint_like(x, low=0, high=None, dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            # example 1:
-            # dtype is None and the dtype of x is float16
-            x = paddle.zeros((1,2)).astype("float16")
-            out1 = paddle.randint_like(x, low=-5, high=5)
-            print(out1)
-            print(out1.dtype)
-            # [[0, -3]]  # random
-            # paddle.float16
-
-            # example 2:
-            # dtype is None and the dtype of x is float32
-            x = paddle.zeros((1,2)).astype("float32")
-            out2 = paddle.randint_like(x, low=-5, high=5)
-            print(out2)
-            print(out2.dtype)
-            # [[0, -3]]  # random
-            # paddle.float32
-
-            # example 3:
-            # dtype is None and the dtype of x is float64
-            x = paddle.zeros((1,2)).astype("float64")
-            out3 = paddle.randint_like(x, low=-5, high=5)
-            print(out3)
-            print(out3.dtype)
-            # [[0, -3]]  # random
-            # paddle.float64
-
-            # example 4:
-            # dtype is None and the dtype of x is int32
-            x = paddle.zeros((1,2)).astype("int32")
-            out4 = paddle.randint_like(x, low=-5, high=5)
-            print(out4)
-            print(out4.dtype)
-            # [[0, -3]]  # random
-            # paddle.int32
-
-            # example 5:
-            # dtype is None and the dtype of x is int64
-            x = paddle.zeros((1,2)).astype("int64")
-            out5 = paddle.randint_like(x, low=-5, high=5)
-            print(out5)
-            print(out5.dtype)
-            # [[0, -3]]  # random
-            # paddle.int64
-
-            # example 6:
-            # dtype is float64 and the dtype of x is float32
-            x = paddle.zeros((1,2)).astype("float32")
-            out6 = paddle.randint_like(x, low=-5, high=5, dtype="float64")
-            print(out6)
-            print(out6.dtype)
-            # [[0, -1]]  # random
-            # paddle.float64
-
-            # example 7:
-            # dtype is bool and the dtype of x is float32
-            x = paddle.zeros((1,2)).astype("float32")
-            out7 = paddle.randint_like(x, low=-5, high=5, dtype="bool")
-            print(out7)
-            print(out7.dtype)
-            # [[0, -1]]  # random
-            # paddle.bool
-
-            # example 8:
-            # dtype is int32 and the dtype of x is float32
-            x = paddle.zeros((1,2)).astype("float32")
-            out8 = paddle.randint_like(x, low=-5, high=5, dtype="int32")
-            print(out8)
-            print(out8.dtype)
-            # [[0, -1]]  # random
-            # paddle.int32
-
-            # example 9:
-            # dtype is int64 and the dtype of x is float32
-            x = paddle.zeros((1,2)).astype("float32")
-            out9 = paddle.randint_like(x, low=-5, high=5, dtype="int64")
-            print(out9)
-            print(out9.dtype)
-            # [[0, -1]]  # random
-            # paddle.int64
-
-            # example 10:
-            # dtype is int64 and the dtype of x is bool
-            x = paddle.zeros((1,2)).astype("bool")
-            out10 = paddle.randint_like(x, low=-5, high=5, dtype="int64")
-            print(out10)
-            print(out10.dtype)
-            # [[0, -1]]  # random
-            # paddle.int64
+            >>> import paddle
+
+            >>> # example 1:
+            >>> # dtype is None and the dtype of x is float32
+            >>> x = paddle.zeros((1,2)).astype("float32")
+            >>> out2 = paddle.randint_like(x, low=-5, high=5)
+            >>> print(out2)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[1, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[0., 0.]])
+            >>> # doctest: -SKIP
+            >>> print(out2.dtype)
+            paddle.float32
+
+            >>> # example 2:
+            >>> # dtype is None and the dtype of x is float64
+            >>> x = paddle.zeros((1,2)).astype("float64")
+            >>> out2 = paddle.randint_like(x, low=-5, high=5)
+            >>> print(out2)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[1, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[ 4., -5.]])
+            >>> # doctest: -SKIP
+            >>> print(out2.dtype)
+            paddle.float64
+
+            >>> # example 3:
+            >>> # dtype is None and the dtype of x is int32
+            >>> x = paddle.zeros((1,2)).astype("int32")
+            >>> out3 = paddle.randint_like(x, low=-5, high=5)
+            >>> print(out3)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[1, 2], dtype=int32, place=Place(cpu), stop_gradient=True,
+            [[ 0, -4]])
+            >>> # doctest: -SKIP
+            >>> print(out3.dtype)
+            paddle.int32
+
+            >>> # example 4:
+            >>> # dtype is None and the dtype of x is int64
+            >>> x = paddle.zeros((1,2)).astype("int64")
+            >>> out4 = paddle.randint_like(x, low=-5, high=5)
+            >>> print(out4)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[1, 2], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [[ 4, -3]])
+            >>> # doctest: -SKIP
+            >>> print(out4.dtype)
+            paddle.int64
+
+            >>> # example 5:
+            >>> # dtype is float64 and the dtype of x is float32
+            >>> x = paddle.zeros((1,2)).astype("float32")
+            >>> out5 = paddle.randint_like(x, low=-5, high=5, dtype="float64")
+            >>> print(out5)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[1, 2], dtype=float64, place=Place(cpu), stop_gradient=True,
+            [[3., 1.]])
+            >>> # doctest: -SKIP
+            >>> print(out5.dtype)
+            paddle.float64
+
+            >>> # example 6:
+            >>> # dtype is bool and the dtype of x is float32
+            >>> x = paddle.zeros((1,2)).astype("float32")
+            >>> out6 = paddle.randint_like(x, low=-5, high=5, dtype="bool")
+            >>> print(out6)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[1, 2], dtype=bool, place=Place(cpu), stop_gradient=True,
+            [[False, True ]])
+            >>> # doctest: -SKIP
+            >>> print(out6.dtype)
+            paddle.bool
+
+            >>> # example 7:
+            >>> # dtype is int32 and the dtype of x is float32
+            >>> x = paddle.zeros((1,2)).astype("float32")
+            >>> out7 = paddle.randint_like(x, low=-5, high=5, dtype="int32")
+            >>> print(out7)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[1, 2], dtype=int32, place=Place(cpu), stop_gradient=True,
+            [[-2, -2]])
+            >>> # doctest: -SKIP
+            >>> print(out7.dtype)
+            paddle.int32
+
+            >>> # example 8:
+            >>> # dtype is int64 and the dtype of x is float32
+            >>> x = paddle.zeros((1,2)).astype("float32")
+            >>> out8 = paddle.randint_like(x, low=-5, high=5, dtype="int64")
+            >>> print(out8)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[1, 2], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [[-5,  4]])
+            >>> # doctest: -SKIP
+            >>> print(out8.dtype)
+            paddle.int64
+
+            >>> # example 9:
+            >>> # dtype is int64 and the dtype of x is bool
+            >>> x = paddle.zeros((1,2)).astype("bool")
+            >>> out9 = paddle.randint_like(x, low=-5, high=5, dtype="int64")
+            >>> print(out9)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[1, 2], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [[ 1, -2]])
+            >>> # doctest: -SKIP
+            >>> print(out9.dtype)
+            paddle.int64
 
     """
     if high is None:
@@ -1034,13 +1148,21 @@ def randperm(n, dtype="int64", name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
+            >>> import paddle
 
-            out1 = paddle.randperm(5)
-            # [4, 1, 2, 3, 0]  # random
+            >>> out1 = paddle.randperm(5)
+            >>> print(out1)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[5], dtype=int64, place=Place(cpu), stop_gradient=True,
+            [3, 0, 1, 4, 2])
+            >>> #doctest: -SKIP
 
-            out2 = paddle.randperm(7, 'int32')
-            # [1, 6, 2, 0, 4, 3, 5]  # random
+            >>> out2 = paddle.randperm(7, 'int32')
+            >>> print(out2)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[7], dtype=int32, place=Place(cpu), stop_gradient=True,
+            [3, 2, 0, 6, 5, 4, 1])
+            >>> #doctest: -SKIP
 
     """
     if not isinstance(dtype, core.VarDesc.VarType):
@@ -1091,29 +1213,41 @@ def rand(shape, dtype=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-
-            # example 1: attr shape is a list which doesn't contain Tensor.
-            out1 = paddle.rand(shape=[2, 3])
-            # [[0.451152  , 0.55825245, 0.403311  ],  # random
-            #  [0.22550228, 0.22106001, 0.7877319 ]]  # random
-
-            # example 2: attr shape is a list which contains Tensor.
-            dim1 = paddle.to_tensor(2, 'int64')
-            dim2 = paddle.to_tensor(3, 'int32')
-            out2 = paddle.rand(shape=[dim1, dim2, 2])
-            # [[[0.8879919 , 0.25788337],  # random
-            #   [0.28826773, 0.9712097 ],  # random
-            #   [0.26438272, 0.01796806]],  # random
-            #  [[0.33633623, 0.28654453],  # random
-            #   [0.79109055, 0.7305809 ],  # random
-            #   [0.870881  , 0.2984597 ]]]  # random
-
-            # example 3: attr shape is a Tensor, the data type must be int64 or int32.
-            shape_tensor = paddle.to_tensor([2, 3])
-            out3 = paddle.rand(shape_tensor)
-            # [[0.22920267, 0.841956  , 0.05981819],  # random
-            #  [0.4836288 , 0.24573246, 0.7516129 ]]  # random
+            >>> import paddle
+
+            >>> # example 1: attr shape is a list which doesn't contain Tensor.
+            >>> out1 = paddle.rand(shape=[2, 3])
+            >>> print(out1)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[0.68532258, 0.69431782, 0.44835982],
+             [0.13204314, 0.48128194, 0.36574543]])
+            >>> # doctest: -SKIP
+
+            >>> # example 2: attr shape is a list which contains Tensor.
+            >>> dim1 = paddle.to_tensor(2, 'int64')
+            >>> dim2 = paddle.to_tensor(3, 'int32')
+            >>> out2 = paddle.rand(shape=[dim1, dim2, 2])
+            >>> print(out2)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3, 2], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[[0.62102991, 0.45255184],
+              [0.81386960, 0.22463219],
+              [0.87946558, 0.28097662]],
+             [[0.36565998, 0.63203937],
+              [0.58640617, 0.92696166],
+              [0.85060406, 0.38138932]]])
+            >>> # doctest: -SKIP
+
+            >>> # example 3: attr shape is a Tensor, the data type must be int64 or int32.
+            >>> shape_tensor = paddle.to_tensor([2, 3])
+            >>> out3 = paddle.rand(shape_tensor)
+            >>> print(out3)
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[0.77650446, 0.12870903, 0.05153799],
+             [0.27029657, 0.03963696, 0.42487794]])
+            >>> # doctest: -SKIP
     """
     return uniform(shape, dtype, min=0.0, max=1.0, name=name)
 
@@ -1140,14 +1274,17 @@ def exponential_(x, lam=1.0, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle
-            paddle.set_device('cpu')
-            paddle.seed(100)
-
-            x = paddle.empty([2,3])
-            x.exponential_()
-            # [[0.80643415, 0.23211166, 0.01169797],
-            #  [0.72520673, 0.45208144, 0.30234432]]
+            >>> import paddle
+            >>> paddle.set_device('cpu')
+            >>> paddle.seed(100)
+
+            >>> x = paddle.empty([2,3])
+            >>> x.exponential_()
+            >>> # doctest: +SKIP("Random output")
+            Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[0.80643415, 0.23211166, 0.01169797],
+             [0.72520679, 0.45208144, 0.30234432]])
+            >>> # doctest: -SKIP
 
     """
     if in_dynamic_mode():

From 8941263d45c7b37bf0fbe039bf9f5852b57ca0f6 Mon Sep 17 00:00:00 2001
From: Lu Qi <61354321+MarioLulab@users.noreply.github.com>
Date: Fri, 22 Sep 2023 10:21:19 +0800
Subject: [PATCH 053/115] [GLCC]Part-3: Support jit.save and jit.load for
 pylayer op (#57066)

* complete static_pylayer op

* finish static_pylayer op context manager

* finish single test

* append import path

* maybe modify test/ir/inference

* percept static_pylayer op in dy2st
---
 paddle/fluid/framework/prune.cc               | 181 +++++++--
 python/paddle/jit/dy2static/py_layer.py       |  15 +-
 python/paddle/static/io.py                    |  14 +
 python/paddle/static/nn/static_pylayer.py     |  35 +-
 test/dygraph_to_static/test_pylayer.py        | 344 ++++++++++++++++-
 test/legacy_test/test_jit_save_load.py        |   4 +-
 .../test_program_prune_backward.py            |  89 +++++
 test/legacy_test/test_prune.py                | 365 ++++++++++--------
 test/legacy_test/test_static_pylayer.py       | 252 +++++++++++-
 9 files changed, 1078 insertions(+), 221 deletions(-)

diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc
index d4c2021d5f6e1..93467b549d6e9 100644
--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
@@ -30,6 +30,8 @@ const char kRecurrent[] = "recurrent";  // NOLINT
 const char kStates[] = "states";        // NOLINT
 const char kExStates[] = "ex_states";   // NOLINT
 
+const char kPyLayer[] = "pylayer";  // NOLINT
+
 bool HasDependentInputVar(
     const proto::OpDesc& op_desc,
     const std::unordered_set<std::string>& dependent_vars) {
@@ -86,6 +88,23 @@ int GetSubBlockIndex(const proto::OpDesc& op_desc) {
   return -1;
 }
 
+void GetSubBlocksIndices(const proto::OpDesc& op_desc,
+                         std::vector<int>* indices) {
+  for (auto& attr : op_desc.attrs()) {
+    if (attr.type() == proto::AttrType::BLOCKS) {
+      PADDLE_ENFORCE_GT(
+          attr.blocks_idx_size(),
+          0,
+          platform::errors::NotFound(
+              "Attribute blocks is not found in operator %s", op_desc.type()));
+      indices->resize(attr.blocks_idx_size());
+      for (int i = 0; i < attr.blocks_idx_size(); i++) {
+        (*indices)[i] = attr.blocks_idx(i);
+      }
+    }
+  }
+}
+
 void SetSubBlockIndex(proto::OpDesc* op_desc, int sub_idx) {
   for (auto& attr : *op_desc->mutable_attrs()) {
     if (attr.type() == proto::AttrType::BLOCK) {
@@ -99,10 +118,43 @@ void SetSubBlockIndex(proto::OpDesc* op_desc, int sub_idx) {
   }
 }
 
+void SetSubBlocksIndices(proto::OpDesc* op_desc,
+                         const std::vector<int>& sub_indices) {
+  for (auto& attr : *op_desc->mutable_attrs()) {
+    if (attr.type() == proto::AttrType::BLOCKS) {
+      PADDLE_ENFORCE_GT(
+          attr.blocks_idx_size(),
+          0,
+          platform::errors::NotFound(
+              "Attribute blocks is not found in operator %s", op_desc->type()));
+      attr.clear_blocks_idx();
+      for (auto idx : sub_indices) {
+        attr.add_blocks_idx(idx);
+      }
+    }
+  }
+}
+
 bool HasSubBlock(const proto::OpDesc& op_desc) {
   return GetSubBlockIndex(op_desc) > 0;
 }
 
+bool HasSubBlocks(const proto::OpDesc& op_desc) {
+  // ``blocks_idx_size() == 0`` indicates no sub blocks.
+  for (auto& attr : op_desc.attrs()) {
+    if (attr.type() == proto::AttrType::BLOCKS) {
+      PADDLE_ENFORCE_GT(
+          attr.blocks_idx_size(),
+          0,
+          platform::errors::NotFound(
+              "Attribute blocks is not found in operator %s", op_desc.type()));
+      return true;
+    }
+  }
+
+  return false;
+}
+
 int GetOpRole(const proto::OpDesc& op_desc) {
   for (auto& attr : op_desc.attrs()) {
     if (attr.name() == OpProtoAndCheckerMaker::OpRoleAttrName()) {
@@ -150,14 +202,15 @@ int FindMapByValue(const std::map<int, int>& m, int val) {
 }
 
 // In other two cases, the op that has feed vars as output vars is dependent:
-// 1. op has subblock, like while/for/ifelse/recurrent
+// 1. op has subblock, like while/for/ifelse/recurrent/pylayer
 // 2. op is in subblock
 bool IsSubBlockDependent(const proto::OpDesc& op_desc,
                          const std::set<std::string>& feed_vars,
                          int parent_block_id) {
   for (auto& var : op_desc.outputs()) {
     for (auto& argu : var.arguments()) {
-      if ((HasSubBlock(op_desc) || parent_block_id != -1) &&
+      if ((HasSubBlock(op_desc) || HasSubBlocks(op_desc) ||
+           parent_block_id != -1) &&
           feed_vars.count(argu) != 0) {
         return true;
       }
@@ -289,7 +342,7 @@ void prune_impl(const proto::ProgramDesc& input,
     if (should_run[i]) {
       auto* op = op_field->Add();
       *op = input.blocks(block_id).ops(static_cast<int>(i));
-      if (HasSubBlock(*op)) {
+      if (HasSubBlock(*op) || HasSubBlocks(*op)) {
         VLOG(2) << "Pruning op which has sub block: " << op->type();
         // create sub_block_dependent_vars here to help prune the sub block
         std::unordered_set<std::string> sub_block_dependent_vars;
@@ -321,15 +374,41 @@ void prune_impl(const proto::ProgramDesc& input,
             }
           }
         }
-        // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc
-        // output_block_id is the idx of the current block in the output desc
-        prune_impl(input,
-                   output,
-                   GetSubBlockIndex(*op),
-                   output_block_id,
-                   &sub_block_dependent_vars,
-                   feed_var_names,
-                   pruned_origin_block_id_map);
+        if (HasSubBlock(*op)) {
+          // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc
+          // output_block_id is the idx of the current block in the output desc
+          prune_impl(input,
+                     output,
+                     GetSubBlockIndex(*op),
+                     output_block_id,
+                     &sub_block_dependent_vars,
+                     feed_var_names,
+                     pruned_origin_block_id_map);
+        } else if (HasSubBlocks(*op)) {
+          // GetSubBlocksIndices(*op) are the indices of the sub_blocks in the
+          // input desc output_block_id is the idx of the current block in the
+          // output desc
+          std::vector<int> sub_indices;
+          GetSubBlocksIndices(*op, &sub_indices);
+          for (auto& sub_index : sub_indices) {
+            // create a copy of dependent_vars to avoid being overwrited by the
+            // other sub_block
+            std::unordered_set<std::string> dependent_vars_copy =
+                sub_block_dependent_vars;
+            prune_impl(input,
+                       output,
+                       sub_index,
+                       output_block_id,
+                       &dependent_vars_copy,
+                       feed_var_names,
+                       pruned_origin_block_id_map);
+          }
+        } else {
+          PADDLE_ENFORCE(false,
+                         platform::errors::PreconditionNotMet(
+                             "Attr Block or Blocks must exist when recursively "
+                             "calling prune_impl"));
+        }
       }
     }
   }
@@ -402,12 +481,29 @@ std::map<int, int> Prune(const proto::ProgramDesc& input,
         int origin_sub_idx = GetSubBlockIndex(op_desc);
         auto sub_idx =
             FindMapByValue(pruned_origin_block_id_map, origin_sub_idx);
-        PADDLE_ENFORCE_NE(sub_idx,
-                          -1,
-                          platform::errors::NotFound(
-                              "The origin sub block id should be found in "
-                              "pruned_progin_block_id_map"));
+        PADDLE_ENFORCE_NE(
+            sub_idx,
+            -1,
+            platform::errors::NotFound(
+                "The origin sub block id should be found in "
+                "pruned_progin_block_id_map when the op has sub_block"));
         SetSubBlockIndex(&op_desc, sub_idx);
+      } else if (HasSubBlocks(op_desc)) {
+        std::vector<int> origin_sub_indices;
+        GetSubBlocksIndices(op_desc, &origin_sub_indices);
+        std::vector<int> sub_indices;
+        for (int index : origin_sub_indices) {
+          auto sub_idx = FindMapByValue(pruned_origin_block_id_map, index);
+          PADDLE_ENFORCE_NE(
+              sub_idx,
+              -1,
+              platform::errors::NotFound(
+                  "The origin sub block id should be found in "
+                  "pruned_progin_block_id_map when the op has sub_blocks"));
+          sub_indices.push_back(sub_idx);
+        }
+
+        SetSubBlocksIndices(&op_desc, sub_indices);
       }
     }
   }
@@ -441,6 +537,19 @@ void PruneBackwardImpl(proto::BlockDesc* origin, proto::BlockDesc* pruned) {
       AppendOpInputVarNames(op_desc, &op_input_vars);
       AppendOpOutputVarNames(op_desc, &op_output_vars);
       *op = op_desc;
+
+      // if the type of op is "pylayer", we need to update the ``blocks``
+      // attribute because the backward block will be pruned
+      if (op->type() == kPyLayer && HasSubBlocks(*op)) {
+        std::vector<int> sub_indices;
+        GetSubBlocksIndices(*op, &sub_indices);
+        if (sub_indices.size() > 1) {
+          // sub_indices contains both forward block id and backward block id
+          std::vector<int> new_sub_indices(sub_indices.begin(),
+                                           sub_indices.end() - 1);
+          SetSubBlocksIndices(op, new_sub_indices);
+        }
+      }
     }
   }
 
@@ -471,9 +580,10 @@ std::tuple<framework::ProgramDesc, std::map<int, int>> PruneBackward(
   // Copy original ProgramDesc, origin can't be change
   framework::ProgramDesc origin_clone(origin);
 
-  // Step 1. check if the program contains grad loss operator.
-  // If not, the program need no pruning.
+  // Step 1. check if the program contains grad loss operator or pylayer
+  // operator. If not, the program need no pruning.
   bool has_loss_grad_op = false;
+  bool has_pylayer_op = false;
   std::queue<int> block_contains_loss;
   std::queue<int> block_contains_loss_grad;
   for (size_t i = 0; i < origin_clone.Size(); i++) {
@@ -485,13 +595,15 @@ std::tuple<framework::ProgramDesc, std::map<int, int>> PruneBackward(
                       static_cast<int>(OpRole::kLoss))) {
         op->SetIsTarget(false);
         has_loss_grad_op = true;
-        break;
+      }
+      if (op->Type() == kPyLayer) {
+        has_pylayer_op = true;
       }
     }
   }
 
   std::map<int, int> pruned_progin_block_id_map;
-  if (!has_loss_grad_op) {
+  if (!has_loss_grad_op && !has_pylayer_op) {
     // No pruning, fast return a copy of the origin ProgramDesc with an empty
     // map, means default mapped, i.e.{0:0, 1:1, ..., n:n}.
     return std::make_tuple(framework::ProgramDesc(origin_clone),
@@ -544,12 +656,29 @@ std::tuple<framework::ProgramDesc, std::map<int, int>> PruneBackward(
         int origin_sub_idx = GetSubBlockIndex(op_desc);
         auto sub_idx =
             FindMapByValue(pruned_progin_block_id_map, origin_sub_idx);
-        PADDLE_ENFORCE_NE(sub_idx,
-                          -1,
-                          platform::errors::NotFound(
-                              "The origin sub block id is not found in "
-                              "pruned_progin_block_id_map"));
+        PADDLE_ENFORCE_NE(
+            sub_idx,
+            -1,
+            platform::errors::NotFound(
+                "The origin sub block id is not found in "
+                "pruned_progin_block_id_map when the op has sub_block"));
         SetSubBlockIndex(&op_desc, sub_idx);
+      } else if (HasSubBlocks(op_desc)) {
+        std::vector<int> origin_sub_indices;
+        GetSubBlocksIndices(op_desc, &origin_sub_indices);
+        std::vector<int> sub_indices;
+        for (int index : origin_sub_indices) {
+          auto sub_idx = FindMapByValue(pruned_progin_block_id_map, index);
+          PADDLE_ENFORCE_NE(
+              sub_idx,
+              -1,
+              platform::errors::NotFound(
+                  "The origin sub block id should be found in "
+                  "pruned_progin_block_id_map when the op has sub_blocks"));
+          sub_indices.push_back(sub_idx);
+        }
+
+        SetSubBlocksIndices(&op_desc, sub_indices);
       }
     }
   }
diff --git a/python/paddle/jit/dy2static/py_layer.py b/python/paddle/jit/dy2static/py_layer.py
index 1d238e667c653..b32397b0aa3ee 100644
--- a/python/paddle/jit/dy2static/py_layer.py
+++ b/python/paddle/jit/dy2static/py_layer.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import functools
+import inspect
 
 from paddle.base.framework import Variable
 from paddle.common_ops_import import LayerHelper
@@ -73,9 +74,19 @@ def __init__(self, dyfunc_self):
         )
 
     # NOTE: only support position args and Variables Now
-    def apply(self, *args):
+    def apply(self, *args, **kwargs):
+        # rearrange `position-args + keyword-args` into `position-args`
+        dyfunc_sig = inspect.signature(self.dyfunc_self.forward)
+        bound_args = dyfunc_sig.bind(self.dyfunc_self, *args, **kwargs)
+        bound_args.apply_defaults()
+        input_args = [
+            item
+            for i, item in enumerate(bound_args.arguments.values())
+            if i > 0
+        ]  # index 0 indicate `dyfunc_self` which shouldn't be put into `input_args`
+
         return static_pylayer(
             forward_fn=self.forward_fn_with_ctx,
-            inputs=list(args),
+            inputs=input_args,
             backward_fn=self.backward_fn_with_ctx,
         )
diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py
index 8f68f3f9e89bf..943e8525ba466 100644
--- a/python/paddle/static/io.py
+++ b/python/paddle/static/io.py
@@ -274,6 +274,20 @@ def normalize_program(program, feed_vars, fetch_vars, **kwargs):
         op.desc.set_is_target(False)
         if op.type == "feed" or op.type == "fetch":
             remove_op_idx.append(i)
+
+        if op.type == "pylayer":
+            sub_blocks_ids = op._blocks_attr_ids("blocks")
+            if len(sub_blocks_ids) > 1:
+                # pylayer op ``blocks`` attr contains forward block id and backward block id
+                backward_block_id = sub_blocks_ids[-1]
+                # remove backward block
+                copy_program.blocks.pop(backward_block_id)
+                # update attrs ``blocks``
+                reserverd_blocks = []
+                for block_id in sub_blocks_ids[:-1]:
+                    reserverd_blocks.append(copy_program.block(block_id))
+                op._update_desc_attr("blocks", reserverd_blocks)
+
     for idx in remove_op_idx[::-1]:
         global_block._remove_op(idx)
     copy_program.desc.flush()
diff --git a/python/paddle/static/nn/static_pylayer.py b/python/paddle/static/nn/static_pylayer.py
index 91d0f9d2351ff..3dcf35e50e54b 100644
--- a/python/paddle/static/nn/static_pylayer.py
+++ b/python/paddle/static/nn/static_pylayer.py
@@ -45,11 +45,13 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 class StaticPyLayerBlock:
     def __init__(self, inputs, name=None, pylayer_context=None):
-        for each_input in inputs:
-            check_type(each_input, "input", Variable, "StaticPyLayerBlock")
+        # used to specify the Variable type `Input` to `pylayer` op
+        self.fwd_inputs = [
+            each_input
+            for each_input in inputs
+            if isinstance(each_input, Variable)
+        ]  # filter non-Variable inputs
 
-        # used to specify the `Input` to `pylayer` op
-        self.fwd_inputs = inputs
         # used to specify the `Out` to `pylayer` op
         self.fwd_outputs = []
 
@@ -105,7 +107,7 @@ def complete_backward_block(self):
         parent_block = self.helper.main_program.block(inside_block.parent_idx)
 
         self._backward_block_id = inside_block.idx
-        # set OpRole to `backward`
+        # Set OpRole to `backward`. The operators marked as `backward` are expected to be pruned in PruneBackward.
         for op in inside_block.ops:
             op_role_attr_name = (
                 core.op_proto_and_checker_maker.kOpRoleAttrName()
@@ -234,8 +236,6 @@ def copy_var_from_parent_block(parent_block_var, layer_helper):
     return current_block_var
 
 
-# TODO(MarioLulab):
-# Need to support non-Variable in ``inputs``
 def static_pylayer(forward_fn, inputs, backward_fn=None, name=None):
     """
     This API returns ``forward_fn(inputs)``, and two sub-block are created based on
@@ -344,7 +344,9 @@ def static_pylayer(forward_fn, inputs, backward_fn=None, name=None):
         origin_output = forward_fn(*inputs)
         if origin_output is not None:
             output = map_structure(copy_to_parent_func, origin_output)
-            mgr.fwd_outputs = flatten(output)
+            mgr.fwd_outputs = [
+                x for x in flatten(output) if isinstance(x, Variable)
+            ]
         else:
             mgr.fwd_outputs = []
 
@@ -358,7 +360,7 @@ def static_pylayer(forward_fn, inputs, backward_fn=None, name=None):
         # **Create the backward input** from the output of the op to build the
         # backward block, and then delete it.
         grad_var_ins = []
-        for fwd_var in flatten(output):
+        for fwd_var in pylayer_block_manager.fwd_outputs:
             fwd_var_name = fwd_var.name
             bwd_var_name = _append_grad_suffix_(fwd_var_name)
             if not current_block.desc.has_var_recursive(fwd_var_name.encode()):
@@ -405,7 +407,7 @@ def static_pylayer(forward_fn, inputs, backward_fn=None, name=None):
                     but got {len(forward_input_names)} and {len(flat_grad_origin)}"
 
                 # Step4. Rename var name with suffix of "@GRAD"
-                for bwd_output_name, fwd_input_name in zip(
+                for bwd_output, fwd_input_name in zip(
                     flat_grad_origin, forward_input_names
                 ):
                     # NOTE(MarioLulab): Because `flat_grad_origin` are the Variables inside the backward block, which one by one corresponds
@@ -428,12 +430,13 @@ def static_pylayer(forward_fn, inputs, backward_fn=None, name=None):
                     # TODO(MarioLulab): We will validate the assumption above is whether a strong hypothesis or not.
 
                     # attach old var name into new
-                    bwd_out_new = _append_grad_suffix_(
-                        fwd_input_name
-                    )  # "X" => "X@GRAD"
-                    mgr.var_old_to_new[
-                        bwd_output_name.name
-                    ] = bwd_out_new  # e.g. "tmp_0.mean_0": "X@GRAD"
+                    if isinstance(bwd_output, Variable):
+                        bwd_out_new = _append_grad_suffix_(
+                            fwd_input_name
+                        )  # "X" => "X@GRAD"
+                        mgr.var_old_to_new[
+                            bwd_output.name
+                        ] = bwd_out_new  # e.g. "tmp_0.mean_0": "X@GRAD"
 
         # **Delete the backward input**
         for bwd_var in grad_var_ins:
diff --git a/test/dygraph_to_static/test_pylayer.py b/test/dygraph_to_static/test_pylayer.py
index 88558e3d628fb..ee2d1248e5f63 100644
--- a/test/dygraph_to_static/test_pylayer.py
+++ b/test/dygraph_to_static/test_pylayer.py
@@ -15,9 +15,12 @@
 """Tests for PyLayer of Dynamic-to-Static.
 Only test simple cases here."""
 
+import os
+import tempfile
 import unittest
 
 import numpy as np
+from legacy_test.test_jit_save_load import train
 
 import paddle
 from paddle.autograd.py_layer import PyLayer
@@ -51,7 +54,7 @@ def backward(ctx, dy):
 class scaled_layer_2(PyLayer):
     @staticmethod
     def forward(ctx, x1, x2):
-        y = x1 * x2
+        y = 3 * x1 + x2 / 5
         return y
 
     @staticmethod
@@ -75,6 +78,78 @@ def backward(ctx, dy):
         return grad
 
 
+class cus_tanh_2(PyLayer):
+    @staticmethod
+    def forward(ctx, x, func1, func2=paddle.square):
+        ctx.func = func2
+        y = func1(x)
+        ctx.save_for_backward(y)
+        return y
+
+    @staticmethod
+    def backward(ctx, dy):
+        (y,) = ctx.saved_tensor()
+        grad = dy * (1 - ctx.func(y))
+        return grad
+
+
+class cus_tanh_3(PyLayer):
+    @staticmethod
+    def forward(ctx, x1, x2, func1, func2=paddle.square):
+        ctx.func = func2
+        y1 = func1(x1)
+        y2 = func1(x2)
+        ctx.save_for_backward(y1, y2)
+        return 1, None, y1, y2, ''
+
+    @staticmethod
+    def backward(ctx, dy1, dy2):
+        y1, y2 = ctx.saved_tensor()
+        re1 = dy1 * (1 - ctx.func(y1))
+        re2 = dy2 * (1 - paddle.square(y2))
+        return re1, None
+
+
+def user_defined_tanh(x):
+    y = paddle.tanh(x)
+    return y
+
+
+def user_defined_square(x):
+    y = paddle.square(x)
+    return y
+
+
+class cus_tanh_4(PyLayer):
+    @staticmethod
+    def forward(ctx, x, func, name="cus_tanh_4"):
+        ctx.func = func
+        y = user_defined_tanh(x)
+        ctx.save_for_backward(y)
+        return y
+
+    @staticmethod
+    def backward(ctx, dy):
+        (y,) = ctx.saved_tensor()
+        grad = dy * (1 - ctx.func(y))
+        return grad
+
+
+class cus_sigmoid(PyLayer):
+    @staticmethod
+    def forward(ctx, x, func1, func2):
+        ctx.func = func2
+        y = 1 / (1 + func1(-x))
+        ctx.save_for_backward(x)
+        return y
+
+    @staticmethod
+    def backward(ctx, dy):
+        (x,) = ctx.saved_tensor()
+        grad = dy * ctx.func(x) * (1 - ctx.func(x))
+        return grad
+
+
 class nested_layer(PyLayer):
     @staticmethod
     def forward(ctx, x1, x2):
@@ -92,9 +167,9 @@ def backward(ctx, dy):
 
 
 class SimpleNet_1(paddle.nn.Layer):
-    def __init__(self):
+    def __init__(self, in_size, out_size):
         super().__init__()
-        self.linear = paddle.nn.Linear(4, 8)
+        self.linear = paddle.nn.Linear(in_size, out_size)
 
     @paddle.jit.to_static
     def forward(self, data):
@@ -103,6 +178,30 @@ def forward(self, data):
         return z
 
 
+class SimpleNet_2(paddle.nn.Layer):
+    def __init__(self, in_size, out_size):
+        super().__init__()
+        self.linear = paddle.nn.Linear(in_size, out_size)
+
+    def forward(self, x):
+        y = self.linear(x)
+        out = cus_tanh_2.apply(y, func1=paddle.tanh)
+        return out
+
+
+class SimpleNet_3(paddle.nn.Layer):
+    def __init__(self, in_size, out_size):
+        super().__init__()
+        self.linear = paddle.nn.Linear(in_size, out_size)
+
+    def forward(self, x):
+        y = self.linear(x)
+        out = cus_sigmoid.apply(
+            y, func1=paddle.exp, func2=paddle.nn.functional.sigmoid
+        )
+        return out
+
+
 class SimpleNetInplace(paddle.nn.Layer):
     def __init__(self):
         super().__init__()
@@ -115,6 +214,48 @@ def forward(self, data):
         return z
 
 
+class SimplePyLayerNet(paddle.nn.Layer):
+    def __init__(self, in_size, out_size):
+        super().__init__()
+        self.linear = paddle.nn.Linear(in_size, out_size)
+
+    @paddle.jit.to_static
+    def forward(self, x):
+        y = self.linear(x)
+        out = cus_tanh_2.apply(y, func1=paddle.tanh)
+        out = paddle.mean(out)
+        return out
+
+
+class SimplePyLayerNetMultiIn(paddle.nn.Layer):
+    def __init__(self, in_size, out_size):
+        super().__init__()
+        self.linear1 = paddle.nn.Linear(in_size, out_size)
+        self.linear2 = paddle.nn.Linear(in_size, out_size)
+
+    @paddle.jit.to_static
+    def forward(self, x1, x2):
+        y1 = self.linear1(x1)
+        y2 = self.linear1(x2)
+        out = cus_tanh_2.apply(y1, func1=paddle.tanh)
+        out = out + y2
+        out = paddle.mean(out)
+        return out
+
+
+class SimplePyLayerNetStopGrad(paddle.nn.Layer):
+    def __init__(self, in_size, out_size):
+        super().__init__()
+        self.linear = paddle.nn.Linear(in_size, out_size)
+
+    @paddle.jit.to_static
+    def forward(self, x):
+        y = self.linear(x)
+        y.stop_gradient = True
+        out = cus_tanh_2.apply(y, func1=paddle.tanh)
+        return out
+
+
 class TestPyLayerBase(unittest.TestCase):
     def setUp(self):
         self.place = "gpu" if paddle.is_compiled_with_cuda() else "cpu"
@@ -269,10 +410,69 @@ def test_func(x1, x2):
 
         self._run_and_compare(input1, input2)
 
+    def test_apply_kwargs_pylayer(self):
+        @paddle.jit.to_static
+        def test_func(x1, x2):
+            y = scaled_layer_2.apply(x1=x2, x2=x1)
+            return y
+
+        self.dygraph_func = test_func
+
+        input1 = paddle.randn([2, 3]).astype("float32")
+        input2 = paddle.randn([2, 3]).astype("float32")
+        input1.stop_gradient = False
+        input2.stop_gradient = False
+
+        self._run_and_compare(input1, input2)
+
+    def test_non_variable_inputs(self):
+        @paddle.jit.to_static
+        def test_func(x):
+            y = cus_tanh_2.apply(x, func1=paddle.tanh)
+            return y
+
+        self.dygraph_func = test_func
+
+        input1 = paddle.randn([2, 3]).astype("float32")
+        input1.stop_gradient = False
+
+        self._run_and_compare(input1)
+
+    def test_simple_pylayer_return_none_with_no_grad(self):
+        @paddle.jit.to_static
+        def test_func(input1, input2):
+            z = cus_tanh_3.apply(input1, input2, paddle.tanh, paddle.square)
+            z = z[2] + z[3]
+            return z
+
+        self.dygraph_func = test_func
+
+        input1 = paddle.randn([2, 3]).astype("float32")
+        input2 = paddle.randn([2, 3]).astype("float32")
+        input1.stop_gradient = False
+        input2.stop_gradient = True
+
+        self._run_and_compare(input1, input2)
+
+    def test_non_variable_inputs_and_userdefined_call(self):
+        @paddle.jit.to_static
+        def test_func(input1):
+            y = cus_tanh_4.apply(
+                input1, func=user_defined_square, name="cus_tanh_test"
+            )
+            return y
+
+        self.dygraph_func = test_func
+
+        input1 = paddle.randn([2, 3]).astype("float32")
+        input1.stop_gradient = False
+
+        self._run_and_compare(input1)
+
 
 class TestPyLayerInsideNet(TestPyLayerBase):
     def test_single_in_single_out(self):
-        simple_net = SimpleNet_1()
+        simple_net = SimpleNet_1(in_size=4, out_size=8)
         self.dygraph_func = simple_net
 
         input1 = paddle.randn([3, 4]).astype("float32")
@@ -287,6 +487,142 @@ def test_inplace(self):
         input1.stop_gradient = False
         self._run_and_compare(input1)
 
+    def test_non_variable_args_pylayernet(self):
+        simple_net = SimplePyLayerNet(in_size=4, out_size=8)
+        self.dygraph_func = simple_net
+
+        input1 = paddle.randn([3, 4]).astype("float32")
+        input1.stop_gradient = False
+        self._run_and_compare(input1)
+
+    def test_pylayer_net_with_no_grad(self):
+        simple_net = SimplePyLayerNetMultiIn(in_size=4, out_size=8)
+        self.dygraph_func = simple_net
+
+        input1 = paddle.randn([3, 4]).astype("float32")
+        input2 = paddle.randn([3, 4]).astype("float32")
+        input1.stop_gradient = False
+        input2.stop_gradient = True
+        self._run_and_compare(input1, input2)
+
+
+class PyLayerTrainHelper(unittest.TestCase):
+    def setUp(self):
+        self.place = "gpu" if paddle.is_compiled_with_cuda() else "cpu"
+
+    def _run_train(self, to_static, layer_builder, build_strategy=None):
+        """
+        Tests model decorated by `dygraph_to_static_output` in static graph mode. For users, the model is defined in dygraph mode and trained in static graph mode.
+        """
+        paddle.jit.enable_to_static(to_static)
+
+        paddle.set_device(self.place)
+        np.random.seed(SEED)
+        paddle.seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
+
+        # net = self.build_layer()
+        net = layer_builder()
+        if to_static:
+            net = paddle.jit.to_static(net, build_strategy=build_strategy)
+
+        _, _, avg_loss = train(net)
+        return avg_loss.numpy()
+
+
+class TestTrainingPyLayer(PyLayerTrainHelper):
+    def test_tanh_pylayer(self):
+        build_layer = lambda: SimpleNet_2(784, 20)
+
+        static_loss = self._run_train(to_static=True, layer_builder=build_layer)
+        dygraph_loss = self._run_train(
+            to_static=False, layer_builder=build_layer
+        )
+
+        np.testing.assert_allclose(
+            static_loss,
+            dygraph_loss,
+            rtol=1e-05,
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
+        )
+
+    def test_sigmoid_pylayer(self):
+        build_layer = lambda: SimpleNet_3(784, 20)
+
+        static_loss = self._run_train(to_static=True, layer_builder=build_layer)
+        dygraph_loss = self._run_train(
+            to_static=False, layer_builder=build_layer
+        )
+
+        np.testing.assert_allclose(
+            static_loss,
+            dygraph_loss,
+            rtol=1e-05,
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
+        )
+
+    def test_pylayer_net_no_grad(self):
+        build_layer = lambda: SimplePyLayerNetStopGrad(784, 20)
+
+        static_loss = self._run_train(to_static=True, layer_builder=build_layer)
+        dygraph_loss = self._run_train(
+            to_static=False, layer_builder=build_layer
+        )
+
+        np.testing.assert_allclose(
+            static_loss,
+            dygraph_loss,
+            rtol=1e-05,
+            err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}',
+        )
+
+
+class TestPyLayerJitSaveLoad(unittest.TestCase):
+    def setUp(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.model_path = os.path.join(
+            self.temp_dir.name, "test_pylayer/jit_save_model"
+        )
+        # enable dygraph mode
+        paddle.base.enable_dygraph()
+        # config seed
+        paddle.seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
+
+    def tearDown(self):
+        self.temp_dir.cleanup()
+
+    def train_and_save_model(self, model_path=None):
+        layer = SimpleNet_1(784, 20)
+        example_inputs, layer, _ = train(layer)
+        final_model_path = model_path if model_path else self.model_path
+        orig_input_types = [type(x) for x in example_inputs]
+        paddle.jit.save(
+            layer=layer, path=final_model_path, input_spec=example_inputs
+        )
+        new_input_types = [type(x) for x in example_inputs]
+        self.assertEqual(orig_input_types, new_input_types)
+        return layer
+
+    def test_save_load(self):
+        # train and save model
+        train_layer = self.train_and_save_model()
+        # load model
+        loaded_layer = paddle.jit.load(self.model_path)
+        self.load_and_inference(train_layer, loaded_layer)
+
+    def load_and_inference(self, train_layer, infer_layer):
+        train_layer.eval()
+        infer_layer.eval()
+        # inference & compare
+        x = paddle.base.dygraph.to_variable(
+            np.random.random((1, 784)).astype('float32')
+        )
+        train_layer_result = train_layer(x).numpy()
+        infer_layer_result = infer_layer(x).numpy()
+
+        np.testing.assert_array_equal(train_layer_result, infer_layer_result)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/legacy_test/test_jit_save_load.py b/test/legacy_test/test_jit_save_load.py
index e2df76f475194..71c5c06a716b2 100644
--- a/test/legacy_test/test_jit_save_load.py
+++ b/test/legacy_test/test_jit_save_load.py
@@ -301,7 +301,7 @@ def forward_general(self, x):
 def train(layer, input_size=784, label_size=1):
     # create optimizer
     sgd = paddle.optimizer.SGD(
-        learning_rate=0.01, parameter_list=layer.parameters()
+        learning_rate=0.01, parameters=layer.parameters()
     )
     # create data loader
     train_loader = base.io.DataLoader.from_generator(capacity=5)
@@ -316,7 +316,7 @@ def train(layer, input_size=784, label_size=1):
         cost = layer(img)
 
         loss = paddle.nn.functional.cross_entropy(
-            cost, label, reduction='none', use_softmax=False
+            cost, label, reduction='none', use_softmax=True
         )
         avg_loss = paddle.mean(loss)
 
diff --git a/test/legacy_test/test_program_prune_backward.py b/test/legacy_test/test_program_prune_backward.py
index 237684e3b0bd9..581635d5a68ad 100755
--- a/test/legacy_test/test_program_prune_backward.py
+++ b/test/legacy_test/test_program_prune_backward.py
@@ -81,6 +81,27 @@ def loss2(pred, label):
     return avg_loss
 
 
+def pylayer_net(use_feed=None):
+    x = paddle.static.data(name="x", shape=[-1, 4], dtype='float32')
+    label = paddle.static.data('label', shape=[-1, 1], dtype='int64')
+
+    def forward_fn(x):
+        y = 3 * x
+        return y
+
+    def backward_fn(dy):
+        grad = paddle.exp(dy)
+        return grad
+
+    y = paddle.static.nn.static_pylayer(forward_fn, [x], backward_fn)
+    hidden = paddle.static.nn.fc(x=[y], size=4, activation="softmax")
+    loss = paddle.nn.functional.cross_entropy(
+        input=hidden, label=label, reduction='none', use_softmax=False
+    )
+    loss = paddle.mean(loss, name='mean_softmax_loss')
+    return loss
+
+
 def optimization_in_cond_net(with_optimize=False):
     x = paddle.static.data(name="x", shape=[-1, 4], dtype='float32')
     label = paddle.static.data('label', shape=[-1, 1], dtype='int64')
@@ -115,6 +136,31 @@ def loss2(opt, pred, label, with_optimize):
     return avg_loss
 
 
+def optimization_in_pylayer_net(with_optimize=False):
+    x = paddle.static.data(name="x", shape=[-1, 4], dtype='float32')
+    label = paddle.static.data('label', shape=[-1, 1], dtype='int64')
+
+    def forward_fn(x):
+        y = 3 * x
+        return y
+
+    def backward_fn(dy):
+        grad = paddle.exp(dy)
+        return grad
+
+    y = paddle.static.nn.static_pylayer(forward_fn, [x], backward_fn)
+    hidden = 3 * y
+    loss = paddle.nn.functional.softmax_with_cross_entropy(
+        logits=hidden, label=label
+    )
+    loss = paddle.mean(loss, name='mean_softmax_loss')
+    sgd = paddle.optimizer.SGD(learning_rate=0.1)
+    if with_optimize:
+        sgd.minimize(loss)
+
+    return loss
+
+
 class TestProgramPruneBackward(unittest.TestCase):
     def program_compare(self, program_a, program_b):
         assert isinstance(
@@ -249,6 +295,19 @@ def optimizer():
                 method=cond_net, feed_dict=feed_dict, optimizer=optimizer
             )
 
+    def test_pylayer(self):
+        def optimizer():
+            optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+            return optimizer
+
+        with self.program_scope_guard():
+            x_in = np.random.random(size=(10, 4)).astype('float32')
+            label_in = np.random.randint(1, size=(10, 1)).astype('int64')
+            feed_dict = {'x': x_in, 'label': label_in}
+            self.check_prune_correctness(
+                method=pylayer_net, feed_dict=feed_dict, optimizer=optimizer
+            )
+
     def test_optimization_in_cond(self):
         x_in = np.random.random(size=(10, 4)).astype('float32')
         label_in = np.random.randint(1, size=(10, 1)).astype('int64')
@@ -279,6 +338,36 @@ def test_optimization_in_cond(self):
         self.program_compare(test_prog_orig, test_prog_prune)
         self.assertEqual(loss_data_orig, loss_data_prune)
 
+    def test_optimization_in_pylayer(self):
+        x_in = np.random.random(size=(10, 4)).astype('float32')
+        label_in = np.random.randint(1, size=(10, 1)).astype('int64')
+        feed_dict = {'x': x_in, 'label': label_in}
+        with self.program_scope_guard():
+            loss = optimization_in_pylayer_net(False)
+            main_program = base.default_main_program()
+            test_prog_orig = main_program.clone(for_test=True)
+            place = core.CPUPlace()
+            exe = base.Executor(place)
+            exe.run(base.default_startup_program())
+            (loss_data_orig,) = exe.run(
+                test_prog_orig, feed=feed_dict, fetch_list=[loss.name]
+            )
+
+        with self.program_scope_guard():
+            loss = optimization_in_pylayer_net(True)
+            main_program = base.default_main_program()
+            test_prog_prune = main_program.clone(for_test=True)
+
+            place = core.CPUPlace()
+            exe = base.Executor(place)
+            exe.run(base.default_startup_program())
+            (loss_data_prune,) = exe.run(
+                test_prog_prune, feed=feed_dict, fetch_list=[loss.name]
+            )
+
+        self.program_compare(test_prog_orig, test_prog_prune)
+        self.assertEqual(loss_data_orig, loss_data_prune)
+
     @contextlib.contextmanager
     def program_scope_guard(self):
         prog = base.Program()
diff --git a/test/legacy_test/test_prune.py b/test/legacy_test/test_prune.py
index 00b96074ab5c2..91314d3c86b80 100644
--- a/test/legacy_test/test_prune.py
+++ b/test/legacy_test/test_prune.py
@@ -22,121 +22,82 @@
 from paddle.base import framework
 
 
-class TestPrune(unittest.TestCase):
-    def net(self):
-        x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
-        x.desc.set_need_check_feed(False)
-        label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
-        label.desc.set_need_check_feed(False)
-        y = paddle.static.nn.fc(x=[x], size=2, activation="softmax")
-        loss = paddle.nn.functional.cross_entropy(
-            input=y, label=label, reduction='none', use_softmax=False
-        )
-        loss = paddle.mean(x=loss)
-        return x, y, label, loss
-
-    def test_prune_with_input(self):
+class TestPruneBase(unittest.TestCase):
+    def run_net(self, net):
         program = framework.Program()
         startup_program = framework.Program()
-        block = program.global_block()
         with base.program_guard(program, startup_program):
-            (x, y, label, loss) = self.net()
-        self.assertEqual(len(block.ops), 5)
+            ret = net()
+
+        return ret, program
+
+    def check_prune_with_input(
+        self,
+        program,
+        feeded_var_names,
+        targets,
+        ops_before_pruned,
+        ops_after_pruned,
+    ):
+        block = program.global_block()
+        self.assertEqual(len(block.ops), len(ops_before_pruned))
         self.assertEqual(
             [op.type for op in block.ops],
-            [
-                "mul",
-                "elementwise_add",
-                "softmax",
-                "softmax_with_cross_entropy",
-                "reduce_mean",
-            ],
+            ops_before_pruned,
         )
         pruned_program = program._prune_with_input(
-            feeded_var_names=[y.name, label.name], targets=[loss]
+            feeded_var_names=feeded_var_names, targets=targets
+        )
+        self.assertEqual(
+            len(pruned_program.global_block().ops), len(ops_after_pruned)
         )
-        self.assertEqual(len(pruned_program.global_block().ops), 2)
         self.assertEqual(
             [op.type for op in pruned_program.global_block().ops],
-            ["softmax_with_cross_entropy", "reduce_mean"],
+            ops_after_pruned,
         )
 
-    def test_prune(self):
-        program = framework.Program()
-        startup_program = framework.Program()
+    def check_prune(
+        self, program, targets, ops_before_pruned, ops_after_pruned
+    ):
         block = program.global_block()
-        with base.program_guard(program, startup_program):
-            (x, y, label, loss) = self.net()
-        self.assertEqual(len(block.ops), 5)
+        self.assertEqual(len(block.ops), len(ops_before_pruned))
         self.assertEqual(
             [op.type for op in block.ops],
-            [
-                "mul",
-                "elementwise_add",
-                "softmax",
-                "softmax_with_cross_entropy",
-                "reduce_mean",
-            ],
+            ops_before_pruned,
+        )
+        pruned_program = program._prune(targets=targets)
+        self.assertEqual(
+            len(pruned_program.global_block().ops), len(ops_after_pruned)
         )
-        pruned_program = program._prune(targets=[loss])
-        self.assertEqual(len(pruned_program.global_block().ops), 5)
         self.assertEqual(
             [op.type for op in pruned_program.global_block().ops],
-            [
-                "mul",
-                "elementwise_add",
-                "softmax",
-                "softmax_with_cross_entropy",
-                "reduce_mean",
-            ],
+            ops_after_pruned,
         )
 
-    def test_prune_target_not_list(self):
-        program = framework.Program()
-        startup_program = framework.Program()
+    def check_prune_target_not_list(
+        self, program, targets, ops_before_pruned, ops_after_pruned
+    ):
         block = program.global_block()
-        with base.program_guard(program, startup_program):
-            (x, y, label, loss) = self.net()
-        self.assertEqual(len(block.ops), 5)
+        self.assertEqual(len(block.ops), len(ops_before_pruned))
         self.assertEqual(
             [op.type for op in block.ops],
-            [
-                "mul",
-                "elementwise_add",
-                "softmax",
-                "softmax_with_cross_entropy",
-                "reduce_mean",
-            ],
+            ops_before_pruned,
+        )
+        pruned_program = program._prune(targets=targets)
+        self.assertEqual(
+            len(pruned_program.global_block().ops), len(ops_after_pruned)
         )
-        pruned_program = program._prune(targets=loss)
-        self.assertEqual(len(pruned_program.global_block().ops), 5)
         self.assertEqual(
             [op.type for op in pruned_program.global_block().ops],
-            [
-                "mul",
-                "elementwise_add",
-                "softmax",
-                "softmax_with_cross_entropy",
-                "reduce_mean",
-            ],
+            ops_after_pruned,
         )
 
-    def test_prune_target_none(self):
-        program = framework.Program()
-        startup_program = framework.Program()
+    def check_prune_target_none(self, program, ops_before_pruned):
         block = program.global_block()
-        with base.program_guard(program, startup_program):
-            (x, y, label, loss) = self.net()
-        self.assertEqual(len(block.ops), 5)
+        self.assertEqual(len(block.ops), len(ops_before_pruned))
         self.assertEqual(
             [op.type for op in block.ops],
-            [
-                "mul",
-                "elementwise_add",
-                "softmax",
-                "softmax_with_cross_entropy",
-                "reduce_mean",
-            ],
+            ops_before_pruned,
         )
         try:
             pruned_program = program._prune(targets=None)
@@ -147,6 +108,96 @@ def test_prune_target_none(self):
             )
 
 
+class TestPrune(TestPruneBase):
+    def net(self):
+        x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
+        x.desc.set_need_check_feed(False)
+        label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
+        label.desc.set_need_check_feed(False)
+        y = paddle.static.nn.fc(x=[x], size=2, activation="softmax")
+        loss = paddle.nn.functional.cross_entropy(
+            input=y, label=label, reduction='none', use_softmax=False
+        )
+        loss = paddle.mean(x=loss)
+        return x, y, label, loss
+
+    def test_prune_with_input(self):
+        ops_before_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "softmax_with_cross_entropy",
+            "reduce_mean",
+        ]
+
+        ops_after_pruned = ["softmax_with_cross_entropy", "reduce_mean"]
+        (x, y, label, loss), program = self.run_net(self.net)
+
+        self.check_prune_with_input(
+            program,
+            [y.name, label.name],
+            [loss],
+            ops_before_pruned,
+            ops_after_pruned,
+        )
+
+    def test_prune(self):
+        ops_before_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "softmax_with_cross_entropy",
+            "reduce_mean",
+        ]
+
+        ops_after_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "softmax_with_cross_entropy",
+            "reduce_mean",
+        ]
+
+        (x, y, label, loss), program = self.run_net(self.net)
+
+        self.check_prune(program, [loss], ops_before_pruned, ops_after_pruned)
+
+    def test_prune_target_not_list(self):
+        ops_before_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "softmax_with_cross_entropy",
+            "reduce_mean",
+        ]
+
+        ops_after_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "softmax_with_cross_entropy",
+            "reduce_mean",
+        ]
+
+        (x, y, label, loss), program = self.run_net(self.net)
+
+        self.check_prune_target_not_list(
+            program, loss, ops_before_pruned, ops_after_pruned
+        )
+
+    def test_prune_target_none(self):
+        ops_before_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "softmax_with_cross_entropy",
+            "reduce_mean",
+        ]
+
+        (x, y, label, loss), program = self.run_net(self.net)
+        self.check_prune_target_none(program, ops_before_pruned)
+
+
 def mock(self, program, feed, fetch, optimize_ops):
     self.prune_called_times += 1
     return program
@@ -160,77 +211,83 @@ def _mock_guard(mock):
     base.Executor._prune_program = original
 
 
+def net1():
+    x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
+    x.desc.set_need_check_feed(False)
+    label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
+    label.desc.set_need_check_feed(False)
+    w_param_attrs = base.ParamAttr(
+        name="fc_weight",
+        learning_rate=0.5,
+        initializer=paddle.nn.initializer.Constant(1.0),
+        trainable=True,
+    )
+    y = paddle.static.nn.fc(
+        x=[x], size=2, activation="softmax", weight_attr=w_param_attrs
+    )
+    loss1 = paddle.nn.functional.cross_entropy(
+        input=y, label=label, reduction='none', use_softmax=False
+    )
+    loss1 = paddle.mean(x=loss1)
+    loss2 = paddle.nn.functional.cross_entropy(
+        input=y, label=label, reduction='none', use_softmax=False
+    )
+    loss2 = paddle.mean(x=loss2)
+    loss1.persistable = True
+    loss2.persistable = True
+    return x, y, label, loss1, loss2, w_param_attrs
+
+
+def net2():
+    x1 = paddle.static.data(name='x1', shape=[-1, 2], dtype='float32')
+    x1.desc.set_need_check_feed(False)
+    x2 = paddle.static.data(name='x2', shape=[-1, 2], dtype='float32')
+    x2.desc.set_need_check_feed(False)
+    label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
+    label.desc.set_need_check_feed(False)
+    w1_param_attrs = base.ParamAttr(
+        name="fc_weight1",
+        learning_rate=0.5,
+        initializer=paddle.nn.initializer.Constant(1.0),
+        trainable=True,
+    )
+    w2_param_attrs = base.ParamAttr(
+        name="fc_weight2",
+        learning_rate=0.5,
+        initializer=paddle.nn.initializer.Constant(1.0),
+        trainable=True,
+    )
+    y1 = paddle.static.nn.fc(
+        x=[x1], size=2, activation="softmax", weight_attr=w1_param_attrs
+    )
+    y2 = paddle.static.nn.fc(
+        x=[x2], size=2, activation="softmax", weight_attr=w2_param_attrs
+    )
+    loss1 = paddle.nn.functional.cross_entropy(
+        input=y1, label=label, reduction='none', use_softmax=False
+    )
+    loss1 = paddle.mean(x=loss1)
+    loss2 = paddle.nn.functional.cross_entropy(
+        input=y2, label=label, reduction='none', use_softmax=False
+    )
+    loss2 = paddle.mean(x=loss2)
+    return (
+        x1,
+        x2,
+        y1,
+        y2,
+        label,
+        loss1,
+        loss2,
+        w1_param_attrs,
+        w2_param_attrs,
+    )
+
+
 class TestExecutorRunAutoPrune(unittest.TestCase):
-    def net1(self):
-        x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
-        x.desc.set_need_check_feed(False)
-        label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
-        label.desc.set_need_check_feed(False)
-        w_param_attrs = base.ParamAttr(
-            name="fc_weight",
-            learning_rate=0.5,
-            initializer=paddle.nn.initializer.Constant(1.0),
-            trainable=True,
-        )
-        y = paddle.static.nn.fc(
-            x=[x], size=2, activation="softmax", weight_attr=w_param_attrs
-        )
-        loss1 = paddle.nn.functional.cross_entropy(
-            input=y, label=label, reduction='none', use_softmax=False
-        )
-        loss1 = paddle.mean(x=loss1)
-        loss2 = paddle.nn.functional.cross_entropy(
-            input=y, label=label, reduction='none', use_softmax=False
-        )
-        loss2 = paddle.mean(x=loss2)
-        loss1.persistable = True
-        loss2.persistable = True
-        return x, y, label, loss1, loss2, w_param_attrs
-
-    def net2(self):
-        x1 = paddle.static.data(name='x1', shape=[-1, 2], dtype='float32')
-        x1.desc.set_need_check_feed(False)
-        x2 = paddle.static.data(name='x2', shape=[-1, 2], dtype='float32')
-        x2.desc.set_need_check_feed(False)
-        label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
-        label.desc.set_need_check_feed(False)
-        w1_param_attrs = base.ParamAttr(
-            name="fc_weight1",
-            learning_rate=0.5,
-            initializer=paddle.nn.initializer.Constant(1.0),
-            trainable=True,
-        )
-        w2_param_attrs = base.ParamAttr(
-            name="fc_weight2",
-            learning_rate=0.5,
-            initializer=paddle.nn.initializer.Constant(1.0),
-            trainable=True,
-        )
-        y1 = paddle.static.nn.fc(
-            x=[x1], size=2, activation="softmax", weight_attr=w1_param_attrs
-        )
-        y2 = paddle.static.nn.fc(
-            x=[x2], size=2, activation="softmax", weight_attr=w2_param_attrs
-        )
-        loss1 = paddle.nn.functional.cross_entropy(
-            input=y1, label=label, reduction='none', use_softmax=False
-        )
-        loss1 = paddle.mean(x=loss1)
-        loss2 = paddle.nn.functional.cross_entropy(
-            input=y2, label=label, reduction='none', use_softmax=False
-        )
-        loss2 = paddle.mean(x=loss2)
-        return (
-            x1,
-            x2,
-            y1,
-            y2,
-            label,
-            loss1,
-            loss2,
-            w1_param_attrs,
-            w2_param_attrs,
-        )
+    def setUp(self):
+        self.net1 = net1
+        self.net2 = net2
 
     def test_not_prune(self):
         """
diff --git a/test/legacy_test/test_static_pylayer.py b/test/legacy_test/test_static_pylayer.py
index 3a1634e92bf58..8b193d6e087be 100644
--- a/test/legacy_test/test_static_pylayer.py
+++ b/test/legacy_test/test_static_pylayer.py
@@ -16,6 +16,7 @@
 import unittest
 
 import numpy as np
+from legacy_test.test_prune import TestExecutorRunAutoPrune, TestPruneBase
 
 import paddle
 from paddle import base
@@ -27,6 +28,9 @@
 
 
 class TestStaticPyLayerInputOutput(unittest.TestCase):
+    def setUp(self):
+        paddle.enable_static()
+
     def test_return_single_var(self):
         """
         pseudocode:
@@ -34,8 +38,6 @@ def test_return_single_var(self):
         y = 3 * x
         """
 
-        paddle.enable_static()
-
         def forward_fn(x):
             return 3 * x
 
@@ -65,8 +67,6 @@ def test_return_0d_tensor(self):
         y = 3 * x
         """
 
-        paddle.enable_static()
-
         def forward_fn(x):
             return 3 * x
 
@@ -96,8 +96,6 @@ def test_0d_tensor_backward(self):
         dx = -5 * dy
         '''
 
-        paddle.enable_static()
-
         def forward_fn(x):
             return 3 * x
 
@@ -132,8 +130,6 @@ def backward_fn(dy):
         self.assertEqual(x_grad.shape, ())
 
     def test_return_var_typle(self):
-        paddle.enable_static()
-
         def forward_fn(a, b):
             return 3 * a, -2 * b
 
@@ -168,8 +164,6 @@ def forward_fn(a, b):
         )
 
     def test_return_forward_none(self):
-        paddle.enable_static()
-
         input_shape = (1, 3)
 
         def forward_fn(x):
@@ -198,8 +192,6 @@ def test_wrong_structure_exception(self):
         wrong number of inputs and outputs returned by ``forward_fn`` and ``backward_fn``
         """
 
-        paddle.enable_static()
-
         def forward_fn(a, b):
             return 3 * a, -b, paddle.mean(b)
 
@@ -232,6 +224,9 @@ def backward_fn(daout, dbout):
 
 
 class TestControlFlowNestedStaticPyLayer(unittest.TestCase):
+    def setUp(self):
+        paddle.enable_static()
+
     def test_cond_inside_static_pylayer(self):
         """
         forward propagation:
@@ -256,8 +251,6 @@ def backward_fn(diout, daout):
                 return daout_scaled, daout * daout
         """
 
-        paddle.enable_static()
-
         def forward_fn(i, a):
             return i, paddle.static.nn.cond(
                 i < 5.0, lambda: paddle.add(a, a), lambda: paddle.subtract(a, a)
@@ -343,9 +336,10 @@ def backward_fn(diout, daout):
 
 
 class TestStaticPyLayerBackward(unittest.TestCase):
-    def test_identity_backward(self):
+    def setUp(self):
         paddle.enable_static()
 
+    def test_identity_backward(self):
         def forward_fn(x):
             return x
 
@@ -405,8 +399,6 @@ def test_static_pylayer_backward(self):
         dx = tanh(dy)
         '''
 
-        paddle.enable_static()
-
         def forward_fn(x):
             return 3 * x
 
@@ -455,5 +447,231 @@ def backward_fn(dy):
         )
 
 
+class TestStaticPyLayerPrune(TestPruneBase):
+    def setUp(self):
+        paddle.enable_static()
+
+    def net(self):
+        def forward_fn(x):
+            y = 3 * x
+            return y
+
+        def backward_fn(dy):
+            grad = paddle.exp(dy)
+            return grad
+
+        x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
+        x.desc.set_need_check_feed(False)
+        hidden = paddle.static.nn.fc(x=[x], size=4, activation="softmax")
+        y = paddle.static.nn.static_pylayer(forward_fn, [hidden], backward_fn)
+        loss = paddle.mean(y)
+        return x, hidden, y, loss
+
+    def net_with_weight(self):
+        def forward_fn(x):
+            y = 3 * x
+            return y
+
+        def backward_fn(dy):
+            grad = paddle.exp(dy)
+            return grad
+
+        x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
+        x.desc.set_need_check_feed(False)
+        label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
+        label.desc.set_need_check_feed(False)
+        w_param_attrs = base.ParamAttr(
+            name="fc_weight",
+            learning_rate=0.5,
+            initializer=paddle.nn.initializer.Constant(1.0),
+            trainable=True,
+        )
+
+        y = paddle.static.nn.static_pylayer(forward_fn, [x], backward_fn)
+        hidden = paddle.static.nn.fc(
+            x=[y], size=4, activation="softmax", weight_attr=w_param_attrs
+        )
+        loss1 = paddle.nn.functional.cross_entropy(
+            input=hidden, label=label, reduction='none', use_softmax=False
+        )
+        loss1 = paddle.mean(x=loss1)
+        loss2 = paddle.nn.functional.cross_entropy(
+            input=hidden, label=label, reduction='none', use_softmax=False
+        )
+        loss2 = paddle.mean(x=loss2)
+        loss1.persistable = True
+        loss2.persistable = True
+
+        return x, hidden, label, loss1, loss2, w_param_attrs
+
+    def test_prune_with_input(self):
+        ops_before_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "pylayer",
+            "reduce_mean",
+        ]
+
+        ops_after_pruned = ["pylayer", "reduce_mean"]
+
+        (x, hidden, y, loss), program = self.run_net(self.net)
+
+        self.check_prune_with_input(
+            program, [hidden.name], [loss], ops_before_pruned, ops_after_pruned
+        )
+
+    def test_prune(self):
+        ops_before_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "pylayer",
+            "reduce_mean",
+        ]
+
+        ops_after_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "pylayer",
+            "reduce_mean",
+        ]
+
+        (x, hidden, y, loss), program = self.run_net(self.net)
+
+        self.check_prune(program, [loss], ops_before_pruned, ops_after_pruned)
+
+    def test_prune_target_not_list(self):
+        ops_before_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "pylayer",
+            "reduce_mean",
+        ]
+
+        ops_after_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "pylayer",
+            "reduce_mean",
+        ]
+
+        (x, hidden, y, loss), program = self.run_net(self.net)
+        self.check_prune_target_not_list(
+            program, loss, ops_before_pruned, ops_after_pruned
+        )
+
+    def test_prune_target_none(self):
+        ops_before_pruned = [
+            "mul",
+            "elementwise_add",
+            "softmax",
+            "pylayer",
+            "reduce_mean",
+        ]
+
+        (x, hidden, y, loss), program = self.run_net(self.net)
+        self.check_prune_target_none(program, ops_before_pruned)
+
+
+def net_with_weight1():
+    def forward_fn(x):
+        y = 3 * x
+        return y
+
+    def backward_fn(dy):
+        grad = paddle.exp(dy)
+        return grad
+
+    x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
+    x.desc.set_need_check_feed(False)
+    label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
+    label.desc.set_need_check_feed(False)
+    w_param_attrs = base.ParamAttr(
+        name="fc_weight",
+        learning_rate=0.5,
+        initializer=paddle.nn.initializer.Constant(1.0),
+        trainable=True,
+    )
+
+    y = paddle.static.nn.static_pylayer(forward_fn, [x], backward_fn)
+    hidden = paddle.static.nn.fc(
+        x=[y], size=4, activation="softmax", weight_attr=w_param_attrs
+    )
+    loss1 = paddle.nn.functional.cross_entropy(
+        input=hidden, label=label, reduction='none', use_softmax=False
+    )
+    loss1 = paddle.mean(x=loss1)
+    loss2 = paddle.nn.functional.cross_entropy(
+        input=hidden, label=label, reduction='none', use_softmax=False
+    )
+    loss2 = paddle.mean(x=loss2)
+    loss1.persistable = True
+    loss2.persistable = True
+
+    return x, hidden, label, loss1, loss2, w_param_attrs
+
+
+def net_with_weight2():
+    def forward_fn(x):
+        y = 3 * x
+        return y
+
+    def backward_fn(dy):
+        grad = paddle.exp(dy)
+        return grad
+
+    x1 = paddle.static.data(name='x1', shape=[-1, 2], dtype='float32')
+    x1.desc.set_need_check_feed(False)
+    x2 = paddle.static.data(name='x2', shape=[-1, 2], dtype='float32')
+    x2.desc.set_need_check_feed(False)
+    label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
+    label.desc.set_need_check_feed(False)
+    w1_param_attrs = base.ParamAttr(
+        name="fc_weight1",
+        learning_rate=0.5,
+        initializer=paddle.nn.initializer.Constant(1.0),
+        trainable=True,
+    )
+    w2_param_attrs = base.ParamAttr(
+        name="fc_weight2",
+        learning_rate=0.5,
+        initializer=paddle.nn.initializer.Constant(1.0),
+        trainable=True,
+    )
+
+    y1 = paddle.static.nn.static_pylayer(forward_fn, [x1], backward_fn)
+    hidden1 = paddle.static.nn.fc(
+        x=[y1], size=4, activation="softmax", weight_attr=w1_param_attrs
+    )
+    y2 = paddle.static.nn.static_pylayer(forward_fn, [x2], backward_fn)
+    hidden2 = paddle.static.nn.fc(
+        x=[y2], size=4, activation="softmax", weight_attr=w2_param_attrs
+    )
+
+    loss1 = paddle.nn.functional.cross_entropy(
+        input=hidden1, label=label, reduction='none', use_softmax=False
+    )
+    loss1 = paddle.mean(x=loss1)
+    loss2 = paddle.nn.functional.cross_entropy(
+        input=hidden2, label=label, reduction='none', use_softmax=False
+    )
+    loss2 = paddle.mean(x=loss2)
+    loss1.persistable = True
+    loss2.persistable = True
+
+    return x1, x2, y1, y2, label, loss1, loss2, w1_param_attrs, w2_param_attrs
+
+
+class TestStaticPyLayerExecutorAutoPrune(TestExecutorRunAutoPrune):
+    def setUp(self):
+        paddle.enable_static()
+        self.net1 = net_with_weight1
+        self.net2 = net_with_weight2
+
+
 if __name__ == '__main__':
     unittest.main()

From 6bed2dce8f80729c9fc73d6573855d288e6c9dec Mon Sep 17 00:00:00 2001
From: iLeGend <824040212@qq.com>
Date: Fri, 22 Sep 2023 10:25:36 +0800
Subject: [PATCH 054/115] [NewComm] No.8 compatiable upgrade for
 fused_multi_transformer op (#57564)

---
 .../fused/fused_multi_transformer_op.cu.h     | 49 +++++++++++++++++--
 .../test_fused_multi_transformer_op.py        | 10 ++++
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
index f27644f1abd0d..a81a38ed3877f 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
@@ -24,6 +24,7 @@ limitations under the License. */
 
 #include <cub/cub.cuh>
 
+#include "paddle/fluid/distributed/collective/utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/fused/attention_layer_norm.h"
@@ -34,6 +35,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
 #include "paddle/phi/core/flags.h"
 #include "paddle/phi/kernels/funcs/fused_gemm_epilogue.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -42,6 +44,8 @@ limitations under the License. */
 #include "paddle/fluid/distributed/collective/process_group.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 
 PHI_DECLARE_bool(gemm_use_half_precision_compute_type);
@@ -78,10 +82,47 @@ static void AllReduce(phi::DenseTensor &tensor,  // NOLINT
     const void *sendbuff = tensor.data<T>();
     auto place = ctx.GetPlace();
     void *recvbuff = tensor.mutable_data<T>(place);
-    auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
-    auto stream = ctx.stream();
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-        sendbuff, recvbuff, count, dtype, ncclSum, comm->comm(), stream));
+    gpuStream_t stream = nullptr;
+    platform::NCCLComm *comm = nullptr;
+    phi::distributed::NCCLCommContext *comm_ctx = nullptr;
+
+    const auto &comm_context_manager =
+        phi::distributed::CommContextManager::GetInstance();
+
+    if (FLAGS_dynamic_static_unified_comm) {
+      // Use New Communication Library
+      PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "You choose to use new communication library by "
+                            "setting environment "
+                            "variable FLAGS_dynamic_static_unified_comm True. "
+                            "But ring_id(%d) is "
+                            "not found in comm_context_manager.",
+                            std::to_string(ring_id)));
+      comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
+          comm_context_manager.Get(std::to_string(ring_id)));
+      PADDLE_ENFORCE_NE(comm_ctx,
+                        nullptr,
+                        platform::errors::Unavailable(
+                            "NCCLCommContext is nullptr, collective op should "
+                            "has ring_id attr."));
+
+      stream = comm_ctx->GetStream();
+
+      VLOG(3) << "new comm_context_manager has ring_id" << ring_id;
+    } else {
+      comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
+
+      stream = ctx.stream();
+      VLOG(3) << "old NCCLCommContext has ring_id " << ring_id;
+    }
+    if (comm_ctx) {
+      comm_ctx->AllReduce(&tensor, tensor, ncclSum, stream);
+    } else {
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+          sendbuff, recvbuff, count, dtype, ncclSum, comm->comm(), stream));
+    }
   }
 #else
   PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/test/legacy_test/test_fused_multi_transformer_op.py b/test/legacy_test/test_fused_multi_transformer_op.py
index d7bab80a41b80..577957e8b0e41 100644
--- a/test/legacy_test/test_fused_multi_transformer_op.py
+++ b/test/legacy_test/test_fused_multi_transformer_op.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import random
 import unittest
 
@@ -38,6 +39,7 @@
 
 class TestFusedMultiTransformerOp(OpTest):
     def setUp(self):
+        self.with_new_comm()
         self.config()
         self.generate_input_data()
 
@@ -108,6 +110,9 @@ def setUp(self):
         self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")
         self.activation = getattr(F, self.act_method)
 
+    def with_new_comm(self):
+        os.environ["FLAGS_dynamic_static_unified_comm"] = "0"
+
     def config(self):
         # for debug
         self.debug = False
@@ -1125,6 +1130,11 @@ def test_fused_multi_transformer_op(self):
             )
 
 
+class TestFusedMultiTransformerOpWithNewComm(TestFusedMultiTransformerOp):
+    def with_new_comm(self):
+        os.environ["FLAGS_dynamic_static_unified_comm"] = "1"
+
+
 class TestFusedMultiTransformerOpRotaryFP16(TestFusedMultiTransformerOp):
     def config(self):
         super().config()

From 521f8e6220aa81c6319f6a1dddb6ebb191783964 Mon Sep 17 00:00:00 2001
From: Nyakku Shigure <sigure.qaq@gmail.com>
Date: Fri, 22 Sep 2023 10:34:13 +0800
Subject: [PATCH 055/115] [SOT][3.11] add `__repr__` for
 `PyInterpreterFrameProxy` (#57588)

---
 paddle/fluid/pybind/jit.cc | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/paddle/fluid/pybind/jit.cc b/paddle/fluid/pybind/jit.cc
index 688fe7c670370..8d4d62f738211 100644
--- a/paddle/fluid/pybind/jit.cc
+++ b/paddle/fluid/pybind/jit.cc
@@ -57,6 +57,12 @@ namespace pybind {
 typedef _PyInterpreterFrame FrameObject;
 #define CALL_STAT_INC(name) ((void)0)
 
+int Internal_PyInterpreterFrame_GetLine(_PyInterpreterFrame *frame);
+static int Internal_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame,
+                                         int opcode,
+                                         int oparg);
+int Internal_PyFrame_FastToLocalsWithError(_PyInterpreterFrame *frame);
+
 // clang-format off
 // Define a proxy PyObject to access _PyInterpreterFrame's properties.
 // It will be passed as an argument to the eval frame's callback.
@@ -86,6 +92,20 @@ DECLARE_PROXY_PROPERTY(f_locals)
 DECLARE_PROXY_PROPERTY(f_globals)
 DECLARE_PROXY_PROPERTY(f_builtins)
 
+// Refer to
+// https://github.com/python/cpython/blob/9414ddf91898892f3f6a672ae946931ee4b3ceb7/Objects/frameobject.c#L953-L961
+static PyObject *PyInterpreterFrameProxy_method_repr(
+    PyInterpreterFrameProxy *self) {
+  int lineno = Internal_PyInterpreterFrame_GetLine(self->frame);
+  PyCodeObject *code = self->frame->f_code;
+  return PyUnicode_FromFormat(
+      "<PyInterpreterFrameProxy at %p, file %R, line %d, code %S>",
+      self,
+      code->co_filename,
+      lineno,
+      code->co_name);
+}
+
 static PyGetSetDef PyInterpreterFrameProxy_properties[] = {
     REGISTER_PROXY_PROPERTY(f_code),
     REGISTER_PROXY_PROPERTY(f_locals),
@@ -100,6 +120,7 @@ static PyTypeObject PyInterpreterFrameProxyType = {
     .tp_name = "paddle.framework.core.PyInterpreterFrameProxy",
     .tp_doc = PyDoc_STR("A proxy object for _PyInterpreterFrame, "
                         "it's only define all properties we need."),
+    .tp_repr = reinterpret_cast<reprfunc>(PyInterpreterFrameProxy_method_repr),
     .tp_basicsize = sizeof(PyInterpreterFrameProxy),
     .tp_itemsize = 0,
     .tp_flags = Py_TPFLAGS_DEFAULT,
@@ -122,6 +143,11 @@ PyInterpreterFrameProxy *PyInterpreterFrameProxy_New(
 
 // We copy some cpython internal API from cpython project.
 // To avoid name conflict, we use "Internal_" prefix to mark them.
+int Internal_PyInterpreterFrame_GetLine(_PyInterpreterFrame *frame) {
+  int addr = _PyInterpreterFrame_LASTI(frame) * sizeof(_Py_CODEUNIT);
+  return PyCode_Addr2Line(frame->f_code, addr);
+}
+
 static int Internal_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame,
                                          int opcode,
                                          int oparg) {

From bcc3305b59a10855aa9f389f819d75ea395dd305 Mon Sep 17 00:00:00 2001
From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com>
Date: Fri, 22 Sep 2023 10:48:46 +0800
Subject: [PATCH 056/115] [Semi-Auto] Adapt layer_norm spmd rule to phi
 (#57374)

* adapt layer_norm spmd rule to phi

* modify api in unit test

* bug fix

* fix bug in cpp unit test
---
 .../spmd_rules/layer_norm_spmd_rule.cc        | 280 -----------------
 .../spmd_rules/layer_norm_spmd_rule.h         |  41 ---
 .../auto_parallel/spmd_rules/rules.h          |   4 -
 .../auto_parallel/inferspmd_utils.cc          |   3 +
 .../auto_parallel/inferspmd_utils.h           |   2 +
 paddle/phi/infermeta/spmd_rules/layer_norm.cc | 282 ++++++++++++++++++
 paddle/phi/infermeta/spmd_rules/layer_norm.h  |  39 +++
 paddle/phi/infermeta/spmd_rules/rules.h       |   7 +
 .../spmd_rules/test_layer_norm_rule.py        | 117 ++++++--
 test/cpp/auto_parallel/spmd_rule_test.cc      |  64 ++--
 10 files changed, 457 insertions(+), 382 deletions(-)
 delete mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.cc
 delete mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h
 create mode 100644 paddle/phi/infermeta/spmd_rules/layer_norm.cc
 create mode 100644 paddle/phi/infermeta/spmd_rules/layer_norm.h

diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.cc
deleted file mode 100644
index 81f25a8d6ed88..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.cc
+++ /dev/null
@@ -1,280 +0,0 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h"
-
-#include "paddle/phi/core/distributed/auto_parallel/utils.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-using phi::distributed::auto_parallel::str_join;
-std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-LayerNormSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
-                                const paddle::framework::AttributeMap& attrs) {
-  // step0: verify input args based on layer_norm logic
-  auto input_specs_size = input_specs.size();
-  PADDLE_ENFORCE_EQ(
-      input_specs_size,
-      3,
-      phi::errors::InvalidArgument(
-          "The size of InputSpec of layer_norm should be 3, but got [%d].",
-          input_specs_size));
-  auto x_shape = input_specs[0].shape();
-  auto scale_shape = input_specs[1].shape();
-  auto bias_shape = input_specs[2].shape();
-  int x_ndim = static_cast<int>(x_shape.size());
-  int scale_ndim = static_cast<int>(scale_shape.size());
-  int bias_ndim = static_cast<int>(bias_shape.size());
-
-  PADDLE_ENFORCE_EQ(
-      scale_ndim,
-      1,
-      phi::errors::InvalidArgument(
-          "The ndim of scale in layer_norm should be 1, but got [%d].",
-          scale_ndim));
-
-  PADDLE_ENFORCE_EQ(
-      bias_ndim,
-      1,
-      phi::errors::InvalidArgument(
-          "The ndim of bias in layer_norm should be 1, but got [%d].",
-          bias_ndim));
-
-  auto x_dims_mapping = input_specs[0].dist_attr().dims_mapping();
-  auto scale_dims_mapping = input_specs[1].dist_attr().dims_mapping();
-  auto bias_dims_mapping = input_specs[2].dist_attr().dims_mapping();
-
-  auto x_dist_attr_src = input_specs[0].dist_attr();
-
-  std::vector<TensorDistAttr> input_dist_attrs;
-  input_dist_attrs.reserve(input_specs.size());
-
-  int begin_norm_axis = ExtractAttr<int>("begin_norm_axis", attrs);
-
-  VLOG(4) << "LayerNormSPMDRule InferForward Inputs: "
-          << "x shape: [" << str_join(x_shape) << "], x_dims_mapping: ["
-          << str_join(x_dims_mapping) << "]; scale shape: ["
-          << str_join(scale_shape) << "], scale_dims_mapping: ["
-          << str_join(scale_dims_mapping) << "]; bias shape: ["
-          << str_join(bias_shape) << "], bias_dims_mapping: ["
-          << str_join(bias_dims_mapping) << "]; begin_norm_axis: ["
-          << begin_norm_axis << "]; ";
-
-  // step1: build Einsum Notation
-  // ijk,k,k->ijk,z,z (x,scale,bias->out,mean,variance, begin_norm_axis=2, z=ij)
-  // ijkl,y(kl),y(kl)->ijkl,z(ij),z(ij) (x,scale,bias->out,mean,variance,
-  // begin_norm_axis=2, z=ij, y=kl)
-  std::string x_axes = "";
-  for (auto i = 0; i < x_ndim; ++i) {
-    x_axes += static_cast<char>(static_cast<int>('k') - begin_norm_axis + i);
-  }
-
-  std::string scale_axes;
-  std::string bias_axes;
-  if (x_ndim - begin_norm_axis == 1) {
-    scale_axes = "k";
-    bias_axes = "k";
-  } else {
-    // z = x_axes.substr(begin_norm_axis, x_ndim - begin_norm_axis)
-    scale_axes = "y";
-    bias_axes = "y";
-  }
-
-  std::string mean_axes;
-  std::string variance_axes;
-  if (begin_norm_axis > 1) {
-    mean_axes = "z";
-    variance_axes = "z";
-  } else {
-    mean_axes = "j";
-    variance_axes = "j";
-  }
-
-  std::string out_axes = x_axes;
-
-  VLOG(4) << "LayerNormSPMDRule build Einsum notation (x,scale,bias->out): ["
-          << x_axes << "," << scale_axes << "," << bias_axes << " --> "
-          << out_axes << "," << mean_axes << "," << variance_axes
-          << "](begin_norm_axis:" << begin_norm_axis
-          << ",y=" << x_axes.substr(begin_norm_axis, x_ndim - begin_norm_axis)
-          << ",z=" << x_axes.substr(0, begin_norm_axis) << ").";
-
-  // step2: Sharding Propogation
-  TensorDistAttr output_dist_attr_dst =
-      CopyTensorDistAttrForOutput(x_dist_attr_src);
-  TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src);
-  TensorDistAttr mean_dist_attr_dst =
-      CopyTensorDistAttrForOutput(x_dist_attr_src);
-  TensorDistAttr varience_dist_attr_dst =
-      CopyTensorDistAttrForOutput(x_dist_attr_src);
-  std::vector<int64_t> out_dims_mapping;
-  out_dims_mapping.reserve(out_axes.size());
-
-  int64_t mean_shard_dim = -1;
-  // As the mean and variance in outputs are `flattened` from
-  // x[0:begin_norm_axis], only the first axis can be sharded,
-  // the axes 1 to begin_norm_axis-1 are set to be replicated.
-  std::vector<int64_t> x_dims_mapping_dst(x_ndim, -1);
-  x_dims_mapping_dst[0] = x_dims_mapping[0];
-  for (int i = 0; i < x_ndim; ++i) {
-    if (i < begin_norm_axis) {
-      out_dims_mapping.push_back(x_dims_mapping_dst[i]);
-      // if ijk,k,k->ijk,z,z (x,scale,bias->out,mean,variance,
-      // begin_norm_axis=2, z=ij), and the dims_mapping of input is (0,1,-1),
-      // the mean and varience is sharded by dim 0 and 1,
-      // which is not supported currently.
-      mean_shard_dim = ShardingMergeForAxis(
-          mean_axes, mean_shard_dim, x_dims_mapping_dst[i]);
-    } else {
-      out_dims_mapping.push_back(-1);
-    }
-  }
-  output_dist_attr_dst.set_dims_mapping(out_dims_mapping);
-  mean_dist_attr_dst.set_dims_mapping({mean_shard_dim});
-  varience_dist_attr_dst.set_dims_mapping({mean_shard_dim});
-
-  // step2.3: Merge and get Inputs' New Dims Mapping.
-  x_dist_attr_dst.set_dims_mapping(x_dims_mapping_dst);
-  input_dist_attrs.emplace_back(x_dist_attr_dst);
-  // TODO(zhiqiu): support shardding on scale and bias
-  // Now, apply replicating.
-  input_dist_attrs.emplace_back(ReplicatedOnMesh(input_specs[1].dist_attr()));
-  input_dist_attrs.emplace_back(ReplicatedOnMesh(input_specs[2].dist_attr()));
-
-  // Step2.4.  handle input and out tensor partial
-  // LayerNorm not support
-
-  VLOG(4) << "LayerNormSPMDRule InferForward: "
-          << "X shape: [" << str_join(x_shape) << "], src_dims_mapping: ["
-          << str_join(x_dims_mapping) << "], dst_dims_mapping: ["
-          << str_join(x_dist_attr_dst.dims_mapping()) << "]; scale shape: ["
-          << str_join(scale_shape) << "], src_dims_mapping: ["
-          << str_join(scale_dims_mapping) << "], dst_dims_mapping: ["
-          << str_join(input_dist_attrs[1].dims_mapping()) << "]; bias shape: ["
-          << str_join(bias_shape) << "], src_dims_mapping: ["
-          << str_join(bias_dims_mapping) << "], dst_dims_mapping: ["
-          << str_join(input_dist_attrs[2].dims_mapping())
-          << "]; out dims_mapping: [" << str_join(out_dims_mapping)
-          << "]; mean dims_mapping: [" << mean_shard_dim
-          << "]; varience dims_mapping: [" << mean_shard_dim
-          << "], partial_on_dims: []";
-
-  return {input_dist_attrs,
-          {output_dist_attr_dst, mean_dist_attr_dst, varience_dist_attr_dst}};
-}
-
-std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-LayerNormSPMDRule::InferBackward(
-    const std::vector<DistTensorSpec>& input_specs,
-    const std::vector<DistTensorSpec>& output_specs,
-    const paddle::framework::AttributeMap& attrs) {
-  // step0: verify input args based on layer_norm logic
-  int64_t ninputs = input_specs.size();
-  int64_t noutputs = output_specs.size();
-  PADDLE_ENFORCE_EQ(
-      ninputs,
-      3,
-      phi::errors::InvalidArgument(
-          "The size of InputSpec of layer_norm should be 3, but got [%d].",
-          ninputs));
-  PADDLE_ENFORCE_EQ(
-      noutputs,
-      3,
-      phi::errors::InvalidArgument(
-          "The size of InputSpec of layer_norm should be 3, but got [%d].",
-          noutputs));
-  VerifySpecs(output_specs, "layer_norm_backward");
-
-  // step1: build Einsum Notation
-  // ijk,k,k->ijk,z,z (x,scale,bias->out,mean,variance, begin_norm_axis=2, z=ij)
-  // ijkl,y(kl),y(kl)->ijkl,z(ij),z(ij) (x,scale,bias->out,mean,variance,
-  // begin_norm_axis=2, z=ij, y=kl)
-  int begin_norm_axis = ExtractAttr<int>("begin_norm_axis", attrs);
-  std::string alphabet = "ijklmnopqrstuvwxyz";
-  int x_ndim = input_specs[0].shape().size();
-  std::string x_axes = alphabet.substr(0, x_ndim);
-  // the axes after norm_axis should be replicated,
-  // so set their notation to '1'.
-  for (int i = 1; i < x_ndim; i++) {
-    x_axes[i] = '1';
-  }
-  std::string out_axes = x_axes;
-  std::string mean_axes(1, '1'), varience_axes(1, '1');
-  if (begin_norm_axis > 0) {
-    mean_axes[0] = out_axes[0];
-    varience_axes[0] = out_axes[0];
-  }
-  std::vector<std::string> output_axes_vec;
-  output_axes_vec.emplace_back(out_axes);
-  output_axes_vec.emplace_back(mean_axes);
-  output_axes_vec.emplace_back(varience_axes);
-
-  // step2: Sharding Propogation
-  // For the axes after norm_axis in both input and output tensors,
-  // set their dims mappings to -1. For the other axes, set input
-  // tensor's dims mapping the same as output tensor's dims mapping.
-  // step2.1 merge dims mappings of output, mean, variance.
-  std::vector<std::pair<std::string, std::vector<int64_t>>> axes_sharding_info;
-  axes_sharding_info = GetAxesDimsMappingPair(output_axes_vec, output_specs);
-  std::unordered_map<std::string, int64_t> axis_to_dim_map =
-      ShardingMergeForTensors(axes_sharding_info);
-
-  // step2.2 infer input dims mapping
-  std::vector<int64_t> input_dims_mapping =
-      GetDimsMappingForAxes(x_axes, axis_to_dim_map);
-  std::vector<TensorDistAttr> input_dist_attrs;
-  for (int64_t i = 0; i < ninputs; i++) {
-    input_dist_attrs.emplace_back(input_specs[i].dist_attr());
-  }
-  input_dist_attrs[0].set_dims_mapping(input_dims_mapping);
-  // set bias and scale to be replicated
-  input_dist_attrs[1].set_dims_mapping({-1});
-  input_dist_attrs[2].set_dims_mapping({-1});
-
-  // step2.3 update output dims mappings with merged one
-  std::vector<TensorDistAttr> output_dist_attrs;
-  for (int64_t i = 0; i < noutputs; i++) {
-    output_dist_attrs.emplace_back(output_specs[i].dist_attr());
-    output_dist_attrs[i].set_dims_mapping(
-        GetDimsMappingForAxes(output_axes_vec[i], axis_to_dim_map));
-  }
-
-  VLOG(4) << "LayerNormSPMDRule InferBackward:";
-  VLOG(4) << "begin_norm_axis: " << begin_norm_axis;
-  for (int64_t i = 0; i < noutputs; i++) {
-    VLOG(4) << "Output" << std::to_string(i) << " shape: ["
-            << str_join(output_specs[i].shape()) << "] "
-            << "Einsum Notation: " << output_axes_vec[i]
-            << " src_dims_mapping: ["
-            << str_join(output_specs[i].dims_mapping()) << "] "
-            << "dst_dims_mapping: ["
-            << str_join(output_dist_attrs[i].dims_mapping()) << "]";
-  }
-
-  for (int64_t i = 0; i < ninputs; i++) {
-    VLOG(4) << "Input" << std::to_string(i) << " shape: ["
-            << str_join(input_specs[i].shape()) << "] "
-            << "Einsum Notation: " << std::string(i == 0 ? x_axes : "1")
-            << " dims_mapping: ["
-            << str_join(input_dist_attrs[i].dims_mapping()) << "]";
-  }
-  VLOG(4) << std::endl;
-
-  return {input_dist_attrs, output_dist_attrs};
-}
-
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h
deleted file mode 100644
index da40f3da5653f..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iterator>
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-
-class LayerNormSPMDRule : public SPMDRuleBase {
- public:
-  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-  InferForward(const std::vector<DistTensorSpec>& input_specs,
-               const paddle::framework::AttributeMap& attrs) override;
-
-  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-  InferBackward(const std::vector<DistTensorSpec>& input_specs,
-                const std::vector<DistTensorSpec>& output_specs,
-                const paddle::framework::AttributeMap& attrs) override;
-};
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
index 54ae4325b8a15..71f939ffd3785 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
@@ -17,7 +17,6 @@
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h"
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h"
@@ -29,9 +28,6 @@ namespace paddle {
 namespace distributed {
 namespace auto_parallel {
 
-// layer_norm rule
-REGISTER_SPMD_RULE(layer_norm, LayerNormSPMDRule);
-
 // replicated rule
 REGISTER_SPMD_RULE(replicated, ReplicatedSPMDRule);
 
diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
index 6e0c0f696fef4..24030b5d0ffa8 100644
--- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
+++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
@@ -53,6 +53,9 @@ AttrType InferSpmdContext::AttrAt(size_t idx) const {
   }
 }
 
+template float InferSpmdContext::AttrAt(size_t idx) const;
+template int InferSpmdContext::AttrAt(size_t idx) const;
+
 template <>
 bool InferSpmdContext::AttrAt(size_t idx) const {
   try {
diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
index 23b147a4bb3d7..499c2340983a7 100644
--- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
@@ -153,6 +153,8 @@ struct InferSpmdFnImpl<Return (*)(Args...), infer_spmd_fn> {
 
   // TODO(chenweihang): support other attr type later as needed
   PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(bool);
+  PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(int);
+  PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(float);
   PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(std::vector<int>);
   PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(
       std::vector<int64_t>);
diff --git a/paddle/phi/infermeta/spmd_rules/layer_norm.cc b/paddle/phi/infermeta/spmd_rules/layer_norm.cc
new file mode 100644
index 0000000000000..6befef19cfef1
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/layer_norm.cc
@@ -0,0 +1,282 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/infermeta/spmd_rules/layer_norm.h"
+
+#include "glog/logging.h"
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
+#include "paddle/phi/core/distributed/auto_parallel/utils.h"
+#include "paddle/phi/infermeta/spmd_rules/utils.h"
+
+namespace phi {
+namespace distributed {
+
+using phi::distributed::auto_parallel::str_join;
+
+SpmdInfo LayerNormInferSpmd(const DistMetaTensor& x,
+                            const DistMetaTensor& scale,
+                            const DistMetaTensor& bias,
+                            float epsilon,
+                            int begin_norm_axis) {
+  // Step0: verify input args based on layer_norm logic
+  auto x_shape = phi::vectorize(x.dims());
+  auto scale_shape = phi::vectorize(scale.dims());
+  auto bias_shape = phi::vectorize(bias.dims());
+  int x_ndim = x_shape.size();
+  int scale_ndim = scale_shape.size();
+  int bias_ndim = bias_shape.size();
+  TensorDistAttr x_dist_attr_src = x.dist_attr();
+  std::vector<int64_t> x_dims_mapping = x_dist_attr_src.dims_mapping();
+  std::vector<int64_t> scale_dims_mapping = scale.dist_attr().dims_mapping();
+  std::vector<int64_t> bias_dims_mapping = bias.dist_attr().dims_mapping();
+
+  PADDLE_ENFORCE_EQ(
+      scale_ndim,
+      1,
+      phi::errors::InvalidArgument(
+          "The ndim of scale in layer_norm should be 1, but got [%d].",
+          scale_ndim));
+
+  PADDLE_ENFORCE_EQ(
+      bias_ndim,
+      1,
+      phi::errors::InvalidArgument(
+          "The ndim of bias in layer_norm should be 1, but got [%d].",
+          bias_ndim));
+
+  // Step1: Build Einsum Notation
+  // ijk,k,k->ijk,z,z (x,scale,bias->out,mean,variance, begin_norm_axis=2, z=ij)
+  // ijkl,y(kl),y(kl)->ijkl,z(ij),z(ij) (x,scale,bias->out,mean,variance,
+  // begin_norm_axis=2, z=ij, y=kl)
+  std::string alphabet = "ijklmnopqrstuvwxyz";
+  // get input notation
+  // Because the mean and variance is 'flattened' from
+  // x[0:begin_norm_axis], only the first axis of x can
+  // be sharded
+  std::string x_axes(x_ndim, '1');
+  x_axes[0] = alphabet[0];
+  std::string scale_axes(1, x_axes[x_ndim - 1]);
+  std::string bias_axes(1, x_axes[x_ndim - 1]);
+
+  // get output notation
+  std::string out_axes = x_axes;
+  std::string mean_axes(1, '1'), variance_axes(1, '1');
+  if (begin_norm_axis > 0) {
+    mean_axes[0] = out_axes[0];
+    variance_axes[0] = out_axes[0];
+  }
+
+  // Step2: Sharding Propogation
+  // Step2.1: merge input sharding
+  // As the mean and variance in outputs are `flattened` from
+  // x[0:begin_norm_axis], only the first axis can be sharded,
+  // the axes 1 to begin_norm_axis-1 are set to be replicated.
+  std::fill(x_dims_mapping.begin() + 1, x_dims_mapping.end(), -1);
+  std::unordered_map<std::string, int64_t> axis_to_dim_map =
+      ShardingMergeForTensors({{x_axes, x_dims_mapping}});
+
+  // Step2.2: infer output dims mapping
+  TensorDistAttr out_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src);
+  TensorDistAttr mean_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src);
+  TensorDistAttr varience_dist_attr =
+      CopyTensorDistAttrForOutput(x_dist_attr_src);
+  out_dist_attr.set_dims_mapping(
+      GetDimsMappingForAxes(out_axes, axis_to_dim_map));
+  mean_dist_attr.set_dims_mapping(
+      GetDimsMappingForAxes(mean_axes, axis_to_dim_map));
+  varience_dist_attr.set_dims_mapping(
+      GetDimsMappingForAxes(variance_axes, axis_to_dim_map));
+
+  // Step2.3: update input dims mapping
+  TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src);
+  TensorDistAttr scale_dist_attr_dst =
+      CopyTensorDistAttrForOutput(scale.dist_attr());
+  TensorDistAttr bias_dist_attr_dst =
+      CopyTensorDistAttrForOutput(bias.dist_attr());
+  x_dist_attr_dst.set_dims_mapping(x_dims_mapping);
+  // TODO(zhiqiu): support shardding on scale and bias
+  // Now, apply replicating.
+  scale_dist_attr_dst.set_dims_mapping({-1});
+  bias_dist_attr_dst.set_dims_mapping({-1});
+
+  // Step2.4.  handle input and out tensor partial
+  // LayerNorm not support
+  VLOG(4) << "LayerNormInferSpmd:";
+  VLOG(4) << "begin_norm_axis: " << begin_norm_axis;
+  VLOG(4) << "Einsum Notation: " << x_axes << "," << scale_axes << ","
+          << bias_axes << "-->" << out_axes << "," << mean_axes << ","
+          << variance_axes;
+  VLOG(4) << "X"
+          << " shape: [" << str_join(x_shape) << "] "
+          << "src_dims_mapping: [" << str_join(x_dist_attr_src.dims_mapping())
+          << "] "
+          << "dst_dims_mapping: [" << str_join(x_dims_mapping) << "]";
+  VLOG(4) << "Scale"
+          << " shape: [" << str_join(scale_shape) << "] "
+          << "src_dims_mapping: [" << str_join(scale_dims_mapping) << "] "
+          << "dst_dims_mapping: ["
+          << str_join(scale_dist_attr_dst.dims_mapping()) << "]";
+  VLOG(4) << "Bias"
+          << " shape: [" << str_join(bias_shape) << "] "
+          << "src_dims_mapping: [" << str_join(bias_dims_mapping) << "] "
+          << "dst_dims_mapping: ["
+          << str_join(bias_dist_attr_dst.dims_mapping()) << "]";
+  VLOG(4) << "Out dims mapping: [" << str_join(out_dist_attr.dims_mapping())
+          << "]";
+  VLOG(4) << "Mean dims mapping: [" << str_join(mean_dist_attr.dims_mapping())
+          << "]";
+  VLOG(4) << "Variance dims mapping: ["
+          << str_join(varience_dist_attr.dims_mapping()) << "]";
+  VLOG(4) << std::endl;
+
+  return {{x_dist_attr_dst, scale_dist_attr_dst, bias_dist_attr_dst},
+          {out_dist_attr, mean_dist_attr, varience_dist_attr}};
+}
+
+SpmdInfo LayerNormInferSpmdReverse(const DistMetaTensor& x,
+                                   const DistMetaTensor& scale,
+                                   const DistMetaTensor& bias,
+                                   const DistMetaTensor& out,
+                                   const DistMetaTensor& mean,
+                                   const DistMetaTensor& variance,
+                                   float epsilon,
+                                   int begin_norm_axis) {
+  // Step0: Verify input args based on layer_norm logic
+  auto x_shape = phi::vectorize(x.dims());
+  auto out_shape = phi::vectorize(out.dims());
+  auto mean_shape = phi::vectorize(mean.dims());
+  auto variance_shape = phi::vectorize(variance.dims());
+  int x_ndim = x_shape.size();
+  int out_ndim = out_shape.size();
+  int mean_ndim = mean_shape.size();
+  int variance_ndim = variance_shape.size();
+  auto out_dist_attr_src = out.dist_attr();
+  auto mean_dist_attr_src = mean.dist_attr();
+  auto variance_dist_attr_src = variance.dist_attr();
+  std::vector<int64_t> out_dims_mapping = out_dist_attr_src.dims_mapping();
+  std::vector<int64_t> mean_dims_mapping = mean_dist_attr_src.dims_mapping();
+  std::vector<int64_t> variance_dims_mapping =
+      variance_dist_attr_src.dims_mapping();
+  PADDLE_ENFORCE_EQ(
+      out_ndim,
+      out_dims_mapping.size(),
+      phi::errors::InvalidArgument("The Tensor Out's rank [%d] and Out's "
+                                   "dims_mapping size [%d] are not matched.",
+                                   out_ndim,
+                                   out_dims_mapping.size()));
+  PADDLE_ENFORCE_EQ(
+      mean_ndim,
+      mean_dims_mapping.size(),
+      phi::errors::InvalidArgument("The Tensor Mean's rank [%d] and Mean's "
+                                   "dims_mapping size [%d] are not matched.",
+                                   mean_ndim,
+                                   mean_dims_mapping.size()));
+  PADDLE_ENFORCE_EQ(variance_ndim,
+                    variance_dims_mapping.size(),
+                    phi::errors::InvalidArgument(
+                        "The Tensor Variance's rank [%d] and Variance's "
+                        "dims_mapping size [%d] are not matched.",
+                        variance_ndim,
+                        variance_dims_mapping.size()));
+  // Step1: Build Einsum Notation
+  // ijk,k,k->ijk,z,z (x,scale,bias->out,mean,variance, begin_norm_axis=2, z=ij)
+  // ijkl,y(kl),y(kl)->ijkl,z(ij),z(ij) (x,scale,bias->out,mean,variance,
+  // begin_norm_axis=2, z=ij, y=kl)
+  std::string alphabet = "ijklmnopqrstuvwxyz";
+  // the axes after norm_axis should be replicated,
+  // so set their notation to '1'.
+  std::string x_axes(x_ndim, '1');
+  x_axes[0] = alphabet[0];
+  std::string scale_axes(1, x_axes[x_ndim - 1]);
+  std::string bias_axes(1, x_axes[x_ndim - 1]);
+
+  std::string out_axes = x_axes;
+  std::string mean_axes(1, '1'), variance_axes(1, '1');
+  if (begin_norm_axis > 0) {
+    mean_axes[0] = out_axes[0];
+    variance_axes[0] = out_axes[0];
+  }
+
+  // Step2: Sharding Propogation
+  // For the axes after norm_axis in both input and output tensors,
+  // set their dims mappings to -1. For the other axes, set input
+  // tensor's dims mapping the same as output tensor's dims mapping.
+  // step2.1 merge dims mappings of output, mean, variance.
+  std::vector<std::pair<std::string, std::vector<int64_t>>> axes_sharding_info;
+  axes_sharding_info.emplace_back(std::make_pair(out_axes, out_dims_mapping));
+  axes_sharding_info.emplace_back(std::make_pair(mean_axes, mean_dims_mapping));
+  axes_sharding_info.emplace_back(
+      std::make_pair(variance_axes, variance_dims_mapping));
+  std::unordered_map<std::string, int64_t> axis_to_dim_map =
+      ShardingMergeForTensors(axes_sharding_info);
+
+  // Step2.2 infer input dims mapping
+  std::vector<int64_t> x_dims_mapping =
+      GetDimsMappingForAxes(x_axes, axis_to_dim_map);
+  std::vector<TensorDistAttr> input_dist_attrs;
+  input_dist_attrs.emplace_back(x.dist_attr());
+  input_dist_attrs.emplace_back(scale.dist_attr());
+  input_dist_attrs.emplace_back(bias.dist_attr());
+
+  input_dist_attrs[0].set_dims_mapping(x_dims_mapping);
+  // set bias and scale to be replicated
+  input_dist_attrs[1].set_dims_mapping({-1});
+  input_dist_attrs[2].set_dims_mapping({-1});
+
+  // Step2.3 Update output dims mappings with merged one
+  std::vector<TensorDistAttr> output_dist_attrs;
+  output_dist_attrs.emplace_back(out_dist_attr_src);
+  output_dist_attrs.emplace_back(mean_dist_attr_src);
+  output_dist_attrs.emplace_back(variance_dist_attr_src);
+  output_dist_attrs[0].set_dims_mapping(
+      GetDimsMappingForAxes(out_axes, axis_to_dim_map));
+  output_dist_attrs[1].set_dims_mapping(
+      GetDimsMappingForAxes(mean_axes, axis_to_dim_map));
+  output_dist_attrs[2].set_dims_mapping(
+      GetDimsMappingForAxes(variance_axes, axis_to_dim_map));
+
+  VLOG(4) << "LayerNormInferSpmdReverse:";
+  VLOG(4) << "begin_norm_axis: " << begin_norm_axis;
+  VLOG(4) << "Einsum Notation: " << x_axes << "," << scale_axes << ","
+          << bias_axes << "-->" << out_axes << "," << mean_axes << ","
+          << variance_axes;
+  VLOG(4) << "Out"
+          << " shape: [" << str_join(out_shape) << "] "
+          << " src_dims_mapping: [" << str_join(out_dims_mapping) << "] "
+          << "dst_dims_mapping: ["
+          << str_join(output_dist_attrs[0].dims_mapping()) << "]";
+  VLOG(4) << "Mean"
+          << " shape: [" << str_join(mean_shape) << "] "
+          << " src_dims_mapping: [" << str_join(mean_dims_mapping) << "] "
+          << "dst_dims_mapping: ["
+          << str_join(output_dist_attrs[1].dims_mapping()) << "]";
+  VLOG(4) << "Variance"
+          << " shape: [" << str_join(variance_shape) << "] "
+          << " src_dims_mapping: [" << str_join(variance_dims_mapping) << "] "
+          << "dst_dims_mapping: ["
+          << str_join(output_dist_attrs[2].dims_mapping()) << "]";
+
+  for (int i = 0, n = input_dist_attrs.size(); i < n; i++) {
+    VLOG(4) << "Input" << std::to_string(i) << " dims_mapping: ["
+            << str_join(input_dist_attrs[i].dims_mapping()) << "]";
+  }
+  VLOG(4) << std::endl;
+
+  return {input_dist_attrs, output_dist_attrs};
+}
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/layer_norm.h b/paddle/phi/infermeta/spmd_rules/layer_norm.h
new file mode 100644
index 0000000000000..c33b58a51bc20
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/layer_norm.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+#include "paddle/phi/core/distributed/type_defs.h"
+
+namespace phi {
+namespace distributed {
+
+SpmdInfo LayerNormInferSpmd(const DistMetaTensor& x,
+                            const DistMetaTensor& scale,
+                            const DistMetaTensor& bias,
+                            float epsilon,
+                            int begin_norm_axis);
+
+SpmdInfo LayerNormInferSpmdReverse(const DistMetaTensor& x,
+                                   const DistMetaTensor& scale,
+                                   const DistMetaTensor& bias,
+                                   const DistMetaTensor& out,
+                                   const DistMetaTensor& mean,
+                                   const DistMetaTensor& variance,
+                                   float epsilon,
+                                   int begin_norm_axis);
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h
index 71a726e3d8edc..cb01b8996a8c9 100644
--- a/paddle/phi/infermeta/spmd_rules/rules.h
+++ b/paddle/phi/infermeta/spmd_rules/rules.h
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/phi/infermeta/spmd_rules/default_data_parallel.h"
 #include "paddle/phi/infermeta/spmd_rules/elementwise.h"
+#include "paddle/phi/infermeta/spmd_rules/layer_norm.h"
 #include "paddle/phi/infermeta/spmd_rules/matmul.h"
 #include "paddle/phi/infermeta/spmd_rules/reduction.h"
 #include "paddle/phi/infermeta/spmd_rules/replicated.h"
@@ -457,5 +458,11 @@ PD_REGISTER_SPMD_RULE(
     PD_INFER_SPMD(phi::distributed::ReductionInferSpmd),
     PD_INFER_SPMD(phi::distributed::ReductionInferSpmdReverse));
 
+// layer_norm
+PD_REGISTER_SPMD_RULE(
+    layer_norm,
+    PD_INFER_SPMD(phi::distributed::LayerNormInferSpmd),
+    PD_INFER_SPMD(phi::distributed::LayerNormInferSpmdReverse));
+
 }  // namespace distributed
 }  // namespace phi
diff --git a/test/auto_parallel/spmd_rules/test_layer_norm_rule.py b/test/auto_parallel/spmd_rules/test_layer_norm_rule.py
index bac7d12f13b06..9af336fd8d214 100644
--- a/test/auto_parallel/spmd_rules/test_layer_norm_rule.py
+++ b/test/auto_parallel/spmd_rules/test_layer_norm_rule.py
@@ -13,13 +13,14 @@
 # limitations under the License.
 
 import unittest
+from collections import OrderedDict
 
-from paddle.distributed.auto_parallel.static.completion import get_spmd_rule
 from paddle.distributed.auto_parallel.static.dist_attribute import (
     DistTensorSpec,
     TensorDistAttr,
 )
 from paddle.distributed.fleet import auto
+from paddle.framework import core
 
 
 class TestLayerNormSPMDRule(unittest.TestCase):
@@ -28,7 +29,7 @@ class TestLayerNormSPMDRule(unittest.TestCase):
     """
 
     def setUp(self):
-        self.rule = get_spmd_rule("layer_norm")
+        self.rule = core.get_phi_spmd_rule("layer_norm")
 
         x_shape = [64, 32, 1024]
         scale_shape = [1024]
@@ -51,9 +52,7 @@ def setUp(self):
         self.mean_spec = DistTensorSpec(self.x_spec)
         self.var_spec = DistTensorSpec(self.x_spec)
 
-        self.attrs = {
-            'begin_norm_axis': 2,
-        }
+        self.attrs = OrderedDict([('epsilon', 1e-3), ('begin_norm_axis', 2)])
 
     def test_infer_forward(self):
         # ijk[1, -1, -1], k[-1], k[-1] -->
@@ -65,7 +64,11 @@ def test_infer_forward(self):
         self.scale_spec.set_dims_mapping([-1])
 
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_spec, self.scale_spec, self.bias_spec], self.attrs
+            self.x_spec,
+            self.scale_spec,
+            self.bias_spec,
+            self.attrs['epsilon'],
+            self.attrs['begin_norm_axis'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -90,7 +93,11 @@ def test_infer_forward(self):
         self.bias_spec.set_dims_mapping([0])
 
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_spec, self.scale_spec, self.bias_spec], self.attrs
+            self.x_spec,
+            self.scale_spec,
+            self.bias_spec,
+            self.attrs['epsilon'],
+            self.attrs['begin_norm_axis'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -119,7 +126,11 @@ def test_infer_forward(self):
         self.bias_spec.set_dims_mapping([1])
 
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_spec, self.scale_spec, self.bias_spec], self.attrs
+            self.x_spec,
+            self.scale_spec,
+            self.bias_spec,
+            self.attrs['epsilon'],
+            self.attrs['begin_norm_axis'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -156,9 +167,14 @@ def test_infer_backward(self):
         self.var_spec.set_dims_mapping([1])
 
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_spec, self.scale_spec, self.bias_spec],
-            [self.out_spec, self.mean_spec, self.var_spec],
-            self.attrs,
+            self.x_spec,
+            self.scale_spec,
+            self.bias_spec,
+            self.out_spec,
+            self.mean_spec,
+            self.var_spec,
+            self.attrs['epsilon'],
+            self.attrs['begin_norm_axis'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -196,9 +212,14 @@ def test_infer_backward(self):
         self.var_spec.set_dims_mapping([0])
 
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_spec, self.scale_spec, self.bias_spec],
-            [self.out_spec, self.mean_spec, self.var_spec],
-            self.attrs,
+            self.x_spec,
+            self.scale_spec,
+            self.bias_spec,
+            self.out_spec,
+            self.mean_spec,
+            self.var_spec,
+            self.attrs['epsilon'],
+            self.attrs['begin_norm_axis'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -236,9 +257,14 @@ def test_infer_backward(self):
         self.var_spec.set_dims_mapping([-1])
 
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_spec, self.scale_spec, self.bias_spec],
-            [self.out_spec, self.mean_spec, self.var_spec],
-            self.attrs,
+            self.x_spec,
+            self.scale_spec,
+            self.bias_spec,
+            self.out_spec,
+            self.mean_spec,
+            self.var_spec,
+            self.attrs['epsilon'],
+            self.attrs['begin_norm_axis'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -276,9 +302,14 @@ def test_infer_backward(self):
         self.var_spec.set_dims_mapping([-1])
 
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_spec, self.scale_spec, self.bias_spec],
-            [self.out_spec, self.mean_spec, self.var_spec],
-            self.attrs,
+            self.x_spec,
+            self.scale_spec,
+            self.bias_spec,
+            self.out_spec,
+            self.mean_spec,
+            self.var_spec,
+            self.attrs['epsilon'],
+            self.attrs['begin_norm_axis'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -315,11 +346,16 @@ def test_infer_backward(self):
         self.mean_spec.set_dims_mapping([0])
         self.var_spec.set_dims_mapping([-1])
 
-        with self.assertRaises(BaseException):
+        with self.assertRaises(NotImplementedError):
             result_dist_attrs = self.rule.infer_backward(
-                [self.x_spec, self.scale_spec, self.bias_spec],
-                [self.out_spec, self.mean_spec, self.var_spec],
-                self.attrs,
+                self.x_spec,
+                self.scale_spec,
+                self.bias_spec,
+                self.out_spec,
+                self.mean_spec,
+                self.var_spec,
+                self.attrs['epsilon'],
+                self.attrs['begin_norm_axis'],
             )
 
         # [-1, 1, -1], [0], [-1] (outputs) -->
@@ -344,9 +380,14 @@ def test_infer_backward(self):
         self.var_spec.set_dims_mapping([-1])
 
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_spec, self.scale_spec, self.bias_spec],
-            [self.out_spec, self.mean_spec, self.var_spec],
-            self.attrs,
+            self.x_spec,
+            self.scale_spec,
+            self.bias_spec,
+            self.out_spec,
+            self.mean_spec,
+            self.var_spec,
+            self.attrs['epsilon'],
+            self.attrs['begin_norm_axis'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -384,9 +425,14 @@ def test_infer_backward(self):
         self.var_spec.set_dims_mapping([-1])
 
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_spec, self.scale_spec, self.bias_spec],
-            [self.out_spec, self.mean_spec, self.var_spec],
-            self.attrs,
+            self.x_spec,
+            self.scale_spec,
+            self.bias_spec,
+            self.out_spec,
+            self.mean_spec,
+            self.var_spec,
+            self.attrs['epsilon'],
+            self.attrs['begin_norm_axis'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -424,9 +470,14 @@ def test_infer_backward(self):
         self.var_spec.set_dims_mapping([-1])
 
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_spec, self.scale_spec, self.bias_spec],
-            [self.out_spec, self.mean_spec, self.var_spec],
-            self.attrs,
+            self.x_spec,
+            self.scale_spec,
+            self.bias_spec,
+            self.out_spec,
+            self.mean_spec,
+            self.var_spec,
+            self.attrs['epsilon'],
+            self.attrs['begin_norm_axis'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc
index e8f74513fc96f..42476d7bb323f 100644
--- a/test/cpp/auto_parallel/spmd_rule_test.cc
+++ b/test/cpp/auto_parallel/spmd_rule_test.cc
@@ -308,23 +308,28 @@ TEST(LayerNormSPMDRule, Ctor) {
   bias_dist_attr.set_dims_mapping(std::vector<int64_t>({-1}));
   bias_dist_attr.set_dynamic_dims(std::vector<bool>({false}));
 
-  DistTensorSpec x_dist_tensor_spec = DistTensorSpec(x_shape, x_dist_attr);
-  DistTensorSpec scale_dist_tensor_spec =
-      DistTensorSpec(scale_shape, scale_dist_attr);
-  DistTensorSpec bias_dist_tensor_spec =
-      DistTensorSpec(bias_shape, bias_dist_attr);
-
   paddle::framework::AttributeMap attrs;
-  attrs["begin_norm_axis"] = 2;
+  float epsilon = 1e-5;
+  int begin_norm_axis = 2;
 
-  SPMDRuleBase* layer_norm_rule = SPMDRuleMap::Instance().Get("layer_norm");
+  auto layer_norm_rule =
+      phi::distributed::SpmdRuleFactory::Instance().GetSpmdRule("layer_norm");
 
   // ijk[1, -1, -1], k[-1], k[-1] --> ijk[1, -1, -1], z[1], z[1], z=ij,
   // begin_norm_axis=2
+  begin_norm_axis = 2;
+  x_dist_attr.set_dims_mapping({1, -1, -1});
+  scale_dist_attr.set_dims_mapping({-1});
+  bias_dist_attr.set_dims_mapping({-1});
+  phi::distributed::DistMetaTensor x(phi::make_ddim(x_shape), x_dist_attr);
+  phi::distributed::DistMetaTensor scale(phi::make_ddim(scale_shape),
+                                         scale_dist_attr);
+  phi::distributed::DistMetaTensor bias(phi::make_ddim(bias_shape),
+                                        bias_dist_attr);
+  phi::distributed::InferSpmdContext ctx({x, scale, bias},
+                                         {epsilon, begin_norm_axis});
   std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-      infered_dist_attrs = layer_norm_rule->InferForward(
-          {x_dist_tensor_spec, scale_dist_tensor_spec, bias_dist_tensor_spec},
-          attrs);
+      infered_dist_attrs = layer_norm_rule.InferForward(ctx);
 
   size_t input_size = 3;
   size_t output_size = 3;
@@ -347,12 +352,18 @@ TEST(LayerNormSPMDRule, Ctor) {
 
   // ijk[1, 0, -1],k[0],k[0] --> ijk[1, -1, -1],z[1],z[1],
   // begin_norm_axis=2
-  x_dist_tensor_spec.set_dims_mapping({1, 0, -1});
-  scale_dist_tensor_spec.set_dims_mapping({0});
-  bias_dist_tensor_spec.set_dims_mapping({0});
-  infered_dist_attrs = layer_norm_rule->InferForward(
-      {x_dist_tensor_spec, scale_dist_tensor_spec, bias_dist_tensor_spec},
-      attrs);
+  begin_norm_axis = 2;
+  x_dist_attr.set_dims_mapping({1, 0, -1});
+  scale_dist_attr.set_dims_mapping({0});
+  bias_dist_attr.set_dims_mapping({0});
+  x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr);
+  scale = phi::distributed::DistMetaTensor(phi::make_ddim(scale_shape),
+                                           scale_dist_attr);
+  bias = phi::distributed::DistMetaTensor(phi::make_ddim(bias_shape),
+                                          bias_dist_attr);
+  ctx = phi::distributed::InferSpmdContext({x, scale, bias},
+                                           {epsilon, begin_norm_axis});
+  infered_dist_attrs = layer_norm_rule.InferForward(ctx);
   EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
             std::vector<int64_t>({1, -1, -1}));
   EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),
@@ -369,13 +380,18 @@ TEST(LayerNormSPMDRule, Ctor) {
 
   // ijk[0, -1, -1],y[-1],y[1] --> ijk[0, 1, -1], i[0], i[0], y=jk,
   // begin_norm_axis=1
-  x_dist_tensor_spec.set_dims_mapping({0, -1, -1});
-  scale_dist_tensor_spec.set_dims_mapping({-1});
-  bias_dist_tensor_spec.set_dims_mapping({1});
-  attrs["begin_norm_axis"] = 1;
-  infered_dist_attrs = layer_norm_rule->InferForward(
-      {x_dist_tensor_spec, scale_dist_tensor_spec, bias_dist_tensor_spec},
-      attrs);
+  begin_norm_axis = 1;
+  x_dist_attr.set_dims_mapping({0, -1, -1});
+  scale_dist_attr.set_dims_mapping({-1});
+  bias_dist_attr.set_dims_mapping({1});
+  x = phi::distributed::DistMetaTensor(phi::make_ddim(x_shape), x_dist_attr);
+  scale = phi::distributed::DistMetaTensor(phi::make_ddim(scale_shape),
+                                           scale_dist_attr);
+  bias = phi::distributed::DistMetaTensor(phi::make_ddim(bias_shape),
+                                          bias_dist_attr);
+  ctx = phi::distributed::InferSpmdContext({x, scale, bias},
+                                           {epsilon, begin_norm_axis});
+  infered_dist_attrs = layer_norm_rule.InferForward(ctx);
   EXPECT_EQ(infered_dist_attrs.first[0].dims_mapping(),
             std::vector<int64_t>({0, -1, -1}));
   EXPECT_EQ(infered_dist_attrs.first[1].dims_mapping(),

From 87738f1210d07f47c77bf5da68b803477a3dde0a Mon Sep 17 00:00:00 2001
From: GGBond8488 <33050871+GGBond8488@users.noreply.github.com>
Date: Fri, 22 Sep 2023 11:14:48 +0800
Subject: [PATCH 057/115] add inplace api transpose_, t_, normal_,cauchy_,
 geometric_ (#57093)

* add inplace

* fix transpose inpalce error

* fix error

* fix

* fix

* add gaussian inpalce kernel

* change cauchy_ gepmetric impl

* fix typro

* add test

* remove gaussian test

* fix sample code error

* fix sample code

* fix sample code error
---
 paddle/phi/api/yaml/backward.yaml             |  11 ++
 paddle/phi/api/yaml/generator/api_base.py     |   1 +
 paddle/phi/api/yaml/legacy_ops.yaml           |   1 +
 paddle/phi/api/yaml/ops.yaml                  |  13 ++
 .../cpu/gaussian_inplace_grad_kernel.cc       |  41 +++++
 paddle/phi/kernels/cpu/gaussian_kernel.cc     |  31 ++++
 .../kernels/gaussian_inplace_grad_kernel.h    |  29 ++++
 paddle/phi/kernels/gaussian_kernel.h          |   8 +
 .../gpu/gaussian_inplace_grad_kernel.cu       |  44 +++++
 paddle/phi/kernels/gpu/gaussian_kernel.cu     |  32 ++++
 paddle/phi/kernels/stride/transpose_kernel.cc |   2 -
 python/paddle/__init__.py                     |  10 ++
 python/paddle/tensor/__init__.py              |  13 ++
 python/paddle/tensor/creation.py              |  69 ++++++++
 python/paddle/tensor/linalg.py                |  32 ++++
 python/paddle/tensor/random.py                |  73 ++++++++
 test/legacy_test/test_cauchy_inplace.py       | 139 ++++++++++++++++
 test/legacy_test/test_geometric_inplace.py    | 143 ++++++++++++++++
 test/legacy_test/test_inplace.py              |  48 ++++++
 test/legacy_test/test_normal_inplace.py       | 156 ++++++++++++++++++
 20 files changed, 894 insertions(+), 2 deletions(-)
 create mode 100644 paddle/phi/kernels/cpu/gaussian_inplace_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/gaussian_inplace_grad_kernel.h
 create mode 100644 paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu
 create mode 100644 test/legacy_test/test_cauchy_inplace.py
 create mode 100644 test/legacy_test/test_geometric_inplace.py
 create mode 100644 test/legacy_test/test_normal_inplace.py

diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml
index 2f48bb80478e6..66f5056320950 100644
--- a/paddle/phi/api/yaml/backward.yaml
+++ b/paddle/phi/api/yaml/backward.yaml
@@ -939,6 +939,17 @@
   composite : gather_nd_grad(x, index, out_grad, x_grad)
   no_need_buffer : x
 
+- backward_op : gaussian_inplace_grad
+  forward : gaussian_inplace(Tensor x, float mean=0, float std=1.0, int seed=0) -> Tensor(out)
+  args : (Tensor out_grad, float mean=0, float std=1.0, int seed=0)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [out_grad]
+  kernel :
+    func : gaussian_inplace_grad
+  inplace : (out_grad -> x_grad)
+
 - backward_op : gelu_grad
   forward : gelu(Tensor x,  bool approximate) -> Tensor(out)
   args : (Tensor x, Tensor out_grad,  bool approximate)
diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py
index cbf4ed1dab837..5e7cff9213171 100644
--- a/paddle/phi/api/yaml/generator/api_base.py
+++ b/paddle/phi/api/yaml/generator/api_base.py
@@ -1223,6 +1223,7 @@ def gen_kernel_code(self, kernel_name, code_indent, inplace_flag=False):
             "unsqueeze",
             "reshape",
             "flatten",
+            "transpose",
         ]:
             i = 0
             for kernel_out in outputs_args:
diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index a647e02b35ef2..14daf99fd7f13 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -1065,6 +1065,7 @@
     func : TransposeInferMeta
   kernel :
     func : transpose
+  inplace : (x -> out)
   backward : transpose_grad
 
 - op : tril
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index c93f94c2b3320..fdada46699d26 100644
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -1052,6 +1052,19 @@
     func : gather_tree
     data_type : ids
 
+- op : gaussian_inplace
+  args: (Tensor x, float mean=0, float std=1.0, int seed=0)
+  output: Tensor(out)
+  infer_meta:
+    func: UnchangedInferMeta
+    param: [x]
+  kernel:
+    func: gaussian_inplace
+    data_type: x
+    backend : x
+  inplace: (x -> out)
+  backward: gaussian_inplace_grad
+
 - op : gelu
   args : (Tensor x,  bool approximate = false)
   output : Tensor(out)
diff --git a/paddle/phi/kernels/cpu/gaussian_inplace_grad_kernel.cc b/paddle/phi/kernels/cpu/gaussian_inplace_grad_kernel.cc
new file mode 100644
index 0000000000000..5913e5b1a4e56
--- /dev/null
+++ b/paddle/phi/kernels/cpu/gaussian_inplace_grad_kernel.cc
@@ -0,0 +1,41 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/gaussian_inplace_grad_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GaussianInplaceGradKernel(const Context& ctx,
+                               const DenseTensor& out_grad UNUSED,
+                               float mean UNUSED,
+                               float std UNUSED,
+                               int seed UNUSED,
+                               DenseTensor* x_grad) {
+  if (x_grad) {
+    auto* data = ctx.template Alloc<T>(x_grad);
+    std::fill(data, data + x_grad->numel(), T(0));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gaussian_inplace_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GaussianInplaceGradKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/gaussian_kernel.cc b/paddle/phi/kernels/cpu/gaussian_kernel.cc
index 2eb783c695b65..00ed6aaf35740 100644
--- a/paddle/phi/kernels/cpu/gaussian_kernel.cc
+++ b/paddle/phi/kernels/cpu/gaussian_kernel.cc
@@ -48,7 +48,38 @@ void GaussianKernel(const Context& dev_ctx,
   }
 }
 
+template <typename T, typename Context>
+void GaussianInplaceKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           float mean,
+                           float std,
+                           int seed,
+                           DenseTensor* out) {
+  T* data = dev_ctx.template Alloc<T>(out);
+  std::normal_distribution<T> dist(mean, std);
+
+  int64_t size = out->numel();
+  std::shared_ptr<std::mt19937_64> engine;
+  if (seed) {
+    engine = std::make_shared<std::mt19937_64>();
+    engine->seed(seed);
+  } else {
+    engine = dev_ctx.GetGenerator()->GetCPUEngine();
+  }
+
+  for (int64_t i = 0; i < size; ++i) {
+    data[i] = dist(*engine);
+  }
+}
+
 }  // namespace phi
 
 PD_REGISTER_KERNEL(
     gaussian, CPU, ALL_LAYOUT, phi::GaussianKernel, float, double) {}
+
+PD_REGISTER_KERNEL(gaussian_inplace,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::GaussianInplaceKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gaussian_inplace_grad_kernel.h b/paddle/phi/kernels/gaussian_inplace_grad_kernel.h
new file mode 100644
index 0000000000000..447b7199f695e
--- /dev/null
+++ b/paddle/phi/kernels/gaussian_inplace_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GaussianInplaceGradKernel(const Context& ctx,
+                               const DenseTensor& out_grad,
+                               float mean,
+                               float std,
+                               int seed,
+                               DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gaussian_kernel.h b/paddle/phi/kernels/gaussian_kernel.h
index a04c8802cf385..5c24d9eb6eb14 100644
--- a/paddle/phi/kernels/gaussian_kernel.h
+++ b/paddle/phi/kernels/gaussian_kernel.h
@@ -29,4 +29,12 @@ void GaussianKernel(const Context& ctx,
                     DataType dtype,
                     DenseTensor* out);
 
+template <typename T, typename Context>
+void GaussianInplaceKernel(const Context& ctx,
+                           const DenseTensor& x,
+                           float mean,
+                           float std,
+                           int seed,
+                           DenseTensor* out);
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu b/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu
new file mode 100644
index 0000000000000..d2bb9c31fa67d
--- /dev/null
+++ b/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu
@@ -0,0 +1,44 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/gaussian_inplace_grad_kernel.h"
+
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GaussianInplaceGradKernel(const Context& ctx,
+                               const DenseTensor& out_grad,
+                               float mean,
+                               float std,
+                               int seed,
+                               DenseTensor* x_grad) {
+  auto dims = vectorize(x_grad->dims());
+  float value = static_cast<float>(0.0f);
+  phi::FullKernel<T>(ctx, dims, value, phi::DataType::UNDEFINED, x_grad);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(gaussian_inplace_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GaussianInplaceGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/gaussian_kernel.cu b/paddle/phi/kernels/gpu/gaussian_kernel.cu
index d0f839bd677d4..6e5c7ee63ce53 100644
--- a/paddle/phi/kernels/gpu/gaussian_kernel.cu
+++ b/paddle/phi/kernels/gpu/gaussian_kernel.cu
@@ -76,6 +76,29 @@ void GaussianKernel(const Context& dev_ctx,
   }
 }
 
+template <typename T, typename Context>
+void GaussianInpalceKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           float mean,
+                           float std,
+                           int seed,
+                           DenseTensor* out) {
+  dev_ctx.template Alloc<T>(out);
+  if (seed == 0) {
+    // use global Generator seed
+    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+    funcs::normal_distribution<MT> dist;
+    funcs::normal_transform<MT> trans(static_cast<MT>(mean),
+                                      static_cast<MT>(std));
+    funcs::distribution_and_transform<T>(dev_ctx, out, dist, trans);
+  } else {
+    // use OP seed
+    auto func =
+        GaussianGenerator<T>(static_cast<T>(mean), static_cast<T>(std), seed);
+    IndexKernel<T, GaussianGenerator<T>>(dev_ctx, out, func);
+  }
+}
+
 }  // namespace phi
 
 PD_REGISTER_KERNEL(gaussian,
@@ -86,3 +109,12 @@ PD_REGISTER_KERNEL(gaussian,
                    phi::dtype::bfloat16,
                    float,
                    double) {}
+
+PD_REGISTER_KERNEL(gaussian_inpalce,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GaussianInpalceKernel,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/stride/transpose_kernel.cc b/paddle/phi/kernels/stride/transpose_kernel.cc
index 748beb5194d4a..1fedb515ef020 100644
--- a/paddle/phi/kernels/stride/transpose_kernel.cc
+++ b/paddle/phi/kernels/stride/transpose_kernel.cc
@@ -33,11 +33,9 @@ void TransposeStridedKernel(const Context& ctx,
 
   auto meta = out->meta();
   auto in_stride = x.strides();
-  auto in_dims = x.dims();
   meta.strides = in_stride;
   for (int i = 0; i < static_cast<int>(formated_axis.size()); i++) {
     meta.strides[i] = in_stride[formated_axis[i]];
-    meta.dims[i] = in_dims[formated_axis[i]];
   }
   meta.offset = x.offset();
 
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 2fe1eecf21ff8..0c168b44c4a64 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -122,12 +122,16 @@
 from .tensor.creation import tril_indices  # noqa: F401
 from .tensor.creation import triu_indices  # noqa: F401
 from .tensor.creation import polar  # noqa: F401
+from .tensor.creation import geometric_  # noqa: F401
+from .tensor.creation import cauchy_  # noqa: F401
 from .tensor.linalg import matmul  # noqa: F401
 from .tensor.linalg import dot  # noqa: F401
 from .tensor.linalg import norm  # noqa: F401
 from .tensor.linalg import transpose  # noqa: F401
+from .tensor.linalg import transpose_  # noqa: F401
 from .tensor.linalg import dist  # noqa: F401
 from .tensor.linalg import t  # noqa: F401
+from .tensor.linalg import t_  # noqa: F401
 from .tensor.linalg import cdist  # noqa: F401
 from .tensor.linalg import cross  # noqa: F401
 from .tensor.linalg import cholesky  # noqa: F401
@@ -381,6 +385,7 @@
 from .tensor.random import multinomial  # noqa: F401
 from .tensor.random import standard_normal  # noqa: F401
 from .tensor.random import normal  # noqa: F401
+from .tensor.random import normal_  # noqa: F401
 from .tensor.random import uniform  # noqa: F401
 from .tensor.random import randn  # noqa: F401
 from .tensor.random import rand  # noqa: F401
@@ -505,6 +510,7 @@
     'allclose',
     'isclose',
     't',
+    't_',
     'add',
     'subtract',
     'diag',
@@ -556,6 +562,7 @@
     'any',
     'slice',
     'normal',
+    'normal_',
     'logsumexp',
     'full',
     'unsqueeze',
@@ -736,6 +743,9 @@
     'tanh',
     'tanh_',
     'transpose',
+    'transpose_',
+    'cauchy_',
+    'geometric_',
     'randn',
     'strided_slice',
     'unique',
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 841925f8b7ff8..b728392b0452d 100644
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -43,6 +43,8 @@
 from .creation import empty_like  # noqa: F401
 from .creation import complex  # noqa: F401
 from .creation import polar  # noqa: F401
+from .creation import cauchy_  # noqa: F401
+from .creation import geometric_  # noqa: F401
 from .linalg import matmul  # noqa: F401
 from .linalg import dot  # noqa: F401
 from .linalg import cov  # noqa: F401
@@ -51,9 +53,11 @@
 from .linalg import pca_lowrank  # noqa: F401
 from .linalg import cond  # noqa: F401
 from .linalg import transpose  # noqa: F401
+from .linalg import transpose_  # noqa: F401
 from .linalg import lstsq  # noqa: F401
 from .linalg import dist  # noqa: F401
 from .linalg import t  # noqa: F401
+from .linalg import t_  # noqa: F401
 from .linalg import cross  # noqa: F401
 from .linalg import cholesky  # noqa: F401
 from .linalg import bmm  # noqa: F401
@@ -327,6 +331,7 @@
 from .random import multinomial  # noqa: F401
 from .random import standard_normal  # noqa: F401
 from .random import normal  # noqa: F401
+from .random import normal_  # noqa: F401
 from .random import uniform  # noqa: F401
 from .random import uniform_  # noqa: F401
 from .random import randn  # noqa: F401
@@ -381,9 +386,12 @@
     'norm',
     'cond',
     'transpose',
+    'cauchy_',
+    'geometric_',
     'lstsq',
     'dist',
     't',
+    't_',
     'cross',
     'cholesky',
     'bmm',
@@ -558,6 +566,10 @@
     'stack',
     'strided_slice',
     'transpose',
+    'transpose_',
+    'cauchy_',
+    'geometric_',
+    'tan_',
     'unique',
     'unique_consecutive',
     'unsqueeze',
@@ -673,6 +685,7 @@
     'i1e',
     'polygamma',
     'polygamma_',
+    'normal_',
 ]
 
 # this list used in math_op_patch.py for magic_method bind
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index f764fbb45996d..3f543ea29d003 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -39,6 +39,7 @@
     _get_paddle_place,
     convert_np_dtype_to_dtype_,
     core,
+    dygraph_only,
     in_dynamic_mode,
     in_dynamic_or_pir_mode,
     in_pir_mode,
@@ -2655,3 +2656,71 @@ def polar(abs, angle, name=None):
     )
 
     return paddle.complex(abs * paddle.cos(angle), abs * paddle.sin(angle))
+
+
+@dygraph_only
+def cauchy_(x, loc=0, scale=1, name=None):
+    """Fills the tensor with numbers drawn from the Cauchy distribution.
+
+    Args:
+        x (Tenosr): the tensor will be filled, The data type is float32 or float64.
+        loc (scalar, optional):  Location of the peak of the distribution. The data type is float32 or float64.
+        scale (scalar, optional): The half-width at half-maximum (HWHM). The data type is float32 or float64. Must be positive values.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
+
+    Returns:
+        Tensor: input tensor with numbers drawn from the Cauchy distribution.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> x = paddle.randn([3, 4])
+            >>> x.cauchy_(1, 2)
+            >>> # doctest: +SKIP('random check')
+            >>> print(x)
+            Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[ 3.80087137,  2.25415039,  2.77960515,  7.64125967],
+             [ 0.76541221,  2.74023032,  1.99383152, -0.12685823],
+             [ 1.45228469,  1.76275957, -4.30458832, 34.74880219]])
+
+    """
+    x.normal_()
+    loc = paddle.to_tensor(loc).astype(x.dtype)
+    half = paddle.to_tensor(0.5).astype(x.dtype)
+    x.subtract_(half).scale_(np.pi).tan_().scale_(scale).add_(loc)
+    return x
+
+
+@dygraph_only
+def geometric_(x, probs, name=None):
+    """Fills the tensor with numbers drawn from the Geometric distribution.
+
+    Args:
+        x (Tenosr): the tensor will be filled, The data type is float32 or float64.
+        probs (Real|Tensor): Probability parameter.
+            The value of probs must be positive. When the parameter is a tensor, probs is probability of success for each trial.
+        name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None.
+
+    Returns:
+        Tensor: input tensor with numbers drawn from the Geometric distribution.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> x = paddle.randn([3, 4])
+            >>> x.geometric_(0.3)
+            >>> # doctest: +SKIP('random check')
+            >>> print(x)
+            Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[2.42739224, 4.78268528, 1.23302543, 3.76555204],
+             [1.38877118, 0.16075331, 0.16401523, 2.47349310],
+             [1.72872102, 2.76533413, 0.33410925, 1.63351011]])
+
+    """
+    tiny = np.finfo(dtype=convert_dtype(x.dtype)).tiny
+    probs = paddle.to_tensor(probs).astype(x.dtype)
+    x.uniform_(min=float(tiny), max=float(1))
+    x.log_().divide_(paddle.log1p(-(probs)))
+    return x
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 286dcd261d8fe..acc59d6385e57 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -17,6 +17,7 @@
 import paddle
 from paddle import _C_ops
 from paddle.common_ops_import import VarDesc
+from paddle.utils.inplace_utils import inplace_apis_in_dygraph_only
 
 from ..base.data_feeder import check_dtype, check_type, check_variable_and_dtype
 from ..common_ops_import import Variable
@@ -130,6 +131,16 @@ def transpose(x, perm, name=None):
         return out
 
 
+@inplace_apis_in_dygraph_only
+def transpose_(x, perm, name=None):
+    r"""
+    Inplace version of ``transpose`` API, the output Tensor will be inplaced with input ``x``.
+    Please refer to :ref:`api_paddle_transpose`.
+    """
+    if in_dynamic_mode():
+        return _C_ops.transpose_(x, perm)
+
+
 def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     """
     Applies matrix multiplication to two tensors. `matmul` follows
@@ -1394,6 +1405,27 @@ def t(input, name=None):
         return out
 
 
+@inplace_apis_in_dygraph_only
+def t_(input, name=None):
+    r"""
+    Inplace version of ``t`` API, the output Tensor will be inplaced with input ``input``.
+    Please refer to :ref:`api_paddle_t`.
+    """
+    if len(input.shape) > 2:
+        raise ValueError(
+            "Input(input) only support N-D (N<=2) tensor, but received "
+            "length of Input(input) is %s. Perhaps you can use paddle."
+            "tensor.transpose() instead." % len(input.shape)
+        )
+    if in_dynamic_mode():
+        if len(input.shape) <= 1:
+            return input
+        # 2-D tensor
+        perm = [1, 0]
+        out = _C_ops.transpose_(input, perm)
+        return out
+
+
 def cross(x, y, axis=9, name=None):
     """
     Computes the cross product between two tensors along an axis.
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index f32978ca50706..46ee4ff6920b9 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -394,6 +394,40 @@ def gaussian(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None):
         return out
 
 
+@dygraph_only
+def gaussian_(x, mean=0.0, std=1.0, seed=0, name=None):
+    """
+    This is the inplace version of OP ``gaussian``, which returns a Tensor filled
+    with random values sampled from a gaussian distribution. The output Tensor will
+    be inplaced with input ``x``. Please refer to :ref:`api_tensor_gaussian`.
+
+    Args:
+        x(Tensor): The input tensor to be filled with random values.
+        mean (float|int, optional): Mean of the output tensor, default is 0.0.
+        std (float|int, optional): Standard deviation of the output tensor, default
+            is 1.0.
+        seed (int, optional): Random seed of generator.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+    Returns:
+        Tensor: The input tensor x filled with random values sampled from a gaussian
+        distribution.
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> x = paddle.randn([3, 4])
+            >>> paddle.tensor.random.gaussian_(x)
+            >>> print(x)
+            Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+                [[ 0.86384124,  0.67328387,  0.21874231, -0.12615913],
+                [ 0.69844258,  0.42084831, -0.42476156, -0.00072985],
+                [ 1.72819555,  1.87785017,  0.48915744,  0.09235018]])
+    """
+    return _C_ops.gaussian_inplace_(x, float(mean), float(std), int(seed))
+
+
 def standard_normal(shape, dtype=None, name=None):
     """
     Returns a Tensor filled with random values sampled from a standard
@@ -627,6 +661,45 @@ def normal(mean=0.0, std=1.0, shape=None, name=None):
     return out
 
 
+@dygraph_only
+def normal_(x, mean=0.0, std=1.0, name=None):
+    """
+    This is the inplace version of api ``normal``, which returns a Tensor filled
+    with random values sampled from a normal distribution. The output Tensor will
+    be inplaced with input ``x``. Please refer to :ref:`api_tensor_noraml`.
+
+    Args:
+        x(Tensor): The input tensor to be filled with random values.
+        mean (float|Tensor, optional): The mean of the output Tensor's normal distribution.
+            If ``mean`` is float, all elements of the output Tensor shared the same mean.
+            If ``mean`` is a Tensor(data type supports float32, float64), it has per-element means.
+            Default is 0.0
+        std (float|Tensor, optional): The  standard deviation of the output Tensor's normal distribution.
+            If ``std`` is float, all elements of the output Tensor shared the same standard deviation.
+            If ``std`` is a Tensor(data type supports float32, float64), it has per-element standard deviations.
+            Defaule is 1.0
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
+    Returns:
+        A Tensor filled with random values sampled from a normal distribution with ``mean`` and ``std`` .
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> x = paddle.randn([3, 4])
+            >>> x.normal_()
+            >>> # doctest: +SKIP('random check')
+            >>> print(x)
+            Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True,
+            [[ 0.06132207,  1.11349595,  0.41906244, -0.24858207],
+             [-1.85169315, -1.50370061,  1.73954511,  0.13331604],
+             [ 1.66359663, -0.55764782, -0.59911072, -0.57773495]])
+
+    """
+    return gaussian_(x, mean=mean, std=std)
+
+
 def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None):
     """
     Returns a Tensor filled with random values sampled from a uniform
diff --git a/test/legacy_test/test_cauchy_inplace.py b/test/legacy_test/test_cauchy_inplace.py
new file mode 100644
index 0000000000000..7c2b05bc64729
--- /dev/null
+++ b/test/legacy_test/test_cauchy_inplace.py
@@ -0,0 +1,139 @@
+#   Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+
+
+class TestCauchyInplaceDtype(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_cauchytype(self):
+        def test_fp32():
+            tensor_fp32 = paddle.ones(self.shape, dtype=paddle.float32)
+            tensor_fp32.cauchy_()
+            self.assertEqual(tensor_fp32.dtype, paddle.float32)
+
+        def test_fp64():
+            tensor_fp64 = paddle.ones(self.shape, paddle.float64)
+            tensor_fp64.cauchy_()
+            self.assertEqual(tensor_fp64.dtype, paddle.float64)
+
+        places = ['cpu']
+        if base.core.is_compiled_with_cuda():
+            places.append('gpu')
+        for place in places:
+            paddle.set_device(place)
+            test_fp32()
+            test_fp64()
+
+
+class TestCauchyIsInplace(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_cauchy_inplace_op_is_inplace(self):
+        tensor_a = paddle.ones(self.shape)
+        tensor_b = tensor_a.cauchy_()
+        self.assertTrue(tensor_a is tensor_b)
+
+
+class TestCauchyInplaceSeedIsZero(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_cauchy_inplace_op_not_equal(self):
+        tensor = paddle.ones(self.shape)
+        tensor.cauchy_()
+        tensor_data_first = tensor.numpy()
+        tensor.cauchy_()
+        tensor_data_second = tensor.numpy()
+        self.assertFalse((tensor_data_first == tensor_data_second).all())
+
+
+class TestCauchyInplaceOpShape(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_cauchy_inplace_op_shape(self):
+        tensor = paddle.ones(self.shape)
+        tensor.cauchy_()
+        tensor_shape_np = np.array(tensor.shape)
+        origin_shape = np.array(self.shape)
+        self.assertTrue((tensor_shape_np == origin_shape).all())
+
+
+class TestCauchyInplaceDistribution(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+        self.loc = -3
+        self.scale = 5
+
+    def test_cauchy_inplace_distribution(self):
+        tensor = paddle.ones(self.shape)
+        tensor.cauchy_(loc=self.loc, scale=self.scale)
+        median = tensor.median()
+        np.testing.assert_allclose(median, self.loc, atol=1e-1)
+
+
+class TestCauchyInplaceEmptyTensor(unittest.TestCase):
+    def test_cauchy_inplace_op_empty_tensor(self):
+        places = ['cpu']
+        if base.core.is_compiled_with_cuda():
+            places.append('gpu')
+        test_shapes = [(200, 1), (1, 200)]
+        for place in places:
+            paddle.set_device(place)
+            for test_shape in test_shapes:
+                tensor = paddle.empty(shape=test_shape)
+                tensor.cauchy_()
+                tensor_shape_np = np.array(tensor.shape)
+                origin_shape = np.array(test_shape)
+                self.assertTrue((tensor_shape_np == origin_shape).all())
+
+
+class TestCauchyInplaceGrad(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def run_(self):
+        def test_grad():
+            tensor_a = paddle.ones(self.shape)
+            tensor_a.stop_gradient = False
+            tensor_b = tensor_a * 0.5
+            tensor_b.retain_grads()
+            tensor_b.cauchy_()
+            loss = tensor_b.sum()
+            loss.backward()
+            cauchy_grad = tensor_b.grad.numpy()
+            self.assertTrue((cauchy_grad == 0).all())
+
+        places = ['cpu']
+        if base.core.is_compiled_with_cuda():
+            places.append('gpu')
+        for place in places:
+            paddle.set_device(place)
+            test_grad()
+
+    def test_cauchy_inplace_grad(self):
+        self.run_()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/legacy_test/test_geometric_inplace.py b/test/legacy_test/test_geometric_inplace.py
new file mode 100644
index 0000000000000..20f39621f2490
--- /dev/null
+++ b/test/legacy_test/test_geometric_inplace.py
@@ -0,0 +1,143 @@
+#   Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import scipy.stats
+
+import paddle
+from paddle import base
+
+
+class TestGeometricInplaceDtype(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_geometrictype(self):
+        def test_fp32():
+            tensor_fp32 = paddle.ones(self.shape, dtype=paddle.float32)
+            tensor_fp32.geometric_(probs=0.3)
+            self.assertEqual(tensor_fp32.dtype, paddle.float32)
+
+        def test_fp64():
+            tensor_fp64 = paddle.ones(self.shape, paddle.float64)
+            tensor_fp64.geometric_(probs=0.3)
+            self.assertEqual(tensor_fp64.dtype, paddle.float64)
+
+        places = ['cpu']
+        if base.core.is_compiled_with_cuda():
+            places.append('gpu')
+        for place in places:
+            paddle.set_device(place)
+            test_fp32()
+            test_fp64()
+
+
+class TestGeometricIsInplace(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_geometric_inplace_op_is_inplace(self):
+        tensor_a = paddle.ones(self.shape)
+        tensor_b = tensor_a.geometric_(probs=0.3)
+        self.assertTrue(tensor_a is tensor_b)
+
+
+class TestGeometricInplaceSeedIsZero(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_geometric_inplace_op_not_equal(self):
+        tensor = paddle.ones(self.shape)
+        tensor.geometric_(probs=0.3)
+        tensor_data_first = tensor.numpy()
+        tensor.geometric_(probs=0.3)
+        tensor_data_second = tensor.numpy()
+        self.assertFalse((tensor_data_first == tensor_data_second).all())
+
+
+class TestGeometricInplaceOpShape(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_geometric_inplace_op_shape(self):
+        tensor = paddle.ones(self.shape)
+        tensor.geometric_(probs=0.3)
+        tensor_shape_np = np.array(tensor.shape)
+        origin_shape = np.array(self.shape)
+        self.assertTrue((tensor_shape_np == origin_shape).all())
+
+
+class TestGeometricInplaceDistribution(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+        self.probs = 0.3
+
+    def test_geometric_inplace_distribution(self):
+        a = paddle.ones(self.shape)
+        a.geometric_(self.probs)
+        np.testing.assert_allclose(
+            a.mean(axis=0), scipy.stats.geom.mean(self.probs), rtol=0.7, atol=0
+        )
+        np.testing.assert_allclose(
+            a.var(axis=0), scipy.stats.geom.var(self.probs), rtol=0.7, atol=0
+        )
+
+
+class TestGeometricInplaceEmptyTensor(unittest.TestCase):
+    def test_geometric_inplace_op_empty_tensor(self):
+        places = ['cpu']
+        if base.core.is_compiled_with_cuda():
+            places.append('gpu')
+        test_shapes = [(200, 1), (1, 200)]
+        for place in places:
+            paddle.set_device(place)
+            for test_shape in test_shapes:
+                tensor = paddle.empty(shape=test_shape)
+                tensor.geometric_(probs=0.3)
+                tensor_shape_np = np.array(tensor.shape)
+                origin_shape = np.array(test_shape)
+                self.assertTrue((tensor_shape_np == origin_shape).all())
+
+
+class TestGeometricInplaceGrad(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def run_(self):
+        def test_grad():
+            tensor_a = paddle.ones(self.shape)
+            tensor_a.stop_gradient = False
+            tensor_b = tensor_a * 0.5
+            tensor_b.retain_grads()
+            tensor_b.geometric_(probs=0.3)
+            loss = tensor_b.sum()
+            loss.backward()
+            geometric_grad = tensor_b.grad.numpy()
+            self.assertTrue((geometric_grad == 0).all())
+
+        places = ['cpu']
+        if base.core.is_compiled_with_cuda():
+            places.append('gpu')
+        for place in places:
+            paddle.set_device(place)
+            test_grad()
+
+    def test_geometric_inplace_grad(self):
+        self.run_()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/legacy_test/test_inplace.py b/test/legacy_test/test_inplace.py
index 676977ba2ac48..e3f1de1048e11 100644
--- a/test/legacy_test/test_inplace.py
+++ b/test/legacy_test/test_inplace.py
@@ -1434,5 +1434,53 @@ def non_inplace_api_processing(self, var):
         return paddle.multiply(var, self.y)
 
 
+class TestDygrapInplaceT(TestDygraphInplaceWithContinuous):
+    def init_data(self):
+        self.input_var_numpy = np.random.uniform(-5, 5, [10, 20])
+        self.dtype = "float32"
+
+    def inplace_api_processing(self, var):
+        return paddle.t_(var)
+
+    def non_inplace_api_processing(self, var):
+        return paddle.t(var)
+
+    def test_forward_version(self):
+        with paddle.base.dygraph.guard():
+            var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+            self.assertEqual(var.inplace_version, 0)
+
+            inplace_var = self.inplace_api_processing(var)
+            self.assertEqual(var.inplace_version, 1)
+
+            inplace_var[0] = 2
+            self.assertEqual(var.inplace_version, 1)
+
+            inplace_var = self.inplace_api_processing(inplace_var)
+            self.assertEqual(var.inplace_version, 2)
+
+
+class TestDygrapInplaceTranspose(TestDygraphInplaceWithContinuous):
+    def inplace_api_processing(self, var):
+        return paddle.transpose_(var, [1, 0, 2])
+
+    def non_inplace_api_processing(self, var):
+        return paddle.transpose(var, [1, 0, 2])
+
+    def test_forward_version(self):
+        with paddle.base.dygraph.guard():
+            var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype)
+            self.assertEqual(var.inplace_version, 0)
+
+            inplace_var = self.inplace_api_processing(var)
+            self.assertEqual(var.inplace_version, 1)
+
+            inplace_var[0] = 2
+            self.assertEqual(var.inplace_version, 1)
+
+            inplace_var = self.inplace_api_processing(inplace_var)
+            self.assertEqual(var.inplace_version, 2)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/legacy_test/test_normal_inplace.py b/test/legacy_test/test_normal_inplace.py
new file mode 100644
index 0000000000000..dc693a6652561
--- /dev/null
+++ b/test/legacy_test/test_normal_inplace.py
@@ -0,0 +1,156 @@
+#   Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import base
+
+
+def output_hist(out):
+    hist, _ = np.histogram(out, range=(-1, 1))
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.1 * np.ones(10)
+    return hist, prob
+
+
+class TestNormalRandomInplaceOpDtype(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_normal_inplace_op_dtype(self):
+        def test_fp32():
+            tensor_fp32 = paddle.ones(self.shape, dtype=paddle.float32)
+            tensor_fp32.normal_()
+            self.assertEqual(tensor_fp32.dtype, paddle.float32)
+
+        def test_fp64():
+            tensor_fp64 = paddle.ones(self.shape, paddle.float64)
+            tensor_fp64.normal_()
+            self.assertEqual(tensor_fp64.dtype, paddle.float64)
+
+        places = ['cpu']
+        if base.core.is_compiled_with_cuda():
+            places.append('gpu')
+        for place in places:
+            paddle.set_device(place)
+            test_fp32()
+            test_fp64()
+
+
+class TestNormalRandomInplaceOpIsInplace(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_normal_inplace_op_is_inplace(self):
+        tensor_a = paddle.ones(self.shape)
+        tensor_b = tensor_a.normal_()
+        self.assertTrue(tensor_a is tensor_b)
+
+
+class TestNormalRandomInplaceOpSeedIsZero(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_normal_inplace_op_not_equal(self):
+        tensor = paddle.ones(self.shape)
+        tensor.normal_()
+        tensor_data_first = tensor.numpy()
+        tensor.normal_()
+        tensor_data_second = tensor.numpy()
+        self.assertFalse((tensor_data_first == tensor_data_second).all())
+
+
+class TestNormalRandomInplaceOpShape(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def test_normal_inplace_op_shape(self):
+        tensor = paddle.ones(self.shape)
+        tensor.normal_()
+        tensor_shape_np = np.array(tensor.shape)
+        origin_shape = np.array(self.shape)
+        self.assertTrue((tensor_shape_np == origin_shape).all())
+
+
+class TestNormalRandomInplaceOpDistribution(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+        self.mean = -3
+        self.std = 5
+
+    def test_normal_inplace_op_distribution(self):
+        tensor = paddle.ones(self.shape)
+        tensor.normal_(self.mean, self.std)
+        ones = paddle.ones(self.shape)
+        zeros = paddle.zeros(self.shape)
+        all_num = self.shape[0] * self.shape[1]
+
+        std_probs = [0.68, 0.95, 0.997]
+        for index, prob in enumerate(std_probs):
+            left = self.mean - (index + 1) * self.std
+            right = self.mean + (index + 1) * self.std
+            cond = paddle.logical_and(tensor >= left, tensor <= right)
+            c_sum = paddle.where(cond, ones, zeros).sum()
+            np.testing.assert_allclose((c_sum / all_num), prob, 1e-2)
+
+
+class TestNormalRandomInplaceOpEmptyTensor(unittest.TestCase):
+    def test_normal_inplace_op_empty_tensor(self):
+        places = ['cpu']
+        if base.core.is_compiled_with_cuda():
+            places.append('gpu')
+        test_shapes = [(200, 0), (0, 200)]
+        for place in places:
+            paddle.set_device(place)
+            for test_shape in test_shapes:
+                tensor = paddle.empty(shape=test_shape)
+                tensor.normal_()
+                tensor_shape_np = np.array(tensor.shape)
+                origin_shape = np.array(test_shape)
+                self.assertTrue((tensor_shape_np == origin_shape).all())
+
+
+class TestNormalRandomInplaceGrad(unittest.TestCase):
+    def setUp(self):
+        self.shape = (1000, 784)
+
+    def run_(self):
+        def test_grad():
+            tensor_a = paddle.ones(self.shape)
+            tensor_a.stop_gradient = False
+            tensor_b = tensor_a * 0.5
+            tensor_b.retain_grads()
+            tensor_b.normal_(mean=-2, std=2)
+            loss = tensor_b.sum()
+            loss.backward()
+            normal_grad = tensor_b.grad.numpy()
+            self.assertTrue((normal_grad == 0).all())
+
+        places = ['cpu']
+        if base.core.is_compiled_with_cuda():
+            places.append('gpu')
+        for place in places:
+            paddle.set_device(place)
+            test_grad()
+
+    def test_normal_inplace_grad(self):
+        self.run_()
+
+
+if __name__ == '__main__':
+    unittest.main()

From 2552742f6aea9db96d76c40a556cbf94762da7b6 Mon Sep 17 00:00:00 2001
From: yinwei <1871465933@qq.com>
Date: Fri, 22 Sep 2023 11:23:27 +0800
Subject: [PATCH 058/115] update solve_grad_kernel_impl.h for compiling
 optimization (#57603)

---
 paddle/phi/kernels/impl/solve_grad_kernel_impl.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/kernels/impl/solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
index a661035ab5b74..9c5394e002201 100644
--- a/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/funcs/reduce_functor.h"
 #include "paddle/phi/kernels/impl/solve_kernel_impl.h"
+#include "paddle/phi/kernels/reduce_sum_kernel.h"
 #include "paddle/phi/kernels/squeeze_kernel.h"
 #include "paddle/phi/kernels/unsqueeze_kernel.h"
 
@@ -63,8 +64,8 @@ struct ReduceSumForSolvelGrad<GPUContext, T> {
                   DenseTensor* output,
                   const std::vector<int>& reduce_dims,
                   bool keep_dims) {
-    phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-        dev_ctx, input, output, kps::IdentityFunctor<T>(), reduce_dims);
+    phi::SumKernel<T, GPUContext>(
+        dev_ctx, input, reduce_dims, input.dtype(), false, output);
   }
 };
 #endif

From 1e1b4639f0625fedcb8e7afbfb231f00e98ebc94 Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Fri, 22 Sep 2023 11:24:06 +0800
Subject: [PATCH 059/115] Support float64 for TruncatedNormal and Assign
 (#57507)

* add float64 for API TruncatedNormal in GPU and CPU

* add float64 for API Assign in GPU and CPU

* remove _fp32 for 2 UT function

* add fp64 in static_ops.yaml

* remove TestAssignValueOp5

* add TestTruncatedNormalInitializerDygraph

* add unitest for Assign

* derived from unitest.TestCase

* update unitest

* add restore dtype code for unitest

* use dygraph_guard

* update fp64 for assign_value op maker

* update op_translator.cc

* update code

* update UT code

* remove reduncant code in paddle/fluid/ir_adaptor/translator/op_translator.cc
---
 paddle/fluid/operators/assign_value_op.h      |   5 +-
 paddle/phi/api/yaml/static_ops.yaml           |   2 +-
 paddle/phi/kernels/assign_kernel.cc           |   2 +
 .../cpu/truncated_gaussian_random_kernel.cc   |   3 +-
 .../gpu/truncated_gaussian_random_kernel.cu   |   3 +-
 paddle/phi/ops/compat/assign_value_sig.cc     |   3 +
 python/paddle/nn/initializer/assign.py        |   3 +
 test/legacy_test/test_initializer.py          | 240 +++++++++++++++++-
 test/legacy_test/test_initializer_nn.py       |   7 +
 .../test_truncated_gaussian_random_op.py      |  38 ++-
 10 files changed, 285 insertions(+), 21 deletions(-)

diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h
index 2522fa580f758..f5b74c5441174 100644
--- a/paddle/fluid/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
@@ -110,6 +110,9 @@ class AssignValueKernel : public framework::OpKernel<T> {
       case framework::proto::VarType::FP32:
         value_name = "fp32_values";
         break;
+      case framework::proto::VarType::FP64:
+        value_name = "fp64_values";
+        break;
       case framework::proto::VarType::INT64:
         value_name = "int64_values";
       case framework::proto::VarType::INT8:
@@ -118,7 +121,7 @@ class AssignValueKernel : public framework::OpKernel<T> {
       default:
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported data type(code %d) for AssignValue operator, only "
-            "supports bool, int32, float32, int8 and int64.",
+            "supports bool, int32, float32, float64, int8 and int64.",
             dtype));
         break;
     }
diff --git a/paddle/phi/api/yaml/static_ops.yaml b/paddle/phi/api/yaml/static_ops.yaml
index fc118c8a401de..9f8def740385b 100755
--- a/paddle/phi/api/yaml/static_ops.yaml
+++ b/paddle/phi/api/yaml/static_ops.yaml
@@ -90,7 +90,7 @@
   backward : assign_grad
 
 - op : assign_value
-  args : (int[] shape, DataType dtype, int[] bool_values = {}, float[] fp32_values = {}, int[] int32_values = {}, int64_t[] int64_values = {})
+  args : (int[] shape, DataType dtype, int[] bool_values = {}, float[] fp32_values = {}, double[] fp64_values = {}, int[] int32_values = {}, int64_t[] int64_values = {})
   output : Tensor(out)
   infer_meta :
     func : AssignValueInferMeta
diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc
index db30ec7389619..b828aefa012a7 100644
--- a/paddle/phi/kernels/assign_kernel.cc
+++ b/paddle/phi/kernels/assign_kernel.cc
@@ -132,6 +132,7 @@ PD_REGISTER_KERNEL(assign_value,
                    bool,
                    int,
                    float,
+                   double,
                    int8_t,
                    int64_t) {}
 
@@ -159,6 +160,7 @@ PD_REGISTER_KERNEL(assign_value,
                    bool,
                    int,
                    float,
+                   double,
                    int8_t,
                    int64_t) {}
 #endif
diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc
index 53346d92e78bf..d7efb76b4bf0e 100644
--- a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc
+++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc
@@ -59,4 +59,5 @@ PD_REGISTER_KERNEL(truncated_gaussian_random,
                    CPU,
                    ALL_LAYOUT,
                    phi::TruncatedGaussianRandomKernel,
-                   float) {}
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
index 698dcc20ad3fe..a7278302cf4e0 100644
--- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
+++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
@@ -124,4 +124,5 @@ PD_REGISTER_KERNEL(truncated_gaussian_random,
                    GPU,
                    ALL_LAYOUT,
                    phi::TruncatedGaussianRandomKernel,
-                   float) {}
+                   float,
+                   double) {}
diff --git a/paddle/phi/ops/compat/assign_value_sig.cc b/paddle/phi/ops/compat/assign_value_sig.cc
index 0fa1889ccde34..977c2260e59b9 100644
--- a/paddle/phi/ops/compat/assign_value_sig.cc
+++ b/paddle/phi/ops/compat/assign_value_sig.cc
@@ -33,6 +33,9 @@ KernelSignature AssignValueOpArgumentMapping(
   } else if (dtype == /*FP32*/ 5) {
     return KernelSignature(
         "assign_value", {}, {"shape", "dtype", "fp32_values"}, {"Out"});
+  } else if (dtype == /*FP64*/ 6) {
+    return KernelSignature(
+        "assign_value", {}, {"shape", "dtype", "fp64_values"}, {"Out"});
   } else if (dtype == /*INT64*/ 3) {
     return KernelSignature(
         "assign_value", {}, {"shape", "dtype", "int64_values"}, {"Out"});
diff --git a/python/paddle/nn/initializer/assign.py b/python/paddle/nn/initializer/assign.py
index a1cd06cab59b4..9f9947ccc6ed8 100644
--- a/python/paddle/nn/initializer/assign.py
+++ b/python/paddle/nn/initializer/assign.py
@@ -78,6 +78,9 @@ def forward(self, var, block=None):
         if out_dtype == core.VarDesc.VarType.FP32:
             value_name = "fp32_values"
             values = [float(v) for v in np_value.flat]
+        elif out_dtype == core.VarDesc.VarType.FP64:
+            value_name = "fp64_values"
+            values = [float(v) for v in np_value.flat]
         elif out_dtype == core.VarDesc.VarType.INT32:
             value_name = "int32_values"
             values = [int(v) for v in np_value.flat]
diff --git a/test/legacy_test/test_initializer.py b/test/legacy_test/test_initializer.py
index 52b2e4d5024dd..903f47671549e 100644
--- a/test/legacy_test/test_initializer.py
+++ b/test/legacy_test/test_initializer.py
@@ -16,6 +16,8 @@
 import unittest
 
 import numpy as np
+from scipy import special
+from utils import dygraph_guard, static_guard
 
 import paddle
 from paddle import base
@@ -796,7 +798,7 @@ def test_order(self):
         paddle.set_device('cpu')
         SEED = 123
         weight_attr = paddle.framework.ParamAttr(
-            name="linear_weight",
+            name="linear_weight2",
             learning_rate=1.0,
             trainable=False,
             regularizer=None,
@@ -805,7 +807,7 @@ def test_order(self):
             ),
         )
         bias_attr = paddle.framework.ParamAttr(
-            name="linear_bias",
+            name="linear_bias2",
             learning_rate=1.0,
             trainable=False,
             regularizer=None,
@@ -815,16 +817,32 @@ def test_order(self):
         )
 
         def run_dynamic_graph():
-            paddle.disable_static()
             paddle.seed(SEED)
             linear = paddle.nn.Linear(
-                1, 1, weight_attr=weight_attr, bias_attr=bias_attr
+                1,
+                1,
+                weight_attr=paddle.framework.ParamAttr(
+                    name="linear_weight1",
+                    learning_rate=1.0,
+                    trainable=False,
+                    regularizer=None,
+                    initializer=paddle.nn.initializer.TruncatedNormal(
+                        mean=0.0, std=2.0
+                    ),
+                ),
+                bias_attr=paddle.framework.ParamAttr(
+                    name="linear_bias1",
+                    learning_rate=1.0,
+                    trainable=False,
+                    regularizer=None,
+                    initializer=paddle.nn.initializer.TruncatedNormal(
+                        mean=0.0, std=2.0
+                    ),
+                ),
             )
             return linear.weight.numpy(), linear.bias.numpy()
-            paddle.enable_static()
 
         def run_static_graph():
-            paddle.enable_static()
             exe = paddle.static.Executor(paddle.CPUPlace())
             paddle.seed(SEED)
             linear = paddle.nn.Linear(
@@ -832,16 +850,93 @@ def run_static_graph():
             )
             res = exe.run(
                 paddle.static.default_startup_program(),
-                fetch_list=['linear_weight', 'linear_bias'],
+                fetch_list=['linear_weight2', 'linear_bias2'],
             )
             return res[0], res[1]
 
-        dynamic_res = run_dynamic_graph()
-        static_res = run_static_graph()
+        with dygraph_guard():
+            dynamic_res = run_dynamic_graph()
+        with static_guard():
+            static_res = run_static_graph()
 
         np.testing.assert_array_equal(dynamic_res[0], static_res[0])
         np.testing.assert_array_equal(dynamic_res[1], static_res[1])
 
+    def test_assign_static_fp32(self):
+        random_value = np.random.randn(128, 128).astype("float32")
+
+        def run_dynamic_graph(dtype):
+            with dygraph_guard():
+                w = paddle.create_parameter(
+                    random_value.shape,
+                    dtype,
+                    default_initializer=paddle.nn.initializer.Assign(
+                        random_value
+                    ),
+                )
+            return w
+
+        def run_static_graph(dtype):
+            with static_guard():
+                exe = paddle.static.Executor(paddle.CPUPlace())
+                w = paddle.create_parameter(
+                    random_value.shape,
+                    dtype,
+                    "w",
+                    default_initializer=paddle.nn.initializer.Assign(
+                        random_value
+                    ),
+                )
+                res = exe.run(
+                    paddle.static.default_startup_program(),
+                    fetch_list=['w'],
+                )
+            return res[0]
+
+        dynamic_res = run_dynamic_graph("float32")
+        static_res = run_static_graph("float32")
+
+        np.testing.assert_array_equal(dynamic_res.numpy(), static_res)
+        np.testing.assert_array_equal(dynamic_res.numpy(), static_res)
+
+    def test_assign_static_fp64(self):
+        random_value = np.random.randn(128, 128).astype("float64")
+
+        def run_dynamic_graph(dtype):
+            with dygraph_guard():
+                w = paddle.create_parameter(
+                    random_value.shape,
+                    dtype,
+                    "www",
+                    default_initializer=paddle.nn.initializer.Assign(
+                        random_value
+                    ),
+                )
+            return w
+
+        def run_static_graph(dtype):
+            with static_guard():
+                exe = paddle.static.Executor(paddle.CPUPlace())
+                w = paddle.create_parameter(
+                    random_value.shape,
+                    dtype,
+                    "ww",
+                    default_initializer=paddle.nn.initializer.Assign(
+                        random_value
+                    ),
+                )
+                res = exe.run(
+                    paddle.static.default_startup_program(),
+                    fetch_list=['ww'],
+                )
+            return res[0]
+
+        dynamic_res = run_dynamic_graph("float64")
+        static_res = run_static_graph("float64")
+
+        np.testing.assert_array_equal(dynamic_res.numpy(), static_res)
+        np.testing.assert_array_equal(dynamic_res.numpy(), static_res)
+
 
 # 2-D Parameter with shape: [10, 15]
 class TestOrthogonalInitializer1(unittest.TestCase):
@@ -1197,6 +1292,133 @@ def test_type_error(self):
         )
 
 
+class TestTruncatedNormalInitializerDygraph(unittest.TestCase):
+    def _trunc_normal_numpy(self, tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
+        # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+        def norm_cdf(x):
+            # Computes standard normal cumulative distribution function
+            return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+        # Values are generated by using a truncated uniform distribution and
+        # then using the inverse CDF for the normal distribution.
+        # Get upper and lower cdf values
+        l = norm_cdf((a - mean) / std)
+        u = norm_cdf((b - mean) / std)
+
+        # Uniformly fill tensor with values from [l, u], then translate to
+        # [2l-1, 2u-1].
+        _tensor = np.random.uniform(
+            low=2 * l - 1, high=2 * u - 1, size=tensor.shape
+        ).astype(paddle.get_default_dtype())
+        print(2 * l - 1, 2 * u - 1)
+
+        # Use inverse cdf transform for normal distribution to get truncated
+        # standard normal
+        _tensor = special.erfinv(_tensor)
+
+        # Transform to proper mean, std
+        _tensor = np.multiply(_tensor, std * math.sqrt(2.0))
+        _tensor = np.add(_tensor, mean)
+
+        # Clamp to ensure it"s in the proper range
+        _tensor = np.clip(_tensor, a_min=a, a_max=b)
+        return _tensor
+
+    def test_truncated_normal_initializer_fp32(self):
+        """
+        In dygraph mode, we can use initializer directly to initialize a tensor.
+        """
+        with dygraph_guard():
+            paddle.seed(42)
+            pre_dtype = paddle.get_default_dtype()
+            paddle.set_default_dtype("float32")
+
+            tensor = paddle.zeros([1024, 1024, 8])
+            tensor.stop_gradient = False
+
+            truncated_normal_ = paddle.nn.initializer.TruncatedNormal()
+            truncated_normal_(tensor)
+
+            array = self._trunc_normal_numpy(tensor)
+            np.testing.assert_allclose(
+                array.mean(), tensor.mean().item(), rtol=0.01, atol=0.01
+            )
+            np.testing.assert_allclose(
+                array.std(), tensor.std().item(), rtol=0.01, atol=0.01
+            )
+            paddle.set_default_dtype(pre_dtype)
+
+    def test_truncated_normal_initializer_fp64(self):
+        """
+        In dygraph mode, we can use initializer directly to initialize a tensor.
+        """
+        with dygraph_guard():
+            paddle.seed(42)
+            pre_dtype = paddle.get_default_dtype()
+            paddle.set_default_dtype("float64")
+
+            tensor = paddle.zeros([1024, 1024, 8])
+            tensor.stop_gradient = False
+
+            truncated_normal_ = paddle.nn.initializer.TruncatedNormal()
+            truncated_normal_(tensor)
+
+            array = self._trunc_normal_numpy(tensor)
+            np.testing.assert_allclose(
+                array.mean(), tensor.mean().item(), rtol=0.01, atol=0.01
+            )
+            np.testing.assert_allclose(
+                array.std(), tensor.std().item(), rtol=0.01, atol=0.01
+            )
+            paddle.set_default_dtype(pre_dtype)
+
+
+class TestAssignInitializerDygraph(unittest.TestCase):
+    def test_assign_initializer_fp32(self):
+        """
+        In dygraph mode, we can use initializer directly to initialize a tensor.
+        """
+        with dygraph_guard():
+            pre_dtype = paddle.get_default_dtype()
+            paddle.set_default_dtype("float32")
+
+            tensor = paddle.zeros(
+                [1024, 1024, 8], dtype=paddle.get_default_dtype()
+            )
+            tensor.stop_gradient = False
+            array = np.random.randn(*tensor.shape).astype(
+                paddle.get_default_dtype()
+            )
+
+            assign_ = paddle.nn.initializer.Assign(array)
+            assign_(tensor)
+
+            np.testing.assert_allclose(array, tensor, rtol=1e-6, atol=1e-6)
+            paddle.set_default_dtype(pre_dtype)
+
+    def test_assign_initializer_fp64(self):
+        """
+        In dygraph mode, we can use initializer directly to initialize a tensor.
+        """
+        with dygraph_guard():
+            pre_dtype = paddle.get_default_dtype()
+            paddle.set_default_dtype("float64")
+
+            tensor = paddle.zeros(
+                [1024, 1024, 8], dtype=paddle.get_default_dtype()
+            )
+            tensor.stop_gradient = False
+            array = np.random.randn(*tensor.shape).astype(
+                paddle.get_default_dtype()
+            )
+
+            assign_ = paddle.nn.initializer.Assign(array)
+            assign_(tensor)
+
+            np.testing.assert_allclose(array, tensor, rtol=1e-6, atol=1e-6)
+            paddle.set_default_dtype(pre_dtype)
+
+
 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()
diff --git a/test/legacy_test/test_initializer_nn.py b/test/legacy_test/test_initializer_nn.py
index b0b0e0bef268d..95c64ac648290 100644
--- a/test/legacy_test/test_initializer_nn.py
+++ b/test/legacy_test/test_initializer_nn.py
@@ -15,6 +15,7 @@
 import unittest
 
 import numpy as np
+from utils import static_guard
 
 import paddle
 from paddle import base, nn
@@ -490,6 +491,12 @@ def test_truncated_normal_initializer_bf16(self):
         block = self.test_truncated_normal_initializer("uint16")  # bfloat16
         self.assertTrue(check_cast_op(block.ops[1]))
 
+    def test_truncated_normal_initializer_fp64(self):
+        """Test truncated normal initializer with float64"""
+        with static_guard():
+            # Only test whether float64 data can be generated without error
+            _ = self.test_truncated_normal_initializer("float64")  # float64
+
     def test_truncated_normal_initializer_dygraph(self):
         """Test truncated normal initializer in dygraph model."""
         paddle.disable_static()
diff --git a/test/legacy_test/test_truncated_gaussian_random_op.py b/test/legacy_test/test_truncated_gaussian_random_op.py
index 0f56c5f9ef15e..eb8b502b082d4 100644
--- a/test/legacy_test/test_truncated_gaussian_random_op.py
+++ b/test/legacy_test/test_truncated_gaussian_random_op.py
@@ -35,20 +35,42 @@ def setUp(self):
         self.outputs = ["Out"]
 
     def test_cpu(self):
-        self.gaussian_random_test(place=base.CPUPlace())
-        self.gaussian_random_test_eager(place=base.CPUPlace())
+        self._gaussian_random_test(
+            place=base.CPUPlace(), dtype=core.VarDesc.VarType.FP32
+        )
+        self._gaussian_random_test(
+            place=base.CPUPlace(), dtype=core.VarDesc.VarType.FP64
+        )
+        self._gaussian_random_test_eager(
+            place=base.CPUPlace(), dtype=core.VarDesc.VarType.FP32
+        )
+        self._gaussian_random_test_eager(
+            place=base.CPUPlace(), dtype=core.VarDesc.VarType.FP64
+        )
 
     def test_gpu(self):
         if core.is_compiled_with_cuda():
-            self.gaussian_random_test(place=base.CUDAPlace(0))
-            self.gaussian_random_test_eager(place=base.CUDAPlace(0))
+            self._gaussian_random_test(
+                place=base.CUDAPlace(0), dtype=core.VarDesc.VarType.FP32
+            )
+            self._gaussian_random_test(
+                place=base.CUDAPlace(0), dtype=core.VarDesc.VarType.FP64
+            )
+            self._gaussian_random_test_eager(
+                place=base.CUDAPlace(0), dtype=core.VarDesc.VarType.FP32
+            )
+            self._gaussian_random_test_eager(
+                place=base.CUDAPlace(0), dtype=core.VarDesc.VarType.FP64
+            )
 
-    def gaussian_random_test(self, place):
+    def _gaussian_random_test(self, place, dtype):
         program = base.Program()
         block = program.global_block()
         vout = block.create_var(name="Out")
         op = block.append_op(
-            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs
+            type=self.op_type,
+            outputs={"Out": vout},
+            attrs={**self.attrs, "dtype": dtype},
         )
 
         op.desc.infer_var_type(block.desc)
@@ -66,14 +88,14 @@ def gaussian_random_test(self, place):
 
     # TruncatedNormal.__call__ has no return value, so here call _C_ops api
     # directly
-    def gaussian_random_test_eager(self, place):
+    def _gaussian_random_test_eager(self, place, dtype):
         with base.dygraph.guard(place):
             out = paddle._C_ops.truncated_gaussian_random(
                 self.attrs["shape"],
                 self.attrs["mean"],
                 self.attrs["std"],
                 self.attrs["seed"],
-                core.VarDesc.VarType.FP32,
+                dtype,
                 place,
             )
             self.assertAlmostEqual(numpy.mean(out.numpy()), 0.0, delta=0.1)

From bcc053dab7762f5967bec72a870e94b075186f22 Mon Sep 17 00:00:00 2001
From: wanghuancoder <wanghuan29@baidu.com>
Date: Fri, 22 Sep 2023 11:29:12 +0800
Subject: [PATCH 060/115] support seslected rows (#57451)

* support seslected rows
---
 .../ir_adaptor/translator/op_translator.cc    |  88 ++
 paddle/fluid/ir_adaptor/translator/utils.cc   |  13 +
 paddle/fluid/ir_adaptor/translator/utils.h    |  22 +
 .../fluid/pir/dialect/op_generator/op_gen.py  | 837 +++++++++++-------
 .../pir/dialect/operator/ir/manual_op.cc      |  12 +-
 paddle/fluid/pir/dialect/operator/ir/ops.yaml |   6 +
 .../operator/utils/op_yaml_info_util.h        |   4 +-
 .../pir/transforms/pd_op_to_kernel_pass.cc    |  54 +-
 paddle/phi/api/yaml/op_compat.yaml            |  12 +
 .../pattern_rewrite/pattern_rewrite_test.cc   |   2 +-
 .../test_new_ir_selectedrows.py               | 107 +++
 11 files changed, 804 insertions(+), 353 deletions(-)
 create mode 100644 test/dygraph_to_static/test_new_ir_selectedrows.py

diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc
index b11101de616b8..eac21fae5c7cd 100644
--- a/paddle/fluid/ir_adaptor/translator/op_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc
@@ -225,6 +225,94 @@ pir::OpInfo OpTranscriber::LoopkUpOpInfo(pir::IrContext* ctx,
              target_op_name);
   }
 
+  if (!paddle::dialect::HaveOpToMultiKernelsMap(
+          OpNameCompatibleMapping(op_desc.Type()))) {
+    return op_info;
+  }
+
+  // for selected rows kernel choose
+  auto* op_info_concept =
+      op_info.GetInterfaceImpl<dialect::OpYamlInfoInterface>();
+
+  OpInputInfoList input_infos;
+  OpAttributeInfoList attr_infos;
+  OpOutputInfoList output_infos;
+  std::tie(input_infos, attr_infos, output_infos, std::ignore, std::ignore) =
+      op_info_concept->get_op_info_();
+
+  auto& op_normalizer = OpNameNormalizer::instance();
+  std::vector<std::string> need_inputs_sig;
+  for (const auto& info : input_infos) {
+    if (info.is_mutable_attribute) {
+      continue;
+    }
+    std::string legacy_input_name =
+        op_normalizer.GetLegacyArgName(op_desc.Type(), info.name);
+    auto legacy_input_vars = op_desc.Input(legacy_input_name, true);
+    IR_ENFORCE(legacy_input_vars.size() <= 1,
+               "Do not support duplicable tensor input, when op have multi "
+               "kernels. OP is %s",
+               op_desc.Type());
+
+    if (legacy_input_vars.empty()) {
+      need_inputs_sig.emplace_back("");
+      continue;
+    }
+    VarDesc* var = op_desc.Block()->FindVarRecursive(legacy_input_vars[0]);
+    if (var->GetType() == paddle::framework::proto::VarType::LOD_TENSOR) {
+      need_inputs_sig.emplace_back("dense");
+    } else if (var->GetType() ==
+               paddle::framework::proto::VarType::SELECTED_ROWS) {
+      need_inputs_sig.emplace_back("selected_rows");
+    } else {
+      IR_THROW("Op %d only support densetensor and selected_rows, but not %d",
+               op_desc.Type(),
+               var->GetType());
+    }
+  }
+
+  target_op_name = OpNameCompatibleMapping(op_desc.Type());
+
+  auto sig_infos = paddle::dialect::LegacyOpToPdOpsMapping(target_op_name);
+
+  target_op_name = "";
+  for (const auto& sig : sig_infos) {
+    if (need_inputs_sig.size() != sig.inputs.size()) {
+      continue;
+    }
+    size_t i;
+    for (i = 0; i < need_inputs_sig.size(); ++i) {
+      if (need_inputs_sig[i] == "") {
+        continue;
+      }
+      if (need_inputs_sig[i] != sig.inputs[i]) {
+        break;
+      }
+    }
+    if (i == need_inputs_sig.size()) {
+      target_op_name = sig.name;
+      break;
+    }
+  }
+
+  IR_ENFORCE(!target_op_name.empty(),
+             "Op %d should have corresponding OpInfo %d",
+             op_desc.Type(),
+             target_op_name);
+
+  target_op_name = kTargetDialectPrefix + target_op_name;
+  if (IsInplace(op_desc) && *target_op_name.rbegin() != '_') {
+    target_op_name += "_";
+  }
+  VLOG(6) << "[op name normalizing]: " << op_desc.Type() << " to "
+          << target_op_name;
+  op_info = ctx->GetRegisteredOpInfo(target_op_name);
+  if (!op_info) {
+    IR_THROW("Op %d should have corresponding OpInfo %d",
+             op_desc.Type(),
+             target_op_name);
+  }
+
   return op_info;
 }
 
diff --git a/paddle/fluid/ir_adaptor/translator/utils.cc b/paddle/fluid/ir_adaptor/translator/utils.cc
index 078f65f14389b..5ee0c91b5bae5 100644
--- a/paddle/fluid/ir_adaptor/translator/utils.cc
+++ b/paddle/fluid/ir_adaptor/translator/utils.cc
@@ -18,11 +18,24 @@
 
 #include "paddle/fluid/ir_adaptor/translator/op_translator.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
+#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/pir/core/builtin_attribute.h"
 #include "paddle/pir/core/builtin_type.h"
 #include "paddle/pir/core/enforce.h"
 #include "paddle/pir/core/utils.h"
 
+namespace paddle {
+namespace dialect {
+bool HaveOpToMultiKernelsMap(std::string op_name) {
+  return op_to_multi_kernels_map.find(op_name) != op_to_multi_kernels_map.end();
+}
+
+const std::vector<PdOpSig>& LegacyOpToPdOpsMapping(std::string op_name) {
+  return op_to_multi_kernels_map[op_name];
+}
+}  // namespace dialect
+}  // namespace paddle
+
 namespace paddle {
 namespace translator {
 
diff --git a/paddle/fluid/ir_adaptor/translator/utils.h b/paddle/fluid/ir_adaptor/translator/utils.h
index a9698ef6292b5..8745ee2ac0d7b 100644
--- a/paddle/fluid/ir_adaptor/translator/utils.h
+++ b/paddle/fluid/ir_adaptor/translator/utils.h
@@ -23,6 +23,28 @@
 #include "paddle/pir/core/operation.h"
 #include "paddle/pir/core/program.h"
 
+namespace paddle {
+namespace dialect {
+struct PdOpSig {
+  std::string name;
+  std::vector<std::string> inputs;
+  std::vector<std::string> outputs;
+  PdOpSig() = default;
+  PdOpSig(const PdOpSig& input_info) = default;
+
+  PdOpSig(const std::string& name,
+          const std::vector<std::string>& inputs,
+          const std::vector<std::string>& outputs)
+      : name(name), inputs(inputs), outputs(outputs) {}
+};
+
+bool HaveOpToMultiKernelsMap(std::string op_name);
+
+const std::vector<PdOpSig>& LegacyOpToPdOpsMapping(std::string op_name);
+
+}  // namespace dialect
+}  // namespace paddle
+
 namespace paddle {
 namespace translator {
 
diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py
index 46949bcb547a7..8ae9cc587ca57 100644
--- a/paddle/fluid/pir/dialect/op_generator/op_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py
@@ -66,6 +66,9 @@
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h"
+#include "paddle/fluid/ir_adaptor/translator/utils.h"
+
+{op_to_multi_kernels_map}
 
 {input}
 
@@ -73,6 +76,10 @@
 #endif
 """
 
+OP_TO_MULTI_KERNELS_MAP_H = """
+extern std::unordered_map<std::string, std::vector<PdOpSig>> op_to_multi_kernels_map;
+"""
+
 GET_OP_LIST_TEMPALTE = """{}
 """
 
@@ -129,6 +136,8 @@ class {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{
 #include "paddle/fluid/primitive/rule/vjp/vjp.h"
 #include "paddle/pir/core/op_base.h"
 
+{op_to_multi_kernels_map}
+
 {input}
 
 {define_type_id}
@@ -162,7 +171,7 @@ class {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{
   std::vector<paddle::dialect::OpInputInfo> inputs = {{ {inputs} }};
   std::vector<paddle::dialect::OpAttributeInfo> attributes = {{ {attributes} }};
   std::vector<paddle::dialect::OpOutputInfo> outputs = {{ {outputs} }};
-  paddle::dialect::OpRunTimeInfo run_time_info = paddle::dialect::OpRunTimeInfo("{infer_meta_func}", {{"{infer_meta_param}"}}, {{"{kernel_func}"}}, {{"{kernel_param}"}}, {{{kernel_key_dtype}}}, {{{kernel_key_backend}}}, {{{inplace}}}, {{{view}}});
+  paddle::dialect::OpRunTimeInfo run_time_info = paddle::dialect::OpRunTimeInfo("{infer_meta_func}", {{"{infer_meta_param}"}}, "{kernel_func}", {{"{kernel_param}"}}, {{{kernel_key_dtype}}}, {{{kernel_key_backend}}}, {{{inplace}}}, {{{view}}});
   return std::make_tuple(inputs, attributes, outputs, run_time_info, "{origin_op_name}");
 }}
 """
@@ -175,6 +184,12 @@ class {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{
 IR_DEFINE_EXPLICIT_TYPE_ID({op_name})
 """
 
+OP_TO_MULTI_KERNELS_MAPS = """
+std::unordered_map<std::string, std::vector<PdOpSig>> op_to_multi_kernels_map = {{
+{maps}
+}};
+"""
+
 scalar_type_maps = {
     'int': 'pir::Int32Attribute',
     'int64_t': 'pir::Int64Attribute',
@@ -245,9 +260,13 @@ def __init__(self, op_yaml_item, op_compat_item):
         self.op_yaml_item = op_yaml_item
         self.op_compat_item = op_compat_item
         self.op_phi_name = self.parse_op_phi_name()
+
+        self.kernel_map = self.parse_kernel_map()
+
         # parse inputs
         self.input_name_list = self.parse_input_name_list()
         self.input_type_list = self.parse_input_type_list()
+        self.input_type_dict = self.parse_input_type_dict()
         self.input_optional_list = self.parse_input_optional_list()
         self.input_no_need_buffer_list = self.parse_input_no_need_buffer_list()
         self.cross_check(
@@ -257,6 +276,7 @@ def __init__(self, op_yaml_item, op_compat_item):
         # parse outputs
         self.output_name_list = self.parse_output_name_list()
         self.output_type_list = self.parse_output_type_list()
+        self.output_type_dict = self.parse_output_type_dict()
         self.output_size_list = self.parse_output_size_list()
         self.output_optional_list = self.parse_output_optional_list()
         self.output_intermediate_list = self.parse_output_intermediate_list()
@@ -344,7 +364,6 @@ def __init__(self, op_yaml_item, op_compat_item):
 
         # parse infermeta && kernel
         self.infer_meta_map = self.parse_infer_meta_map()
-        self.kernel_map = self.parse_kernel_map()
         self.invoke_map = self.parse_invoke_map()
         if 'infer_meta' in self.op_yaml_item:
             self.infer_meta_func = self.op_yaml_item['infer_meta']["func"]
@@ -383,7 +402,7 @@ def cross_check(self, name_list, type_list, optional_list=None):
             type_list
         ), "name list size != type list size."
         if optional_list is not None:
-            assert len(type_list) == len(
+            assert len(name_list) == len(
                 optional_list
             ), "type list size != optional list size."
 
@@ -562,6 +581,48 @@ def parse_input_type_list(self):
             type_list.append(input_types_map[input_info['typename']])
         return type_list
 
+    def parse_input_type_dict(self):
+        type_dict = {}
+
+        if (
+            self.kernel_map is None
+            or self.kernel_map['dispatch'][self.kernel_map['func'][0]] is None
+        ):
+            input_types_map = {
+                'Tensor': 'paddle::dialect::DenseTensorType',
+                'Tensor[]': 'pir::VectorType<paddle::dialect::DenseTensorType>',
+            }
+            type_list = []
+            for input_info in self.op_yaml_item['inputs']:
+                assert (
+                    input_info['typename'] in input_types_map
+                ), f"{self.op_phi_name} : Input type error: the input type only support Tensor and Tensor[], but now is {input_info['typename']}."
+                type_list.append(input_types_map[input_info['typename']])
+
+            if self.kernel_map is None:
+                type_dict['default'] = type_list
+            else:
+                type_dict[self.kernel_map['func'][0]] = type_list
+
+        else:
+            input_types_map = {
+                'dense': 'paddle::dialect::DenseTensorType',
+                'selected_rows': 'paddle::dialect::SelectedRowsType',
+            }
+
+            for kernel_func_name in self.kernel_map['func']:
+                inputs = self.kernel_map['dispatch'][kernel_func_name][0]
+                type_list = []
+                for input_info in inputs:
+                    assert (
+                        input_info in input_types_map
+                    ), f"{self.op_phi_name} : Input type error: the input type only support dense and selected_rows, but now is {input_info}."
+                    type_list.append(input_types_map[input_info])
+
+                type_dict[kernel_func_name] = type_list
+
+        return type_dict
+
     def parse_input_optional_list(self):
         optional_list = []
         for input_info in self.op_yaml_item['inputs']:
@@ -600,6 +661,49 @@ def parse_output_type_list(self):
             type_list.append(output_type_map[output_info['typename']])
         return type_list
 
+    def parse_output_type_dict(self):
+        type_dict = {}
+
+        if (
+            self.kernel_map is None
+            or self.kernel_map['dispatch'][self.kernel_map['func'][0]] is None
+        ):
+            output_type_map = {
+                'Tensor': 'paddle::dialect::DenseTensorType',
+                'Tensor[]': 'pir::VectorType<paddle::dialect::DenseTensorType>',
+                'SelectedRows': 'paddle::dialect::SelectedRowsType',
+            }
+            type_list = []
+            for output_info in self.op_yaml_item['outputs']:
+                assert (
+                    output_info['typename'] in output_type_map
+                ), f"{self.op_phi_name} : Output type error: the output type only support Tensor and Tensor[], but now is {output_info['typename']}."
+                type_list.append(output_type_map[output_info['typename']])
+
+            if self.kernel_map is None:
+                type_dict['default'] = type_list
+            else:
+                type_dict[self.kernel_map['func'][0]] = type_list
+
+        else:
+            output_type_map = {
+                'dense': 'paddle::dialect::DenseTensorType',
+                'selected_rows': 'paddle::dialect::SelectedRowsType',
+            }
+
+            for kernel_func_name in self.kernel_map['func']:
+                outputs = self.kernel_map['dispatch'][kernel_func_name][1]
+                type_list = []
+                for output_info in outputs:
+                    assert (
+                        output_info in output_type_map
+                    ), f"{self.op_phi_name} : Input type error: the input type only support dense and selected_rows, but now is {output_info}."
+                    type_list.append(output_type_map[output_info])
+
+                type_dict[kernel_func_name] = type_list
+
+        return type_dict
+
     def parse_output_size_list(self):
         size_list = []
         for output_info in self.op_yaml_item['outputs']:
@@ -878,15 +982,14 @@ def OpGenerator(
     for custom_vjp in vjp_gen.CUSTOM_VJP:
         custom_vjp_op_name_list.append(custom_vjp[:-5])  # cut _grad
 
+    op_to_multi_kernels_list = []
     for key, op_info in op_info_items.items():
         # get op inputs info
         op_input_name_list = op_info.input_name_list
-        op_input_type_list = op_info.input_type_list
         op_input_optional_list = op_info.input_optional_list
         op_input_no_need_buffer_list = op_info.input_no_need_buffer_list
         # get op outputs info
         op_output_name_list = op_info.output_name_list
-        op_output_type_list = op_info.output_type_list
         op_output_size_list = op_info.output_size_list
         op_output_optional_list = op_info.output_optional_list
         op_output_intermediate_list = op_info.output_intermediate_list
@@ -953,105 +1056,76 @@ def OpGenerator(
         for op_name in op_info.op_phi_name:
             if op_name in PD_MANUAL_OP_LIST:
                 continue
-            op_class_name = to_pascal_case(op_name) + "Op"
-            op_dialect_name = dialect_name + "." + op_name
-
-            # =================================== #
-            #    gen interface/trait list str     #
-            # =================================== #
-            op_interfaces_str = ""
-            if len(op_interfaces) > 0:
-                op_interfaces_str = "," + ",".join(op_interfaces)
-
-            if op_name[-1] == "_":
-                op_traits += ["paddle::dialect::InplaceTrait"]
-
-            op_traits_str = ""
-            if len(op_traits) > 0:
-                op_traits_str = "," + ",".join(op_traits)
-
-            # =================================== #
-            #  gen get input/output methods str   #
-            # =================================== #
-            op_get_inputs_outputs_str = gen_op_get_inputs_outputs_str(
-                op_input_name_list,
-                op_mutable_attribute_name_list,
-                op_output_name_list,
-            )
+            if op_kernel_map is None:
+                func_list = [None]
+            else:
+                func_list = op_kernel_map['func']
 
-            # =================================== #
-            #         gen Build methods str       #
-            # =================================== #
-            build_args_with_muta_attr_not_input_for_declare = ""
-            build_func_with_muta_attr_not_input = ""
-            build_mutable_attr_is_input = ""
-            build_func_with_muta_attr_is_input_with_attr_is_map = ""
-            build_attr_num_over_1 = ""
-            build_mutable_attr_is_input_attr_num_over_1 = ""
-            build_func_with_attr_is_map = ""
-            build_func_with_muta_attr_is_input = ""
-
-            if op_infer_meta_map is not None:
-                (
-                    build_args_with_muta_attr_not_input_for_declare,
-                    build_func_with_muta_attr_not_input,
-                ) = gen_build_func_str(
-                    op_class_name,
+            for kernel_func_name in func_list:
+                if len(func_list) == 1:
+                    op_class_name = to_pascal_case(op_name) + "Op"
+                    op_dialect_name = dialect_name + "." + op_name
+                else:
+                    pascal_kernel_func_name = to_pascal_case(kernel_func_name)
+                    if op_name[-1] == "_":
+                        op_class_name = pascal_kernel_func_name + "_Op"
+                        op_dialect_name = (
+                            dialect_name + "." + kernel_func_name + "_"
+                        )
+                    else:
+                        op_class_name = pascal_kernel_func_name + "Op"
+                        op_dialect_name = dialect_name + "." + kernel_func_name
+
+                if kernel_func_name is None:
+                    op_input_type_list = op_info.input_type_dict['default']
+                    op_output_type_list = op_info.output_type_dict['default']
+                else:
+                    op_input_type_list = op_info.input_type_dict[
+                        kernel_func_name
+                    ]
+                    op_output_type_list = op_info.output_type_dict[
+                        kernel_func_name
+                    ]
+
+                # =================================== #
+                #    gen interface/trait list str     #
+                # =================================== #
+                op_interfaces_str = ""
+                if len(op_interfaces) > 0:
+                    op_interfaces_str = "," + ",".join(op_interfaces)
+
+                if op_name[-1] == "_":
+                    op_traits += ["paddle::dialect::InplaceTrait"]
+
+                op_traits_str = ""
+                if len(op_traits) > 0:
+                    op_traits_str = "," + ",".join(op_traits)
+
+                # =================================== #
+                #  gen get input/output methods str   #
+                # =================================== #
+                op_get_inputs_outputs_str = gen_op_get_inputs_outputs_str(
                     op_input_name_list,
-                    op_input_type_list,
-                    op_input_optional_list,
-                    op_attribute_name_list,
-                    op_attribute_type_list,
-                    op_attribute_build_arg_type_list,
-                    op_attribute_default_value_list,
                     op_mutable_attribute_name_list,
-                    op_mutable_attribute_type_list,
-                    op_non_mutable_attribute_name_list,
-                    op_non_mutable_attribute_type_list,
-                    op_non_mutable_attribute_build_arg_type_list,
-                    op_non_mutable_attribute_default_value_list,
                     op_output_name_list,
-                    op_output_type_list,
-                    op_output_size_list,
-                    op_output_optional_list,
-                    op_infer_meta_map,
-                    op_inplace_map,
-                    muta_attr_is_input=False,
                 )
-                if len(op_attribute_name_list) > 0:
-                    (
-                        build_args_with_attr_is_map_for_declare,
-                        build_func_with_attr_is_map,
-                    ) = gen_build_func_str(
-                        op_class_name,
-                        op_input_name_list,
-                        op_input_type_list,
-                        op_input_optional_list,
-                        op_attribute_name_list,
-                        op_attribute_type_list,
-                        op_attribute_build_arg_type_list,
-                        op_attribute_default_value_list,
-                        op_mutable_attribute_name_list,
-                        op_mutable_attribute_type_list,
-                        op_non_mutable_attribute_name_list,
-                        op_non_mutable_attribute_type_list,
-                        op_non_mutable_attribute_build_arg_type_list,
-                        op_non_mutable_attribute_default_value_list,
-                        op_output_name_list,
-                        op_output_type_list,
-                        op_output_size_list,
-                        op_output_optional_list,
-                        op_infer_meta_map,
-                        op_inplace_map,
-                        muta_attr_is_input=False,
-                        attr_args_is_map=True,
-                    )
-                    build_attr_num_over_1 = f"static void Build({build_args_with_attr_is_map_for_declare});"
 
-                if len(op_mutable_attribute_name_list) > 0:
+                # =================================== #
+                #         gen Build methods str       #
+                # =================================== #
+                build_args_with_muta_attr_not_input_for_declare = ""
+                build_func_with_muta_attr_not_input = ""
+                build_mutable_attr_is_input = ""
+                build_func_with_muta_attr_is_input_with_attr_is_map = ""
+                build_attr_num_over_1 = ""
+                build_mutable_attr_is_input_attr_num_over_1 = ""
+                build_func_with_attr_is_map = ""
+                build_func_with_muta_attr_is_input = ""
+
+                if op_infer_meta_map is not None:
                     (
-                        build_args_with_muta_attr_is_input_for_declare,
-                        build_func_with_muta_attr_is_input,
+                        build_args_with_muta_attr_not_input_for_declare,
+                        build_func_with_muta_attr_not_input,
                     ) = gen_build_func_str(
                         op_class_name,
                         op_input_name_list,
@@ -1073,17 +1147,12 @@ def OpGenerator(
                         op_output_optional_list,
                         op_infer_meta_map,
                         op_inplace_map,
-                        muta_attr_is_input=True,
-                    )
-
-                    build_mutable_attr_is_input = "static void Build({build_args});".format(
-                        build_args=build_args_with_muta_attr_is_input_for_declare
+                        muta_attr_is_input=False,
                     )
-
-                    if len(op_non_mutable_attribute_name_list) > 0:
+                    if len(op_attribute_name_list) > 0:
                         (
-                            build_args_with_muta_attr_is_input_with_attr_is_map_for_declare,
-                            build_func_with_muta_attr_is_input_with_attr_is_map,
+                            build_args_with_attr_is_map_for_declare,
+                            build_func_with_attr_is_map,
                         ) = gen_build_func_str(
                             op_class_name,
                             op_input_name_list,
@@ -1105,263 +1174,333 @@ def OpGenerator(
                             op_output_optional_list,
                             op_infer_meta_map,
                             op_inplace_map,
-                            muta_attr_is_input=True,
+                            muta_attr_is_input=False,
                             attr_args_is_map=True,
                         )
+                        build_attr_num_over_1 = f"static void Build({build_args_with_attr_is_map_for_declare});"
 
-                        build_mutable_attr_is_input_attr_num_over_1 = "static void Build({build_args});".format(
-                            build_args=build_args_with_muta_attr_is_input_with_attr_is_map_for_declare
+                    if len(op_mutable_attribute_name_list) > 0:
+                        (
+                            build_args_with_muta_attr_is_input_for_declare,
+                            build_func_with_muta_attr_is_input,
+                        ) = gen_build_func_str(
+                            op_class_name,
+                            op_input_name_list,
+                            op_input_type_list,
+                            op_input_optional_list,
+                            op_attribute_name_list,
+                            op_attribute_type_list,
+                            op_attribute_build_arg_type_list,
+                            op_attribute_default_value_list,
+                            op_mutable_attribute_name_list,
+                            op_mutable_attribute_type_list,
+                            op_non_mutable_attribute_name_list,
+                            op_non_mutable_attribute_type_list,
+                            op_non_mutable_attribute_build_arg_type_list,
+                            op_non_mutable_attribute_default_value_list,
+                            op_output_name_list,
+                            op_output_type_list,
+                            op_output_size_list,
+                            op_output_optional_list,
+                            op_infer_meta_map,
+                            op_inplace_map,
+                            muta_attr_is_input=True,
                         )
 
-            if (op_invoke_map is not None) and (
-                op_invoke_map['func'] in op_info_items
-            ):
-                op_invoke_class_name = (
-                    to_pascal_case(op_invoke_map['func']) + "Op"
-                )
-
-                (
-                    build_args_with_muta_attr_not_input_for_declare,
-                    build_func_with_muta_attr_not_input,
-                ) = gen_build_func_str_by_invoke(
-                    op_class_name,
-                    op_input_name_list,
-                    op_input_type_list,
-                    op_input_optional_list,
-                    op_attribute_name_list,
-                    op_attribute_type_list,
-                    op_attribute_build_arg_type_list,
-                    op_attribute_default_value_list,
-                    op_mutable_attribute_name_list,
-                    op_mutable_attribute_type_list,
-                    op_non_mutable_attribute_name_list,
-                    op_non_mutable_attribute_type_list,
-                    op_non_mutable_attribute_build_arg_type_list,
-                    op_non_mutable_attribute_default_value_list,
-                    op_invoke_class_name,
-                    op_invoke_map,
-                )
+                        build_mutable_attr_is_input = "static void Build({build_args});".format(
+                            build_args=build_args_with_muta_attr_is_input_for_declare
+                        )
+                if (op_invoke_map is not None) and (
+                    op_invoke_map['func'] in op_info_items
+                ):
+                    op_invoke_class_name = (
+                        to_pascal_case(op_invoke_map['func']) + "Op"
+                    )
 
-            # gen op_declare_str/op_defined_str
-            if len(op_non_mutable_attribute_name_list) == 0:
-                op_declare_str = OP_DECLARE_TEMPLATE.format(
-                    op_name=op_class_name,
-                    dialect_op_name=op_dialect_name,
-                    interfaces=op_interfaces_str,
-                    traits=op_traits_str,
-                    attribute_declare=op_0_attribute_declare_str,
-                    attribute_num=0,
-                    build_args=build_args_with_muta_attr_not_input_for_declare,
-                    build_mutable_attr_is_input=build_mutable_attr_is_input,
-                    build_attr_num_over_1=build_attr_num_over_1,
-                    build_mutable_attr_is_input_attr_num_over_1=build_mutable_attr_is_input_attr_num_over_1,
-                    get_inputs_and_outputs=op_get_inputs_outputs_str,
-                    exclusive_interface=exclusive_interface_str,
-                )
-                op_defined_str = ""
-            else:
-                op_declare_str = OP_DECLARE_TEMPLATE.format(
-                    op_name=op_class_name,
-                    dialect_op_name=op_dialect_name,
-                    interfaces=op_interfaces_str,
-                    traits=op_traits_str,
-                    attribute_declare=op_n_attribute_declare_str.format(
-                        attribute_num=len(op_non_mutable_attribute_name_list)
-                    ),
-                    attribute_num=len(op_non_mutable_attribute_name_list),
-                    build_args=build_args_with_muta_attr_not_input_for_declare,
-                    build_mutable_attr_is_input=build_mutable_attr_is_input,
-                    build_attr_num_over_1=build_attr_num_over_1,
-                    build_mutable_attr_is_input_attr_num_over_1=build_mutable_attr_is_input_attr_num_over_1,
-                    get_inputs_and_outputs=op_get_inputs_outputs_str,
-                    exclusive_interface=exclusive_interface_str,
-                )
-                attribute_names_str = (
-                    '"' + '", "'.join(op_non_mutable_attribute_name_list) + '"'
-                )
-                op_defined_str = OP_N_ATTRIBUTE_DEFINED_TEMPLATE.format(
-                    op_name=op_class_name,
-                    attribute_num=len(op_non_mutable_attribute_name_list),
-                    attribute_names=attribute_names_str,
-                )
+                    (
+                        build_args_with_muta_attr_not_input_for_declare,
+                        build_func_with_muta_attr_not_input,
+                    ) = gen_build_func_str_by_invoke(
+                        op_class_name,
+                        op_input_name_list,
+                        op_input_type_list,
+                        op_input_optional_list,
+                        op_attribute_name_list,
+                        op_attribute_type_list,
+                        op_attribute_build_arg_type_list,
+                        op_attribute_default_value_list,
+                        op_mutable_attribute_name_list,
+                        op_mutable_attribute_type_list,
+                        op_non_mutable_attribute_name_list,
+                        op_non_mutable_attribute_type_list,
+                        op_non_mutable_attribute_build_arg_type_list,
+                        op_non_mutable_attribute_default_value_list,
+                        op_invoke_class_name,
+                        op_invoke_map,
+                    )
 
-            # =================================== #
-            #         gen GetOpInfo func str      #
-            # =================================== #
-            # generate get op info funciton: inputs
-            input_info_list = []
-            for idx in range(len(op_input_name_list)):
-                input_info_list.append(
-                    CONSTRUCT_INPUT_INFO_TEMPLATE.format(
-                        name=op_input_name_list[idx],
-                        typename=op_input_type_list[idx],
-                        optional=op_input_optional_list[idx],
-                        no_need_buffer=op_input_no_need_buffer_list[idx],
-                        is_mutable_attribute='false',
-                        with_grad_semantic=input_grad_semantics[idx],
+                # gen op_declare_str/op_defined_str
+                if len(op_non_mutable_attribute_name_list) == 0:
+                    op_declare_str = OP_DECLARE_TEMPLATE.format(
+                        op_name=op_class_name,
+                        dialect_op_name=op_dialect_name,
+                        interfaces=op_interfaces_str,
+                        traits=op_traits_str,
+                        attribute_declare=op_0_attribute_declare_str,
+                        attribute_num=0,
+                        build_args=build_args_with_muta_attr_not_input_for_declare,
+                        build_mutable_attr_is_input=build_mutable_attr_is_input,
+                        build_attr_num_over_1=build_attr_num_over_1,
+                        build_mutable_attr_is_input_attr_num_over_1=build_mutable_attr_is_input_attr_num_over_1,
+                        get_inputs_and_outputs=op_get_inputs_outputs_str,
+                        exclusive_interface=exclusive_interface_str,
                     )
-                )
-            for idx in range(len(op_mutable_attribute_name_list)):
-                input_info_list.append(
-                    CONSTRUCT_INPUT_INFO_TEMPLATE.format(
-                        name=op_mutable_attribute_name_list[idx],
-                        typename=op_mutable_attribute_type_list[idx][0],
-                        optional='false',
-                        no_need_buffer='false',
-                        is_mutable_attribute='true',
-                        with_grad_semantic=mutable_attribute_grad_semantics[
-                            idx
-                        ],
+                    op_defined_str = ""
+                else:
+                    op_declare_str = OP_DECLARE_TEMPLATE.format(
+                        op_name=op_class_name,
+                        dialect_op_name=op_dialect_name,
+                        interfaces=op_interfaces_str,
+                        traits=op_traits_str,
+                        attribute_declare=op_n_attribute_declare_str.format(
+                            attribute_num=len(
+                                op_non_mutable_attribute_name_list
+                            )
+                        ),
+                        attribute_num=len(op_non_mutable_attribute_name_list),
+                        build_args=build_args_with_muta_attr_not_input_for_declare,
+                        build_mutable_attr_is_input=build_mutable_attr_is_input,
+                        build_attr_num_over_1=build_attr_num_over_1,
+                        build_mutable_attr_is_input_attr_num_over_1=build_mutable_attr_is_input_attr_num_over_1,
+                        get_inputs_and_outputs=op_get_inputs_outputs_str,
+                        exclusive_interface=exclusive_interface_str,
                     )
-                )
-            if len(input_info_list) > 0:
-                inputs_info_str = ", ".join(input_info_list)
-            else:
-                inputs_info_str = ""
-            # generate get op info funciton: outputs
-            outputs_info_str = ""
-            if len(op_output_name_list) > 0:
-                output_info_list = []
-                for idx in range(len(op_output_name_list)):
-                    output_info_list.append(
-                        CONSTRUCT_OUTPUT_INFO_TEMPLATE.format(
-                            name=op_output_name_list[idx],
-                            typename=op_output_type_list[idx],
-                            optional=op_output_optional_list[idx],
-                            intermediate=op_output_intermediate_list[idx],
+                    attribute_names_str = (
+                        '"'
+                        + '", "'.join(op_non_mutable_attribute_name_list)
+                        + '"'
+                    )
+                    op_defined_str = OP_N_ATTRIBUTE_DEFINED_TEMPLATE.format(
+                        op_name=op_class_name,
+                        attribute_num=len(op_non_mutable_attribute_name_list),
+                        attribute_names=attribute_names_str,
+                    )
+
+                # =================================== #
+                #         gen GetOpInfo func str      #
+                # =================================== #
+                # generate get op info funciton: inputs
+                input_info_list = []
+                for idx in range(len(op_input_name_list)):
+                    input_info_list.append(
+                        CONSTRUCT_INPUT_INFO_TEMPLATE.format(
+                            name=op_input_name_list[idx],
+                            typename=op_input_type_list[idx],
+                            optional=op_input_optional_list[idx],
+                            no_need_buffer=op_input_no_need_buffer_list[idx],
+                            is_mutable_attribute='false',
+                            with_grad_semantic=input_grad_semantics[idx],
                         )
                     )
-                outputs_info_str = ", ".join(output_info_list)
-            # generate get op info funciton: attributes
-            attribute_info_str = ""
-            if len(op_non_mutable_attribute_name_list) > 0:
-                attribute_info_list = []
-                for idx in range(len(op_non_mutable_attribute_name_list)):
-                    attribute_info_list.append(
-                        CONSTRUCT_ATTRIBUTE_INFO_TEMPLATE.format(
-                            name=op_non_mutable_attribute_name_list[idx],
-                            typename=op_non_mutable_attribute_type_list[idx],
-                            data_type=op_non_mutable_attribute_data_type_list[
+                for idx in range(len(op_mutable_attribute_name_list)):
+                    input_info_list.append(
+                        CONSTRUCT_INPUT_INFO_TEMPLATE.format(
+                            name=op_mutable_attribute_name_list[idx],
+                            typename=op_mutable_attribute_type_list[idx][0],
+                            optional='false',
+                            no_need_buffer='false',
+                            is_mutable_attribute='true',
+                            with_grad_semantic=mutable_attribute_grad_semantics[
                                 idx
                             ],
                         )
                     )
-                attribute_info_str = ", ".join(attribute_info_list)
-            # generate runtiem info
-            infer_meta_func_str = ""
-            infer_meta_param_str = ""
-            if op_infer_meta_map is not None:
-                infer_meta_func_str = op_infer_meta_map['func']
-                infer_meta_param_str = '", "'.join(op_infer_meta_map['param'])
-
-            kernel_func_str = ""
-            kernel_param_str = ""
-            kernel_key_dtype = ""
-            kernel_key_backend = ""
-            if op_kernel_map is not None:
-                kernel_func_str = '", "'.join(op_kernel_map['func'])
-                kernel_param_str = '", "'.join(op_kernel_map['param'])
-                if 'data_type' in op_kernel_map and op_kernel_map['data_type']:
-                    kernel_key_dtype = '", "'.join(
-                        op_kernel_map['data_type']['candidates']
-                    )
-                    if kernel_key_dtype != "":
-                        kernel_key_dtype = '"' + kernel_key_dtype + '"'
-                if 'backend' in op_kernel_map and op_kernel_map['backend']:
-                    kernel_key_backend = '", "'.join(
-                        op_kernel_map['backend']['candidates']
+                if len(input_info_list) > 0:
+                    inputs_info_str = ", ".join(input_info_list)
+                else:
+                    inputs_info_str = ""
+                # generate get op info funciton: outputs
+                outputs_info_str = ""
+                if len(op_output_name_list) > 0:
+                    output_info_list = []
+                    for idx in range(len(op_output_name_list)):
+                        output_info_list.append(
+                            CONSTRUCT_OUTPUT_INFO_TEMPLATE.format(
+                                name=op_output_name_list[idx],
+                                typename=op_output_type_list[idx],
+                                optional=op_output_optional_list[idx],
+                                intermediate=op_output_intermediate_list[idx],
+                            )
+                        )
+                    outputs_info_str = ", ".join(output_info_list)
+                # generate get op info funciton: attributes
+                attribute_info_str = ""
+                if len(op_non_mutable_attribute_name_list) > 0:
+                    attribute_info_list = []
+                    for idx in range(len(op_non_mutable_attribute_name_list)):
+                        attribute_info_list.append(
+                            CONSTRUCT_ATTRIBUTE_INFO_TEMPLATE.format(
+                                name=op_non_mutable_attribute_name_list[idx],
+                                typename=op_non_mutable_attribute_type_list[
+                                    idx
+                                ],
+                                data_type=op_non_mutable_attribute_data_type_list[
+                                    idx
+                                ],
+                            )
+                        )
+                    attribute_info_str = ", ".join(attribute_info_list)
+                # generate runtiem info
+                infer_meta_func_str = ""
+                infer_meta_param_str = ""
+                if op_infer_meta_map is not None:
+                    infer_meta_func_str = op_infer_meta_map['func']
+                    infer_meta_param_str = '", "'.join(
+                        op_infer_meta_map['param']
                     )
-                    if kernel_key_backend != "":
-                        kernel_key_backend = '"' + kernel_key_backend + '"'
-
-            inplace_str = ""
-            view_str = ""
-            if op_name[-1] == "_":
-                if op_inplace_map is not None:
-                    for key, value in op_inplace_map.items():
-                        inplace_str += '{"' + key + '", "' + value + '"},'
-                    inplace_str = inplace_str[:-1]
-                if op_view_map is not None:
-                    for key, value in op_view_map.items():
-                        view_str += '{"' + key + '", "' + value + '"},'
-                    view_str = view_str[:-1]
-
-            op_info_func_str = OP_INFO_TEMPLATE.format(
-                op_name=op_class_name,
-                inputs=inputs_info_str,
-                attributes=attribute_info_str,
-                outputs=outputs_info_str,
-                infer_meta_func=infer_meta_func_str,
-                infer_meta_param=infer_meta_param_str,
-                kernel_func=kernel_func_str,
-                kernel_param=kernel_param_str,
-                kernel_key_dtype=kernel_key_dtype,
-                kernel_key_backend=kernel_key_backend,
-                inplace=inplace_str,
-                view=view_str,
-                origin_op_name=op_info.op_yaml_item['name'],
-            )
 
-            # generate op verify function str
-            op_verify_str = ''
-            if not op_info.custom_verify:
-                op_verify_str = gen_verify_func_str(
-                    op_class_name,
-                    op_input_type_list,
-                    op_input_optional_list,
-                    op_mutable_attribute_name_list,
-                    op_mutable_attribute_type_list,
-                    op_non_mutable_attribute_name_list,
-                    op_non_mutable_attribute_type_list,
-                    op_output_type_list,
-                    op_output_optional_list,
+                kernel_func_str = ""
+                kernel_param_str = ""
+                kernel_key_dtype = ""
+                kernel_key_backend = ""
+                if op_kernel_map is not None:
+                    kernel_func_str = kernel_func_name
+                    kernel_param_str = '", "'.join(op_kernel_map['param'])
+                    if (
+                        'data_type' in op_kernel_map
+                        and op_kernel_map['data_type']
+                    ):
+                        kernel_key_dtype = '", "'.join(
+                            op_kernel_map['data_type']['candidates']
+                        )
+                        if kernel_key_dtype != "":
+                            kernel_key_dtype = '"' + kernel_key_dtype + '"'
+                    if 'backend' in op_kernel_map and op_kernel_map['backend']:
+                        kernel_key_backend = '", "'.join(
+                            op_kernel_map['backend']['candidates']
+                        )
+                        if kernel_key_backend != "":
+                            kernel_key_backend = '"' + kernel_key_backend + '"'
+
+                inplace_str = ""
+                view_str = ""
+                if op_name[-1] == "_":
+                    if op_inplace_map is not None:
+                        for key, value in op_inplace_map.items():
+                            inplace_str += '{"' + key + '", "' + value + '"},'
+                        inplace_str = inplace_str[:-1]
+                    if op_view_map is not None:
+                        for key, value in op_view_map.items():
+                            view_str += '{"' + key + '", "' + value + '"},'
+                        view_str = view_str[:-1]
+
+                op_info_func_str = OP_INFO_TEMPLATE.format(
+                    op_name=op_class_name,
+                    inputs=inputs_info_str,
+                    attributes=attribute_info_str,
+                    outputs=outputs_info_str,
+                    infer_meta_func=infer_meta_func_str,
+                    infer_meta_param=infer_meta_param_str,
+                    kernel_func=kernel_func_str,
+                    kernel_param=kernel_param_str,
+                    kernel_key_dtype=kernel_key_dtype,
+                    kernel_key_backend=kernel_key_backend,
+                    inplace=inplace_str,
+                    view=view_str,
+                    origin_op_name=op_info.op_yaml_item['name'],
                 )
 
-            op_infer_meta_str = gen_op_infer_meta_str(
-                op_info, op_class_name, op_info_items
-            )
+                # generate op verify function str
+                op_verify_str = ''
+                if not op_info.custom_verify:
+                    op_verify_str = gen_verify_func_str(
+                        op_class_name,
+                        op_input_type_list,
+                        op_input_optional_list,
+                        op_mutable_attribute_name_list,
+                        op_mutable_attribute_type_list,
+                        op_non_mutable_attribute_name_list,
+                        op_non_mutable_attribute_type_list,
+                        op_output_type_list,
+                        op_output_optional_list,
+                    )
 
-            # =================================== #
-            #         gen Vjp func str      #
-            # =================================== #
+                op_infer_meta_str = gen_op_infer_meta_str(
+                    op_info, op_class_name, op_info_items
+                )
 
-            # generate op vjp function str
+                # =================================== #
+                #         gen Vjp func str      #
+                # =================================== #
 
-            op_vjp_str = ''
-            if dialect_name == "cinn":
-                logging.warning("cinn is currently not support Vjp function")
-            else:
-                if (
-                    op_info.backward_name
-                    and op_info.op_phi_name[0]
-                    in vjp_interface_implementation_gen_op_list
-                ):
-                    op_vjp_str = gen_op_vjp_str(
-                        op_class_name,
-                        op_info.backward_name,
-                        op_name,
-                        op_info_items[op_info.op_phi_name[0]],
-                        op_info_items[op_info.backward_name],
+                # generate op vjp function str
+                op_vjp_str = ''
+                if dialect_name == "cinn":
+                    logging.warning(
+                        "cinn is currently not support Vjp function"
                     )
+                else:
+                    if (
+                        op_info.backward_name
+                        and op_info.op_phi_name[0]
+                        in vjp_interface_implementation_gen_op_list
+                    ):
+                        op_vjp_str = gen_op_vjp_str(
+                            op_class_name,
+                            op_info.backward_name,
+                            op_name,
+                            op_info_items[op_info.op_phi_name[0]],
+                            op_info_items[op_info.backward_name],
+                        )
+
+                    ops_name_list.append(op_class_name)
+                    ops_declare_list.append(op_declare_str)
+                    ops_defined_list.append(op_defined_str)
+                    ops_defined_list.append(op_info_func_str)
+                    ops_defined_list.append(build_func_with_muta_attr_not_input)
+                    ops_defined_list.append(build_func_with_attr_is_map)
+                    if len(op_mutable_attribute_name_list) > 0:
+                        ops_defined_list.append(
+                            build_func_with_muta_attr_is_input
+                        )
+                        ops_defined_list.append(
+                            build_func_with_muta_attr_is_input_with_attr_is_map
+                        )
 
-            ops_name_list.append(op_class_name)
-            ops_declare_list.append(op_declare_str)
-            ops_defined_list.append(op_defined_str)
-            ops_defined_list.append(op_info_func_str)
-            ops_defined_list.append(build_func_with_muta_attr_not_input)
-            ops_defined_list.append(build_func_with_attr_is_map)
-            if len(op_mutable_attribute_name_list) > 0:
-                ops_defined_list.append(build_func_with_muta_attr_is_input)
-                ops_defined_list.append(
-                    build_func_with_muta_attr_is_input_with_attr_is_map
+                    ops_defined_list.append(op_verify_str)
+                    ops_defined_list.append(op_infer_meta_str)
+                    # NOTE(chenxi67)skip if dialect_name==cinn
+                    if dialect_name == "cinn":
+                        pass
+                    else:
+                        ops_vjp_defined_list.append(op_vjp_str)
+
+            if op_kernel_map is not None and len(op_kernel_map['func']) > 1:
+                OP_TO_MULTI_KERNELS_MAP_ITEM = (
+                    """{{"{op_name}", {{{sig_list}}}}}"""
                 )
-            ops_defined_list.append(op_verify_str)
-            ops_defined_list.append(op_infer_meta_str)
-            # NOTE(chenxi67)skip if dialect_name==cinn
-            if dialect_name == "cinn":
-                pass
-            else:
-                ops_vjp_defined_list.append(op_vjp_str)
+                OP_TO_MULTI_KERNELS_MAP_ITEM_SIG = """paddle::dialect::PdOpSig("{kernel_name}", {{{inputs}}}, {{{outputs}}})"""
+                op_to_multi_kernels_sig_list = []
+                for kernel_func_name in op_kernel_map['func']:
+                    inputs = op_kernel_map['dispatch'][kernel_func_name][0]
+                    outputs = op_kernel_map['dispatch'][kernel_func_name][1]
+                    inputs = '"' + '", "'.join(inputs) + '"'
+                    outputs = '"' + '", "'.join(outputs) + '"'
+                    if op_name[-1] == "_":
+                        kernel_func_name = kernel_func_name + "_"
+
+                    op_to_multi_kernels_sig_list.append(
+                        OP_TO_MULTI_KERNELS_MAP_ITEM_SIG.format(
+                            kernel_name=kernel_func_name,
+                            inputs=inputs,
+                            outputs=outputs,
+                        )
+                    )
+                op_to_multi_kernels_str = OP_TO_MULTI_KERNELS_MAP_ITEM.format(
+                    op_name=op_name,
+                    sig_list=", ".join(op_to_multi_kernels_sig_list),
+                )
+                op_to_multi_kernels_list.append(op_to_multi_kernels_str)
 
     # (4) Generate head file str
     op_namespaces_prev = ""
@@ -1380,12 +1519,22 @@ def OpGenerator(
 
     head_file_str = ""
     head_file_str += "".join(ops_declare_list)  # Add op class
+    if dialect_name == "pd_op":
+        op_to_multi_kernels_map = OP_TO_MULTI_KERNELS_MAP_H
+        for name in reversed(namespaces):
+            op_to_multi_kernels_map = NAMESPACE_GARD_TEMPLATE.format(
+                namespace=name, input=op_to_multi_kernels_map
+            )  # Add namespaces
+    else:
+        op_to_multi_kernels_map = ""
+
     for name in reversed(namespaces):
         head_file_str = NAMESPACE_GARD_TEMPLATE.format(
             namespace=name, input=head_file_str
         )  # Add namespaces
     head_file_str = H_FILE_TEMPLATE.format(
         op_declare=op_list_str,
+        op_to_multi_kernels_map=op_to_multi_kernels_map,
         input=head_file_str,
         declare_type_id=declare_type_id_str,
     )  # Add head
@@ -1401,7 +1550,19 @@ def OpGenerator(
     for op in ops_name_with_namespace_list:
         define_type_id_str += DEFINE_OP_TYPE_ID.format(op_name=op)
 
+    if dialect_name == "pd_op":
+        op_to_multi_kernels_map_str = OP_TO_MULTI_KERNELS_MAPS.format(
+            maps=", \r".join(op_to_multi_kernels_list)
+        )
+        for name in reversed(namespaces):
+            op_to_multi_kernels_map_str = NAMESPACE_GARD_TEMPLATE.format(
+                namespace=name, input=op_to_multi_kernels_map_str
+            )  # Add namespaces
+    else:
+        op_to_multi_kernels_map_str = ""
+
     source_file_str = CC_FILE_TEMPLATE.format(
+        op_to_multi_kernels_map=op_to_multi_kernels_map_str,
         h_file=op_def_h_file[:-4],
         input=source_file_str,
         define_type_id=define_type_id_str,
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
index 553a7fc1732b7..bb9c8474a373f 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
@@ -44,7 +44,7 @@ OpInfoTuple AddNOp::GetOpInfo() {
   std::vector<paddle::dialect::OpOutputInfo> outputs = {
       OpOutputInfo("out", "paddle::dialect::DenseTensorType", false, false)};
   paddle::dialect::OpRunTimeInfo run_time_info = OpRunTimeInfo(
-      "AddNInferMeta", {"inputs"}, {"add_n"}, {"inputs"}, {}, {}, {}, {});
+      "AddNInferMeta", {"inputs"}, "add_n", {"inputs"}, {}, {}, {}, {});
 
   return std::make_tuple(inputs, attributes, outputs, run_time_info, "add_n");
 }
@@ -173,7 +173,7 @@ OpInfoTuple AddN_Op::GetOpInfo() {
       paddle::dialect::OpOutputInfo(
           "out", "paddle::dialect::DenseTensorType", false, false)};
   paddle::dialect::OpRunTimeInfo run_time_info = paddle::dialect::OpRunTimeInfo(
-      "AddNInferMeta", {"inputs"}, {"add_n"}, {"inputs"}, {}, {}, {}, {});
+      "AddNInferMeta", {"inputs"}, "add_n", {"inputs"}, {}, {}, {}, {});
   return std::make_tuple(inputs, attributes, outputs, run_time_info, "add_n_");
 }
 
@@ -301,7 +301,7 @@ OpInfoTuple AddNWithKernelOp::GetOpInfo() {
       paddle::dialect::OpOutputInfo(
           "out", "paddle::dialect::DenseTensorType", false, false)};
   paddle::dialect::OpRunTimeInfo run_time_info = paddle::dialect::OpRunTimeInfo(
-      "AddNInferMeta", {"inputs"}, {"add_n"}, {"inputs"}, {}, {}, {}, {});
+      "AddNInferMeta", {"inputs"}, "add_n", {"inputs"}, {}, {}, {}, {});
   return std::make_tuple(
       inputs, attributes, outputs, run_time_info, "add_n_with_kernel");
 }
@@ -444,7 +444,7 @@ OpInfoTuple FusedGemmEpilogueOp::GetOpInfo() {
   paddle::dialect::OpRunTimeInfo run_time_info(
       "FusedGemmEpilogueInferMeta",
       {"x", "y", "bias", "trans_x", "trans_y", "activation"},
-      {""},
+      "",
       {""},
       {""},
       {},
@@ -698,7 +698,7 @@ OpInfoTuple FusedGemmEpilogueGradOp::GetOpInfo() {
                                                 "trans_x",
                                                 "trans_y",
                                                 "activation_grad"},
-                                               {""},
+                                               "",
                                                {""},
                                                {""},
                                                {},
@@ -900,7 +900,7 @@ OpInfoTuple SplitGradOp::GetOpInfo() {
   paddle::dialect::OpRunTimeInfo run_time_info =
       OpRunTimeInfo("ConcatInferMeta",
                     {"out_grad", "axis"},
-                    {"concat"},
+                    "concat",
                     {"out_grad", "axis"},
                     {"out_grad"},
                     {},
diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
index d3cbc31c2e490..89f8ff91d1e6a 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
@@ -65,6 +65,12 @@
     func : fetch
     param : [x]
 
+- op : get_tensor_from_selected_rows
+  args : (Tensor x)
+  output : Tensor(out)
+  kernel:
+    func: get_tensor_from_selected_rows {selected_rows -> dense}
+
 - op : load_combine
   args : (str file_path, bool load_as_fp16, bool model_from_memory)
   output : Tensor[](Out)
diff --git a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h
index 462e88f4da327..637de470675eb 100644
--- a/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h
+++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h
@@ -87,7 +87,7 @@ struct OpAttributeInfo {
 struct OpRunTimeInfo {
   std::string infer_meta_func;
   std::vector<std::string> infer_meta_param;
-  std::vector<std::string> kernel_func;
+  std::string kernel_func;
   std::vector<std::string> kernel_param;
   std::vector<std::string> kernel_key_dtype;
   std::vector<std::string> kernel_key_backend;
@@ -95,7 +95,7 @@ struct OpRunTimeInfo {
   std::vector<std::pair<std::string, std::string>> view;
   OpRunTimeInfo(const std::string& infer_meta_func,
                 const std::vector<std::string>& infer_meta_param,
-                const std::vector<std::string>& kernel_func,
+                const std::string& kernel_func,
                 const std::vector<std::string>& kernel_param,
                 const std::vector<std::string>& dtype,
                 const std::vector<std::string>& backend,
diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
index 79e6bbe71230e..1088b2bedbaaf 100644
--- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
@@ -357,9 +357,15 @@ phi::DataType GetKernelDataTypeByYamlInfo(
                 vec_data[0]
                     .dyn_cast<paddle::dialect::AllocatedDenseTensorType>()
                     .dtype());
+          } else if (vec_data[0]
+                         .isa<paddle::dialect::AllocatedSelectedRowsType>()) {
+            kernel_data_type = TransToPhiDataType(
+                vec_data[0]
+                    .dyn_cast<paddle::dialect::AllocatedSelectedRowsType>()
+                    .dtype());
           } else {
             PADDLE_THROW(phi::errors::Unimplemented(
-                "Only support DenseTensorType in vector"));
+                "Only support DenseTensorType and SelectedRowsType in vector"));
           }
         }
       } else if (type.isa<paddle::dialect::AllocatedSelectedRowsType>()) {
@@ -880,12 +886,14 @@ std::vector<pir::Type> BuildOpOutputType(pir::Operation* op_item,
       auto base_types = result_type.dyn_cast<pir::VectorType>().data();
       for (auto& base_type : base_types) {
         if (base_type) {
-          if (base_type.isa<dialect::DenseTensorType>()) {
+          if (base_type.isa<dialect::DenseTensorType>() ||
+              base_type.isa<dialect::SelectedRowsType>()) {
             vec_inner_types.push_back(
                 BuildOutputType(base_type, out_place, out_phi_dtype, ctx));
           } else {
             PADDLE_THROW(phi::errors::Unimplemented(
-                "only support dense tensor in vector type for now"));
+                "only support dense tensor and selected rows in vector type "
+                "for now"));
           }
         } else {
           // NOTE(phlrain), kernel not support a nullptr in output
@@ -1075,9 +1083,43 @@ std::vector<pir::Value> BuildOpInputList(
             block->push_back(operation);
           }
         }
-
       } else if (new_in_type.isa<dialect::AllocatedSelectedRowsType>()) {
-        // do nothing here
+        // allocated type
+        auto in_place =
+            new_in_type.dyn_cast<dialect::AllocatedSelectedRowsType>().place();
+
+        // get input args def type
+        auto args_def = kernel.args_def();
+        auto input_defs = args_def.input_defs();
+
+        auto dst_backend = GetDstBackend(op_item->name(),
+                                         place,
+                                         op_info_parser,
+                                         kernel.InputAt(i).backend,
+                                         i);
+
+        bool need_trans =
+            (in_place.GetType() != phi::AllocationType::UNDEFINED) &&
+            (paddle::experimental::NeedTransformPlace(
+                in_place, dst_backend, {}));
+        if (need_trans) {
+          VLOG(6) << "need trans from " << in_place << " to "
+                  << kernel_key.backend();
+          // build memcopy op
+          auto out_place = phi::TransToPhiPlace(dst_backend);
+          auto new_in_alloc_type =
+              new_in_type.dyn_cast<dialect::AllocatedSelectedRowsType>();
+          auto out_type = dialect::AllocatedSelectedRowsType::get(
+              ctx,
+              out_place,
+              new_in_alloc_type.dtype(),
+              new_in_alloc_type.dims(),
+              new_in_alloc_type.data_layout(),
+              new_in_alloc_type.lod(),
+              new_in_alloc_type.offset());
+          new_in = AddPlaceTransferOp(
+              new_in, out_type, in_place, out_place, kernel_key, block);
+        }
       } else {
         PADDLE_THROW(phi::errors::Unimplemented(
             "only support allocated dense tensor type for now"));
@@ -1157,7 +1199,7 @@ std::string GetKernelFnStr(const OpYamlInfoParser* op_info_parser,
                            pir::Operation* op_item) {
   std::string kernel_fn_str;
   if (op_info_parser != nullptr) {
-    kernel_fn_str = op_info_parser->OpRuntimeInfo().kernel_func[0];
+    kernel_fn_str = op_info_parser->OpRuntimeInfo().kernel_func;
   }
 
   if (op_item->isa<paddle::dialect::AddN_Op>() ||
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 63093631e4347..1187c35ee72e2 100755
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -3158,6 +3158,12 @@
   outputs:
     {out: Out}
 
+- op: get_tensor_from_selected_rows
+  inputs :
+    x : X
+  outputs :
+    out : Out
+
 - op: logspace
   inputs:
     {start: Start, stop: Stop, num: Num, base: Base}
@@ -3259,3 +3265,9 @@
     x : X
   outputs :
     out : Out
+
+- op: write_to_array
+  inputs :
+    {x: X, i: I}
+  outputs :
+    out : Out
diff --git a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc
index d52299ec3e6ac..adfe431a6be2b 100644
--- a/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc
+++ b/test/cpp/pir/pattern_rewrite/pattern_rewrite_test.cc
@@ -493,7 +493,7 @@ OpInfoTuple Conv2dFusionOpTest::GetOpInfo() {
                      "exhaustive_search",
                      "channels",
                      "user_workspace_size"},
-                    {"ConvFusionKernel"},
+                    "ConvFusionKernel",
                     {"input",
                      "filter",
                      "bias",
diff --git a/test/dygraph_to_static/test_new_ir_selectedrows.py b/test/dygraph_to_static/test_new_ir_selectedrows.py
new file mode 100644
index 0000000000000..13563d73b1753
--- /dev/null
+++ b/test/dygraph_to_static/test_new_ir_selectedrows.py
@@ -0,0 +1,107 @@
+#   Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import unittest
+
+from dygraph_to_static_util import test_and_compare_with_new_ir
+
+import paddle
+from paddle.jit.api import to_static
+
+SEED = 102
+random.seed(SEED)
+
+
+class IRSelectedRowsTestNet(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+        self.embedding = paddle.nn.Embedding(4, 3, sparse=False)
+
+        w0 = paddle.to_tensor(
+            [
+                [0.0, 0.0, 0.0],
+                [1.0, 1.0, 1.0],
+                [2.0, 2.0, 2.0],
+                [3.0, 3.0, 3.0],
+            ],
+            dtype="float32",
+        )
+        self.embedding.weight.set_value(w0)
+
+        self.linear = paddle.nn.Linear(
+            in_features=3,
+            out_features=3,
+            weight_attr=paddle.ParamAttr(need_clip=True),
+            bias_attr=paddle.ParamAttr(need_clip=False),
+        )
+
+    def forward(self, x):
+        x = self.embedding(x)
+        x = self.linear(x)
+        return x
+
+
+@to_static
+def train(net, adam, x):
+    loss_data = []
+    for i in range(10):
+        out = net(x)
+        loss = paddle.mean(out)
+        loss.backward()
+        adam.step()
+        adam.clear_grad()
+        loss_data.append(loss.numpy())
+    return loss_data
+
+
+def train_dygraph():
+    paddle.seed(100)
+    net = IRSelectedRowsTestNet()
+    x = paddle.to_tensor([[0], [1], [3]], dtype="int64", stop_gradient=False)
+    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
+    adam = paddle.optimizer.Adam(
+        parameters=net.parameters(), learning_rate=0.01, grad_clip=clip
+    )
+
+    paddle.jit.enable_to_static(False)
+    return train(net, adam, x)
+
+
+@test_and_compare_with_new_ir(True)
+def train_static():
+    paddle.seed(100)
+    net = IRSelectedRowsTestNet()
+    x = paddle.to_tensor([[0], [1], [3]], dtype="int64", stop_gradient=False)
+    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
+    adam = paddle.optimizer.Adam(
+        parameters=net.parameters(), learning_rate=0.01, grad_clip=clip
+    )
+
+    paddle.jit.enable_to_static(True)
+    return train(net, adam, x)
+
+
+class TestSimnet(unittest.TestCase):
+    def test_dygraph_static_same_loss(self):
+        dygraph_loss = train_dygraph()
+        static_loss = train_static()
+
+        self.assertEqual(len(dygraph_loss), len(static_loss))
+        for i in range(len(dygraph_loss)):
+            self.assertAlmostEqual(dygraph_loss[i], static_loss[i].numpy())
+
+
+if __name__ == '__main__':
+    unittest.main()

From e9c2a9e2d3ba6273722e542044c66456c787eebe Mon Sep 17 00:00:00 2001
From: Chen Zhiyang <1792266893@qq.com>
Date: Fri, 22 Sep 2023 11:30:19 +0800
Subject: [PATCH 061/115] [PIR] Open PIR op_test for random create ops (#57556)

* first commit with some problems

* fix bug

* fix ci bug

* fix skip bf16 bug

* add gumbel_softmax
---
 .../op_generator/vjp_interface_gen_op_list.py |  4 +++
 paddle/fluid/primitive/codegen/gen.py         |  4 +++
 python/paddle/nn/functional/activation.py     |  2 +-
 python/paddle/tensor/creation.py              |  2 +-
 python/paddle/tensor/random.py                | 26 +++++++++++-------
 python/paddle/utils/layers_utils.py           |  2 --
 test/legacy_test/op_test.py                   | 13 ++++++---
 test/legacy_test/test_empty_op.py             | 11 +++++---
 test/legacy_test/test_exponential_op.py       |  4 +--
 test/legacy_test/test_gaussian_random_op.py   | 27 +++++++++++++------
 test/legacy_test/test_gumbel_softmax_op.py    | 12 ++++-----
 test/legacy_test/test_multinomial_op.py       |  8 ++++--
 test/legacy_test/test_poisson_op.py           |  7 +++--
 test/legacy_test/test_randint_op.py           |  9 ++++---
 test/legacy_test/test_randperm_op.py          |  2 +-
 15 files changed, 89 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
index 9998a40ec2c87..b454d3b961035 100644
--- a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
+++ b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
@@ -55,6 +55,8 @@
     'slice',
     'transpose',
     'slice_double',
+    'poisson',
+    'gumbel_softmax',
 ]
 vjp_interface_implementation_gen_op_list = [
     "tanh",
@@ -88,4 +90,6 @@
     'slice',
     'transpose',
     'slice_double',
+    'poisson',
+    'gumbel_softmax',
 ]
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index e0eeeb10a3a4d..200b6a05b493f 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -69,6 +69,8 @@
     'layer_norm_grad',
     'embedding_grad',
     'scale_grad',
+    'poisson_grad',
+    'gumbel_softmax_grad',
 ]
 
 
@@ -161,6 +163,8 @@
     'embedding_grad',
     'sqrt',
     'uniform',
+    'poisson_grad',
+    'gumbel_softmax_grad',
     'split',
     'transpose',
 ]
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 58364cc6d78a5..c9bc9cb0ebebc 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -1818,7 +1818,7 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None):
              [0.00000000, 1.        , 0.00000000, 0.00000000, 0.00000000, 0.00000000]])
 
     """
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.gumbel_softmax(x, temperature, hard, axis)
 
     helper = LayerHelper("gumbel_softmax", **locals())
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 3f543ea29d003..e71b7ff65a63a 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -1959,7 +1959,7 @@ def empty(shape, dtype=None, name=None):
 
     dtype = convert_dtype(dtype)
 
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         shape = paddle.utils.convert_shape_to_list(shape)
         out = _C_ops.empty(
             shape, convert_np_dtype_to_dtype_(dtype), _current_expected_place()
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 46ee4ff6920b9..9333458844091 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -18,7 +18,11 @@
 from paddle import _C_ops, _legacy_C_ops
 from paddle.base.framework import _current_expected_place
 from paddle.common_ops_import import Variable
-from paddle.framework import in_dynamic_mode, in_dynamic_or_pir_mode
+from paddle.framework import (
+    in_dynamic_mode,
+    in_dynamic_or_pir_mode,
+    in_pir_mode,
+)
 
 from ..base.data_feeder import (
     check_dtype,
@@ -131,7 +135,7 @@ def poisson(x, name=None):
              [5., 1., 3.]])
             >>> # doctest: -SKIP
     """
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.poisson(x)
     else:
         check_variable_and_dtype(x, "x", ["float32", "float64"], "poisson")
@@ -201,7 +205,7 @@ def multinomial(x, num_samples=1, replacement=False, name=None):
 
     """
 
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.multinomial(x, num_samples, replacement)
     else:
         check_variable_and_dtype(
@@ -360,7 +364,7 @@ def gaussian(shape, mean=0.0, std=1.0, seed=0, dtype=None, name=None):
     if not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
 
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         shape = paddle.utils.convert_shape_to_list(shape)
         place = _current_expected_place()
         return _C_ops.gaussian(
@@ -618,7 +622,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None):
             [0.48646951, 0.00815189, 3.74022293])
             >>> # doctest: -SKIP
     """
-    if not in_dynamic_mode():
+    if not in_dynamic_or_pir_mode():
         check_type(mean, 'mean', (int, float, Variable), 'normal')
         check_type(std, 'std', (int, float, Variable), 'normal')
         if isinstance(mean, Variable):
@@ -656,7 +660,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None):
         return gaussian(shape=shape, mean=mean, std=std, name=name)
 
     out = out * std + mean
-    if not in_dynamic_mode():
+    if not in_dynamic_or_pir_mode():
         out.stop_grediant = True
     return out
 
@@ -961,10 +965,14 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None):
         low = 0
     if dtype is None:
         dtype = core.VarDesc.VarType.INT64
+        if in_pir_mode():
+            from paddle.base.libpaddle import DataType
+
+            dtype = DataType.INT64
     elif not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
 
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         shape = paddle.utils.convert_shape_to_list(shape)
         place = _current_expected_place()
         return _C_ops.randint(low, high, shape, dtype, place)
@@ -1241,7 +1249,7 @@ def randperm(n, dtype="int64", name=None):
     if not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
 
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.randperm(n, dtype, _current_expected_place())
     else:
         if n < 1:
@@ -1360,7 +1368,7 @@ def exponential_(x, lam=1.0, name=None):
             >>> # doctest: -SKIP
 
     """
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.exponential_(x, lam)
     else:
         check_variable_and_dtype(
diff --git a/python/paddle/utils/layers_utils.py b/python/paddle/utils/layers_utils.py
index e90273f3e82ad..c49cb7bc42239 100644
--- a/python/paddle/utils/layers_utils.py
+++ b/python/paddle/utils/layers_utils.py
@@ -457,8 +457,6 @@ def convert_shape_to_list(shape):
     else:
         if in_dygraph_mode():
             shape = shape.astype(int).tolist()
-        else:
-            shape = [shape]
     return shape
 
 
diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py
index 82a0a684fddfb..71e41b92e96dd 100644
--- a/test/legacy_test/op_test.py
+++ b/test/legacy_test/op_test.py
@@ -1232,7 +1232,6 @@ def get_kernel_signature(self, place, egr_inps=None, egr_oups=None):
                 for attrs_name in self.attrs:
                     if self.attrs[attrs_name] is not None:
                         attrs_outputs[attrs_name] = self.attrs[attrs_name]
-
             kernel_sig = OpTestUtils._get_kernel_signature(
                 self.op_type,
                 dygraph_tensor_inputs,
@@ -2700,13 +2699,21 @@ def check_output_customized(
                     outs_p = self._calc_new_ir_output(place)
                     outs_p = [outs_p[out] for out in outs_p]
                     outs_p.sort(key=len)
-                    checker(outs_p)
+                    checker(outs_p[0])
 
-    def check_output_with_place_customized(self, checker, place):
+    def check_output_with_place_customized(
+        self, checker, place, check_new_ir=False
+    ):
         outs = self.calc_output(place)
         outs = [np.array(out) for out in outs]
         outs.sort(key=len)
         checker(outs)
+        if check_new_ir:
+            with paddle.pir_utils.IrGuard():
+                outs_p = self._calc_new_ir_output(place)
+                outs_p = [outs_p[out] for out in outs_p]
+                outs_p.sort(key=len)
+                checker(outs_p[0])
 
     def _assert_is_close(
         self,
diff --git a/test/legacy_test/test_empty_op.py b/test/legacy_test/test_empty_op.py
index 6920ef42dc6c6..44e1f2fe30fb6 100644
--- a/test/legacy_test/test_empty_op.py
+++ b/test/legacy_test/test_empty_op.py
@@ -27,10 +27,11 @@
 class TestEmptyOp(OpTest):
     def setUp(self):
         self.op_type = "empty"
+        self.python_api = paddle.tensor.empty
         self.init_config()
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         data_type = outs[0].dtype
@@ -108,6 +109,7 @@ def init_config(self):
 class TestEmptyOp_ShapeTensor(OpTest):
     def setUp(self):
         self.op_type = "empty"
+        self.python_api = paddle.empty
         self.init_config()
 
     def init_config(self):
@@ -119,7 +121,7 @@ def init_config(self):
         self.outputs = {'Out': np.zeros(self.shape).astype(dtype)}
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         data_type = outs[0].dtype
@@ -149,6 +151,7 @@ def verify_output(self, outs):
 class TestEmptyOp_ShapeTensorList(OpTest):
     def setUp(self):
         self.op_type = "empty"
+        self.python_api = paddle.empty
         self.init_config()
 
     def init_config(self):
@@ -169,7 +172,7 @@ def init_config(self):
         self.outputs = {'Out': np.zeros(self.shape).astype(dtype)}
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         data_type = outs[0].dtype
@@ -309,7 +312,7 @@ def setUp(self):
         self.outputs = {'Out': convert_float_to_uint16(output)}
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         max_value = np.nanmax(outs[0])
diff --git a/test/legacy_test/test_exponential_op.py b/test/legacy_test/test_exponential_op.py
index a611b477f99ed..de92243084ffb 100644
--- a/test/legacy_test/test_exponential_op.py
+++ b/test/legacy_test/test_exponential_op.py
@@ -37,7 +37,7 @@ def config(self):
         self.dtype = "float64"
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         hist1, _ = np.histogram(outs[0], range=(0, 5))
@@ -360,7 +360,7 @@ def config(self):
         self.dtype = np.float16
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         hist1, _ = np.histogram(outs[0], range=(0, 5))
diff --git a/test/legacy_test/test_gaussian_random_op.py b/test/legacy_test/test_gaussian_random_op.py
index f3f13e8b0c2b8..8f03e0f547e8d 100644
--- a/test/legacy_test/test_gaussian_random_op.py
+++ b/test/legacy_test/test_gaussian_random_op.py
@@ -26,7 +26,7 @@
 class TestGaussianRandomOp(OpTest):
     def setUp(self):
         self.op_type = "gaussian_random"
-        self.python_api = paddle.normal
+        self.python_api = paddle.tensor.random.gaussian
         self.set_attrs()
         self.inputs = {}
         self.use_mkldnn = False
@@ -46,7 +46,7 @@ def set_attrs(self):
         self.std = 2.0
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         self.assertEqual(outs[0].shape, (123, 92))
@@ -66,7 +66,7 @@ def verify_output(self, outs):
 class TestGaussianRandomFP16Op(OpTest):
     def setUp(self):
         self.op_type = "gaussian_random"
-        self.python_api = paddle.normal
+        self.python_api = paddle.tensor.random.gaussian
         self.set_attrs()
         self.inputs = {}
         self.use_mkldnn = False
@@ -88,7 +88,7 @@ def set_attrs(self):
 
     def test_check_output(self):
         self.check_output_with_place_customized(
-            self.verify_output, place=core.CUDAPlace(0)
+            self.verify_output, place=core.CUDAPlace(0), check_new_ir=True
         )
 
     def verify_output(self, outs):
@@ -103,13 +103,23 @@ def verify_output(self, outs):
         np.testing.assert_allclose(hist, hist2, rtol=0, atol=0.015)
 
 
+def gaussian_wrapper(dtype_=np.uint16):
+    def gauss_wrapper(shape, mean, std, seed, dtype=np.uint16, name=None):
+        return paddle.tensor.random.gaussian(
+            shape, mean, std, seed, dtype, name
+        )
+
+    return gauss_wrapper
+
+
 @unittest.skipIf(
     not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
 )
 class TestGaussianRandomBF16Op(OpTest):
     def setUp(self):
         self.op_type = "gaussian_random"
-        self.python_api = paddle.normal
+        self.python_api = gaussian_wrapper(dtype_=np.uint16)
+        self.__class__.op_type = self.op_type
         self.set_attrs()
         self.inputs = {}
         self.use_mkldnn = False
@@ -131,7 +141,7 @@ def set_attrs(self):
 
     def test_check_output(self):
         self.check_output_with_place_customized(
-            self.verify_output, place=core.CUDAPlace(0)
+            self.verify_output, place=core.CUDAPlace(0), check_new_ir=True
         )
 
     def verify_output(self, outs):
@@ -158,6 +168,7 @@ class TestGaussianRandomOp_ShapeTensorList(TestGaussianRandomOp):
     def setUp(self):
         '''Test gaussian_random op with specified value'''
         self.op_type = "gaussian_random"
+        self.python_api = paddle.tensor.random.gaussian
         self.init_data()
         shape_tensor_list = []
         for index, ele in enumerate(self.shape):
@@ -185,7 +196,7 @@ def init_data(self):
         self.seed = 10
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
 
 class TestGaussianRandomOp2_ShapeTensorList(
@@ -231,7 +242,7 @@ def setUp(self):
         self.op_type = "gaussian_random"
         self.init_data()
         self.use_mkldnn = False
-
+        self.python_api = paddle.tensor.random.gaussian
         self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")}
         self.attrs = {
             'mean': self.mean,
diff --git a/test/legacy_test/test_gumbel_softmax_op.py b/test/legacy_test/test_gumbel_softmax_op.py
index 6332dd695e551..e3fbf15a299d8 100644
--- a/test/legacy_test/test_gumbel_softmax_op.py
+++ b/test/legacy_test/test_gumbel_softmax_op.py
@@ -46,10 +46,10 @@ def setUp(self):
         self.outputs = {'Out': out}
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        self.check_grad(["X"], "Out", check_new_ir=True)
 
 
 class TestGumbelSoftmax_ZeroDim(OpTest):
@@ -68,10 +68,10 @@ def setUp(self):
         self.attrs = {"hard": True, "axis": -1}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        self.check_grad(["X"], "Out", check_new_ir=True)
 
 
 class TestGumbelSoftmaxOp2(TestGumbelSoftmaxOp):
@@ -176,7 +176,7 @@ def setUp(self):
         self.outputs = {'Out': out}
 
     def test_check_output(self):
-        self.check_output_customized(self.accumulate_output)
+        self.check_output_customized(self.accumulate_output, check_new_ir=True)
         # Experiment should result in batch num .
         self.assertEqual(self.counts.sum(), self.shape[0])
 
@@ -192,7 +192,7 @@ def test_check_output(self):
         self.assertLess(np.max(np.abs(z)).item(), 2.58)
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        self.check_grad(["X"], "Out", check_new_ir=True)
 
 
 class TestGumbelSoftmaxOpGrad(unittest.TestCase):
diff --git a/test/legacy_test/test_multinomial_op.py b/test/legacy_test/test_multinomial_op.py
index 2b9ae85ef318f..bb4c53fb34821 100644
--- a/test/legacy_test/test_multinomial_op.py
+++ b/test/legacy_test/test_multinomial_op.py
@@ -48,6 +48,7 @@ class TestMultinomialOp(OpTest):
     def setUp(self):
         paddle.enable_static()
         self.op_type = "multinomial"
+        self.python_api = paddle.multinomial
         self.init_data()
         self.inputs = {"X": self.input_np}
 
@@ -58,7 +59,7 @@ def init_data(self):
         self.attrs = {"num_samples": 100000, "replacement": True}
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def sample_output(self, out):
         return sample_output_one_dimension(out, 4)
@@ -109,6 +110,7 @@ class TestMultinomialFP16Op(OpTest):
     def setUp(self):
         paddle.enable_static()
         self.op_type = "multinomial"
+        self.python_api = paddle.multinomial
         self.dtype = np.float16
         self.init_data()
         self.inputs = {"X": self.input_np}
@@ -120,7 +122,7 @@ def init_data(self):
         self.attrs = {"num_samples": 100000, "replacement": True}
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def sample_output(self, out):
         return sample_output_one_dimension(out, 4)
@@ -450,6 +452,7 @@ def call_func(self, x):
         return out
 
     def test_static(self):
+        paddle.enable_static()
         main_prog = Program()
         starup_prog = Program()
         with program_guard(main_prog, starup_prog):
@@ -473,6 +476,7 @@ def test_static(self):
             # Test for Inference Predictor
             infer_outs = self.infer_prog()
             np.testing.assert_equal(infer_outs[1].shape, (3, 3))
+        paddle.disable_static()
 
 
 if __name__ == "__main__":
diff --git a/test/legacy_test/test_poisson_op.py b/test/legacy_test/test_poisson_op.py
index a60bd0c43a1cd..2002b94ac8013 100644
--- a/test/legacy_test/test_poisson_op.py
+++ b/test/legacy_test/test_poisson_op.py
@@ -63,7 +63,7 @@ def verify_output(self, outs):
         np.testing.assert_allclose(hist, prob, rtol=0.01)
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def test_check_grad_normal(self):
         self.check_grad(
@@ -408,7 +408,9 @@ def verify_output(self, outs):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place_customized(self.verify_output, place)
+        self.check_output_with_place_customized(
+            self.verify_output, place, check_new_ir=True
+        )
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
@@ -420,6 +422,7 @@ def test_check_grad(self):
             user_defined_grad_outputs=[
                 np.random.rand(2048, 1024).astype("float32")
             ],
+            check_new_ir=True,
         )
 
 
diff --git a/test/legacy_test/test_randint_op.py b/test/legacy_test/test_randint_op.py
index e6bdff627face..a48750eebdc7d 100644
--- a/test/legacy_test/test_randint_op.py
+++ b/test/legacy_test/test_randint_op.py
@@ -36,6 +36,7 @@ def output_hist(out):
 class TestRandintOp(OpTest):
     def setUp(self):
         self.op_type = "randint"
+        self.python_api = paddle.randint
         self.inputs = {}
         self.init_attrs()
         self.outputs = {"Out": np.zeros((10000, 784)).astype("float32")}
@@ -45,7 +46,7 @@ def init_attrs(self):
         self.output_hist = output_hist
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
@@ -70,6 +71,7 @@ def test_errors(self):
 class TestRandintOp_attr_tensorlist(OpTest):
     def setUp(self):
         self.op_type = "randint"
+        self.python_api = paddle.randint
         self.new_shape = (10000, 784)
         shape_tensor = []
         for index, ele in enumerate(self.new_shape):
@@ -85,7 +87,7 @@ def init_attrs(self):
         self.output_hist = output_hist
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
@@ -95,6 +97,7 @@ def verify_output(self, outs):
 class TestRandint_attr_tensor(OpTest):
     def setUp(self):
         self.op_type = "randint"
+        self.python_api = paddle.randint
         self.inputs = {"ShapeTensor": np.array([10000, 784]).astype("int64")}
         self.init_attrs()
         self.outputs = {"Out": np.zeros((10000, 784)).astype("int64")}
@@ -104,7 +107,7 @@ def init_attrs(self):
         self.output_hist = output_hist
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
diff --git a/test/legacy_test/test_randperm_op.py b/test/legacy_test/test_randperm_op.py
index 0a5bed32f6dee..ceb8b82aa0f55 100644
--- a/test/legacy_test/test_randperm_op.py
+++ b/test/legacy_test/test_randperm_op.py
@@ -83,7 +83,7 @@ def init_attrs(self):
         pass
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, check_new_ir=True)
 
     def verify_output(self, outs):
         out_np = np.array(outs[0])

From 04d09e6fb69d45a354e0a3bac184114a88e21568 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= <Xs1580802568@gmail.com>
Date: Fri, 22 Sep 2023 11:33:32 +0800
Subject: [PATCH 062/115] move ir_copy from namespace optim to ir_utils
 (#57582)

---
 .../cinn/auto_schedule/analysis/analyze_ir.cc |  2 +-
 .../cost_model/feature_extractor.cc           |  2 +-
 .../database/jsonfile_database_test.cc        |  2 +-
 .../search_strategy/evolutionary_search.cc    |  8 ++--
 .../cinn/auto_schedule/task/task_optimizer.cc |  4 +-
 .../cinn/auto_schedule/task/task_registry.h   |  2 +-
 paddle/cinn/backends/codegen_cuda_util.h      |  2 +-
 paddle/cinn/common/cas.cc                     | 16 +++----
 paddle/cinn/ir/schedule/ir_schedule.cc        | 43 ++++++++++---------
 paddle/cinn/ir/schedule/ir_schedule_util.cc   | 18 ++++----
 paddle/cinn/ir/test/ir_copy_test.cc           |  7 +--
 paddle/cinn/ir/test/schedule_desc_test.cc     | 17 +++++---
 paddle/cinn/ir/utils/ir_copy.cc               | 12 +++---
 paddle/cinn/ir/utils/ir_copy.h                |  6 +--
 paddle/cinn/ir/utils/ir_replace.cc            |  4 +-
 paddle/cinn/optim/compute_inline_expand.cc    |  6 +--
 paddle/cinn/optim/optimize.cc                 |  4 +-
 paddle/cinn/optim/replace_call_with_expr.cc   |  4 +-
 paddle/cinn/optim/replace_var_with_expr.cc    |  2 +-
 paddle/cinn/optim/transform_gpu_forloop.cc    |  8 ++--
 paddle/cinn/optim/unroll_loops.cc             |  2 +-
 paddle/cinn/optim/vectorize_loops.cc          | 19 ++++----
 paddle/cinn/poly/stage.cc                     |  2 +-
 paddle/cinn/pybind/optim.cc                   |  4 +-
 24 files changed, 101 insertions(+), 95 deletions(-)

diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
index da2c063d9c00d..d8d87328bd3f8 100644
--- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
+++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
@@ -41,7 +41,7 @@ std::vector<ir::Var> IndicesToVars(const std::vector<ir::Expr>& indices) {
   for (const ir::Expr& e : indices) {
     // Whether we have to convert other types, like const numbers to Var?
     if (e.As<ir::_Var_>() != nullptr) {
-      ir::Expr copy_e = optim::IRCopy(e);
+      ir::Expr copy_e = ir::ir_utils::IRCopy(e);
       ir::_Var_* var_ref = copy_e.As<ir::_Var_>();
       result.emplace_back(ir::Var(var_ref));
     }
diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc
index db2d3f62ed6a9..1228d5abaf072 100644
--- a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc
+++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc
@@ -218,7 +218,7 @@ void FeatureExtractor::Visit(const For *x) {
 }
 
 void FeatureExtractor::Visit(const PolyFor *x) {
-  Expr copy = optim::IRCopy(Expr(x));
+  Expr copy = ir::ir_utils::IRCopy(Expr(x));
   feature_.IntoLoopBlock();
   optim::TransformPolyForToFor(&copy);
   ir::For *loop = copy.As<For>();
diff --git a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc
index 9a7bf9d568bc3..6d4e8a70cc17b 100644
--- a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc
+++ b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc
@@ -56,7 +56,7 @@ ir::IRSchedule MakeIRSchedule(const std::vector<ir::LoweredFunc>& lowered_funcs,
                               const std::string& task_key) {
   std::vector<Expr> exprs;
   for (auto&& func : lowered_funcs) {
-    exprs.emplace_back(optim::IRCopy(func->body));
+    exprs.emplace_back(ir::ir_utils::IRCopy(func->body));
   }
   InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
   task_registry->Regist(task_key, ir::ModuleExpr(exprs));
diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
index 1881697237e82..5bb351767e8cb 100644
--- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
+++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
@@ -134,7 +134,7 @@ std::vector<SearchState> EvolutionarySearch::GetTopKCandidatesFromDatabase(
   InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
   for (auto&& record : records) {
     ir::IRSchedule ir_sch(
-        optim::IRCopy(task_registry->Get(task_key)->module_expr),
+        ir::ir_utils::IRCopy(task_registry->Get(task_key)->module_expr),
         utils::ForkRandomState(&rand_seed_));
     ir::ScheduleDesc::ReplayWithProto(record.trace, &ir_sch);
     results.emplace_back(SearchState(std::move(ir_sch), record.predicted_cost));
@@ -181,9 +181,9 @@ SearchState EvolutionarySearch::CrossOver(const SearchState& state1,
 
   for (size_t i = 0; i < father_exprs.size(); ++i) {
     if (utils::SampleUniformInt(0, 2, &rand_seed_) == 0) {
-      cross_over_exprs.push_back(optim::IRCopy(father_exprs[i]));
+      cross_over_exprs.push_back(ir::ir_utils::IRCopy(father_exprs[i]));
     } else {
-      cross_over_exprs.push_back(optim::IRCopy(mother_exprs[i]));
+      cross_over_exprs.push_back(ir::ir_utils::IRCopy(mother_exprs[i]));
     }
   }
   auto res = SearchState(ir::IRSchedule(ir::ModuleExpr(cross_over_exprs),
@@ -217,7 +217,7 @@ SearchState EvolutionarySearch::Mutate(
   const auto& task_key = tune_task_.serialized_key;
   InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
   ir::IRSchedule new_ir_sch(
-      optim::IRCopy(task_registry->Get(task_key)->module_expr),
+      ir::ir_utils::IRCopy(task_registry->Get(task_key)->module_expr),
       utils::ForkRandomState(rand_seed));
   new_trace.Replay(&new_ir_sch, true);
   ApplyPostScheduleRules(&new_ir_sch, post_schedule_rules_);
diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.cc b/paddle/cinn/auto_schedule/task/task_optimizer.cc
index ea1b18c764533..d76797d9953ec 100644
--- a/paddle/cinn/auto_schedule/task/task_optimizer.cc
+++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc
@@ -247,7 +247,7 @@ TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution(
   auto& optimized_funcs = result.functions;
   auto& best_cost = result.cost;
   // use initial lowered function as default result
-  optimized_funcs = optim::IRCopy(task_->lowered_funcs);
+  optimized_funcs = ir::ir_utils::IRCopy(task_->lowered_funcs);
   if (options.num_measure_trials ==
       0) {  // no need to measure and simply return the best searched
     std::vector<MeasureInput> measure_candidates;
@@ -347,7 +347,7 @@ std::vector<SearchState> TaskOptimizer::SearchOneRound(
     CHECK_EQ(best_exprs.size(), task_->lowered_funcs.size())
         << "RuntimeError: Expr size is not equal to LoweredFunc size in "
            "TaskOptimizer";
-    auto init_funcs = optim::IRCopy(task_->lowered_funcs);
+    auto init_funcs = ir::ir_utils::IRCopy(task_->lowered_funcs);
     std::vector<ir::LoweredFunc> valid_funcs;
     for (size_t j = 0; j < best_exprs.size(); ++j) {
       auto updated_f =
diff --git a/paddle/cinn/auto_schedule/task/task_registry.h b/paddle/cinn/auto_schedule/task/task_registry.h
index 7cff52c220461..22eb49fa2c0a1 100644
--- a/paddle/cinn/auto_schedule/task/task_registry.h
+++ b/paddle/cinn/auto_schedule/task/task_registry.h
@@ -63,7 +63,7 @@ class InitialTaskRegistry : public Registry<InitialTaskInfo> {
     std::lock_guard<std::mutex> guard(registering_mutex);
     if (fmap_.count(task_key) == 0) {
       InitialTaskInfo* task_info =
-          new InitialTaskInfo(task_key, optim::IRCopy(module_expr));
+          new InitialTaskInfo(task_key, ir::ir_utils::IRCopy(module_expr));
       __REGISTER__(task_key, task_info);
     }
   }
diff --git a/paddle/cinn/backends/codegen_cuda_util.h b/paddle/cinn/backends/codegen_cuda_util.h
index 1b406ef2457e1..6ae64cbb36172 100644
--- a/paddle/cinn/backends/codegen_cuda_util.h
+++ b/paddle/cinn/backends/codegen_cuda_util.h
@@ -127,7 +127,7 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
   }
 
   Expr CreateDeviceFunctionGivenDeviceKernel(Expr expr) {
-    auto copied = optim::IRCopy(expr);
+    auto copied = ir::ir_utils::IRCopy(expr);
     auto* lowered_func = copied.as_lowered_func();
     lowered_func->name = GenDeviceKernelName(lowered_func->name);
     return copied;
diff --git a/paddle/cinn/common/cas.cc b/paddle/cinn/common/cas.cc
index bf1c9092ed5eb..727c3b98e4ced 100644
--- a/paddle/cinn/common/cas.cc
+++ b/paddle/cinn/common/cas.cc
@@ -1584,7 +1584,7 @@ bool CASasSymbol(Expr expr) {
 
 Expr ConvertCinnToCAS(Expr expr) {
   VLOG(7) << "Begin ConvertCinnToCAS " << expr;
-  Expr copied = optim::IRCopy(expr);
+  Expr copied = ir::ir_utils::IRCopy(expr);
   struct Mutator : public ir::IRMutator<ir::Expr*> {
     void operator()(Expr* expr) { Visit(expr); }
     void Visit(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
@@ -1710,7 +1710,7 @@ Expr ConvertCinnToCAS(Expr expr) {
  * simplify the condition ensures correctness, though not sufficient.
  */
 Expr ReplaceMinToConstant(Expr expr) {
-  Expr copied = optim::IRCopy(expr);
+  Expr copied = ir::ir_utils::IRCopy(expr);
   struct Mutator : public ir::IRMutator<ir::Expr*> {
     void operator()(Expr* expr) { Visit(expr); }
     void Visit(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
@@ -1727,10 +1727,10 @@ Expr ReplaceMinToConstant(Expr expr) {
       auto min_b = op->b();
       if (min_a.is_constant() && !min_b.is_constant()) {
         CHECK(min_a->type().is_integer());
-        *expr = optim::IRCopy(min_a);
+        *expr = ir::ir_utils::IRCopy(min_a);
       } else if (min_b.is_constant() && !min_a.is_constant()) {
         CHECK(min_b->type().is_integer());
-        *expr = optim::IRCopy(min_b);
+        *expr = ir::ir_utils::IRCopy(min_b);
       }
     }
   };
@@ -1743,7 +1743,7 @@ Expr ReplaceMinToConstant(Expr expr) {
  * constant value and 1 inconstant value, return the constant max value.
  */
 Expr ReplaceMaxToConstant(Expr expr) {
-  Expr copied = optim::IRCopy(expr);
+  Expr copied = ir::ir_utils::IRCopy(expr);
   struct Mutator : public ir::IRMutator<ir::Expr*> {
     void operator()(Expr* expr) { Visit(expr); }
     void Visit(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
@@ -1760,10 +1760,10 @@ Expr ReplaceMaxToConstant(Expr expr) {
       auto max_b = op->b();
       if (max_a.is_constant() && !max_b.is_constant()) {
         CHECK(max_a->type().is_integer());
-        *expr = optim::IRCopy(max_a);
+        *expr = ir::ir_utils::IRCopy(max_a);
       } else if (max_b.is_constant() && !max_a.is_constant()) {
         CHECK(max_b->type().is_integer());
-        *expr = optim::IRCopy(max_b);
+        *expr = ir::ir_utils::IRCopy(max_b);
       }
     }
   };
@@ -1773,7 +1773,7 @@ Expr ReplaceMaxToConstant(Expr expr) {
 
 Expr ConvertCasToCinn(Expr expr) {
   VLOG(7) << "Begin ConvertCasToCinn : " << expr;
-  Expr copied = optim::IRCopy(expr);
+  Expr copied = ir::ir_utils::IRCopy(expr);
 
   struct Mutator : ir::IRMutator<Expr*> {
     void operator()(Expr* expr) { Visit(expr); }
diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc
index fab8a53deb121..b4067d1fbd05a 100644
--- a/paddle/cinn/ir/schedule/ir_schedule.cc
+++ b/paddle/cinn/ir/schedule/ir_schedule.cc
@@ -189,7 +189,7 @@ std::vector<Expr> ScheduleImpl::Split(const Expr& loop,
     new_loop_vars.push_back(temp_var);
   }
   substitute_value = common::AutoSimplify(substitute_value);
-  Expr new_node = optim::IRCopy(for_node->body);
+  Expr new_node = ir::ir_utils::IRCopy(for_node->body);
   ReplaceExpr(&new_node, {for_node->loop_var}, {substitute_value});
   std::vector<Expr> splited_loops;
   splited_loops.resize(processed_factors.size());
@@ -252,7 +252,7 @@ Expr ScheduleImpl::Fuse(const std::vector<Expr>& loops) {
   }
   substitute_value[0] = fused_expr;
 
-  Expr fused_body = optim::IRCopy(for_nodes.back()->body);
+  Expr fused_body = ir::ir_utils::IRCopy(for_nodes.back()->body);
   ReplaceExpr(&fused_body, loop_vars, substitute_value);
   optim::Simplify(&fused_body);
   Expr fused_extent(1);
@@ -321,7 +321,7 @@ void ScheduleImpl::MutateForType(const Expr& loop,
       << "loop is not serial, current forloop type is "
       << static_cast<int>(for_node->for_type()) << ", and it cannot become "
       << static_cast<int>(for_type);
-  auto loop_copy = optim::IRCopy(loop);
+  auto loop_copy = ir::ir_utils::IRCopy(loop);
   auto* new_for_node = loop_copy.As<ir::For>();
   CHECK(new_for_node);
   new_for_node->set_for_type(for_type);
@@ -674,7 +674,7 @@ struct RfCreater : public ir::IRMutator<> {
     CHECK(root_realize);
     auto root_block = root_realize->schedule_block.As<ScheduleBlock>();
     CHECK(root_block);
-    Expr root_loop = optim::IRCopy(root_block->body);
+    Expr root_loop = ir::ir_utils::IRCopy(root_block->body);
     if (auto block = root_loop.As<Block>()) {
       CHECK_EQ(block->stmts.size(), 1U)
           << "rfactor root should only have one block stmt";
@@ -685,13 +685,13 @@ struct RfCreater : public ir::IRMutator<> {
     auto rf_for = rf_loop_.As<For>();
     CHECK(rf_for);
     // create new rfactor forloops
-    Expr new_rf_forloop = optim::IRCopy(root_loop);
+    Expr new_rf_forloop = ir::ir_utils::IRCopy(root_loop);
     RfMutator rf_mutator(rf_loop_, rf_axis_);
     rf_mutator(&new_rf_forloop);
     VLOG(3) << "After RfMutator, new rf_forloop is\n" << new_rf_forloop;
     auto new_rf_tensor = rf_mutator.GetNewRfTensor();
     // create final write-back forloops
-    Expr final_forloop = optim::IRCopy(root_loop);
+    Expr final_forloop = ir::ir_utils::IRCopy(root_loop);
     FinalMutator final_mutator(rf_loop_, rf_axis_, new_rf_tensor);
     final_mutator(&final_forloop);
     VLOG(3) << "After FinalMuator, final write-back forloop is\n"
@@ -721,7 +721,7 @@ struct CacheReadRewriter : public ir::IRMutator<> {
  public:
   static Expr Rewrite(const Expr& root, CacheBlockInfo* info) {
     CacheReadRewriter rewriter(root, info);
-    Expr new_root = optim::IRCopy(root);
+    Expr new_root = ir::ir_utils::IRCopy(root);
     rewriter(&new_root);
     return new_root;
   }
@@ -762,7 +762,7 @@ struct CacheWriteRewriter : public ir::IRMutator<> {
  public:
   static Expr Rewrite(const Expr& root, CacheBlockInfo* info) {
     CacheWriteRewriter rewriter(root, info);
-    Expr new_root = optim::IRCopy(root);
+    Expr new_root = ir::ir_utils::IRCopy(root);
     rewriter.mutate_cache_block = true;
     rewriter(&info->cache_block);
     rewriter.mutate_cache_block = false;
@@ -1194,7 +1194,7 @@ struct LoopReconstructor : public ir::IRMutator<> {
                             loop_.As<ir::For>()->device_api,
                             std::move(loop_body));
     }
-    new_loop_ = optim::IRCopy(loop_);
+    new_loop_ = ir::ir_utils::IRCopy(loop_);
 
     // Replace the copied Tensor object with the original Tensor object,
     // to ensure that the same Tensor in a AST is the same object.
@@ -1431,9 +1431,9 @@ void ScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) {
   }
 
   Expr result = loops.size() < block_loops.size()
-                    ? optim::IRCopy(block_loops[loops.size()])
-                    : optim::IRCopy(this_block);
-  Expr new_loop = optim::IRCopy(this_loop);
+                    ? ir::ir_utils::IRCopy(block_loops[loops.size()])
+                    : ir::ir_utils::IRCopy(this_block);
+  Expr new_loop = ir::ir_utils::IRCopy(this_loop);
 
   // Get the body of block_loop under the same loops
   auto body = block_loops.at(loops.size() - 1).As<ir::For>()->body;
@@ -1608,7 +1608,7 @@ void ComputeInliner::Visit(const ir::Load* expr, Expr* op) {
 Expr ComputeInliner::ReplaceInlinedTensor(Expr* load) {
   CHECK(load->As<ir::Load>());
   SetIndexSubstitution(load->As<ir::Load>()->indices);
-  Expr value_copy = optim::IRCopy(inlined_store_.As<Store>()->value);
+  Expr value_copy = ir::ir_utils::IRCopy(inlined_store_.As<Store>()->value);
   ReplaceExpr(&value_copy, idx_sub_var_, idx_sub_expr_);
   return value_copy;
 }
@@ -1684,7 +1684,7 @@ void ReverseComputeInliner::Visit(const ir::Store* expr, Expr* op) {
 Expr ReverseComputeInliner::ReplaceInlinedTensor(Expr* load) {
   CHECK(load->As<ir::Load>());
   SetIndexSubstitution(load->As<ir::Load>()->indices);
-  Expr value_copy = optim::IRCopy(inlined_store_.As<Store>()->value);
+  Expr value_copy = ir::ir_utils::IRCopy(inlined_store_.As<Store>()->value);
   return value_copy;
 }
 
@@ -1699,7 +1699,7 @@ Expr ReverseComputeInliner::ReplaceTargetTensor(Expr* store) {
     idx_sub_expr_.emplace_back(idx_vars_[i]);
   }
 
-  Expr value_copy = optim::IRCopy(target_store_);
+  Expr value_copy = ir::ir_utils::IRCopy(target_store_);
   ReplaceExpr(&value_copy, idx_sub_var_, idx_sub_expr_);
   return value_copy;
 }
@@ -1936,7 +1936,7 @@ void ScheduleImpl::Annotate(const Expr& block,
   CHECK(block.As<ir::ScheduleBlockRealize>());
   CHECK(block.As<ir::ScheduleBlockRealize>()
             ->schedule_block.As<ir::ScheduleBlock>());
-  auto copied_block = optim::IRCopy(block);
+  auto copied_block = ir::ir_utils::IRCopy(block);
   auto* schedule_block = copied_block.As<ir::ScheduleBlockRealize>()
                              ->schedule_block.As<ir::ScheduleBlock>();
   schedule_block->attrs.emplace(key, value);
@@ -2195,7 +2195,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block,
   }
   CHECK(!used_target_loop_vars.empty());
   std::vector<Expr> used_target_loops;
-  auto expr_copy = optim::IRCopy(expr);
+  auto expr_copy = ir::ir_utils::IRCopy(expr);
   for (auto& var : used_target_loop_vars) {
     auto find_loop_var = ir::ir_utils::CollectIRNodesWithoutTensor(
         expr_copy,
@@ -2220,7 +2220,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block,
   VLOG(3) << "changed_loop_num is : " << changed_loop_num;
   VLOG(3) << "old_iter_values.size() is : " << old_iter_values.size();
   if (changed_loop_num >= static_cast<int>(old_iter_values.size())) {
-    new_loop = optim::IRCopy(block);
+    new_loop = ir::ir_utils::IRCopy(block);
     new_loop.As<ir::ScheduleBlockRealize>()->iter_values = new_iter_values;
   } else {
     CHECK(old_iter_values[changed_loop_num].as_var());
@@ -2234,7 +2234,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block,
         },
         true);
     CHECK_EQ(find_partial_loop.size(), 1U);
-    new_loop = optim::IRCopy(*find_partial_loop.begin());
+    new_loop = ir::ir_utils::IRCopy(*find_partial_loop.begin());
     auto find_schedule_block = ir::ir_utils::CollectIRNodesWithoutTensor(
         new_loop,
         [&](const Expr* x) { return x->As<ir::ScheduleBlockRealize>(); },
@@ -2332,13 +2332,14 @@ IRSchedule::IRSchedule(ir::ModuleExpr&& mod_expr,
 }
 
 IRSchedule::IRSchedule(const IRSchedule& other)
-    : impl_(std::make_unique<ScheduleImpl>(optim::IRCopy(other.GetModule()))),
+    : impl_(std::make_unique<ScheduleImpl>(
+          ir::ir_utils::IRCopy(other.GetModule()))),
       trace_(other.trace_) {
   this->InitSeed(other.ForkSeed());
 }
 
 IRSchedule& IRSchedule::operator=(const IRSchedule& src) {
-  impl_ = std::make_unique<ScheduleImpl>(optim::IRCopy(src.GetModule()));
+  impl_ = std::make_unique<ScheduleImpl>(ir::ir_utils::IRCopy(src.GetModule()));
   trace_ = src.trace_;
   this->InitSeed(src.ForkSeed());
   return *this;
diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc
index 45779788e9c54..a4c9ef62b25f2 100644
--- a/paddle/cinn/ir/schedule/ir_schedule_util.cc
+++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc
@@ -348,8 +348,8 @@ IterRange GetAccessedRange(const Expr& index,
     var_maxs.emplace_back(range.min + range.extent - 1);
   }
 
-  Expr indice_min = optim::IRCopy(index);
-  Expr indice_max = optim::IRCopy(index);
+  Expr indice_min = ir::ir_utils::IRCopy(index);
+  Expr indice_max = ir::ir_utils::IRCopy(index);
   // replace the var by the corresponding iter_value
   ReplaceExpr(&indice_min, iter_vars, var_mins);
   ReplaceExpr(&indice_max, iter_vars, var_maxs);
@@ -408,7 +408,7 @@ std::vector<IterRange> CalculateTensorRegions(
 
   std::vector<IterRange> result;
   for (int i = 0; i < tensor_indices.size(); ++i) {
-    Expr binded_index = optim::IRCopy(tensor_indices[i]);
+    Expr binded_index = ir::ir_utils::IRCopy(tensor_indices[i]);
     ReplaceExpr(&binded_index, iter_vars, iter_values);
     auto range = GetAccessedRange(binded_index, loop_vars, loop_ranges);
 
@@ -656,7 +656,7 @@ Expr ConstructOtherStmtChain(const std::vector<Expr>& stmts,
                              const std::vector<int> reordered_indices) {
   Expr new_loop;
   for (int i = reordered_indices.size() - 1; i >= 0; --i) {
-    Expr temp = optim::IRCopy(loops[reordered_indices[i]]);
+    Expr temp = ir::ir_utils::IRCopy(loops[reordered_indices[i]]);
     CHECK(temp.defined());
     CHECK(temp.As<ir::For>());
     if (new_loop.defined()) {
@@ -695,10 +695,10 @@ Expr ConstructNewLoopChain(const std::vector<Expr>& chain,
     Expr temp;
     if (loop_set.count(loop_in_chain)) {
       CHECK_GE(index, 0);
-      temp = optim::IRCopy(ordered_loops[index]);
+      temp = ir::ir_utils::IRCopy(ordered_loops[index]);
       --index;
     } else {
-      temp = optim::IRCopy(loop_in_chain);
+      temp = ir::ir_utils::IRCopy(loop_in_chain);
     }
     CHECK(temp.defined());
     CHECK(temp.As<ir::For>());
@@ -1029,9 +1029,9 @@ std::vector<IterRange> CalculateRequiredRegions(
     for (const Expr& req_block : required_blocks) {
       CHECK(req_block.As<ir::ScheduleBlockRealize>());
       Expr block_body =
-          optim::IRCopy(req_block.As<ir::ScheduleBlockRealize>()
-                            ->schedule_block.As<ir::ScheduleBlock>()
-                            ->body);
+          ir::ir_utils::IRCopy(req_block.As<ir::ScheduleBlockRealize>()
+                                   ->schedule_block.As<ir::ScheduleBlock>()
+                                   ->body);
       auto iter_vars = req_block.As<ir::ScheduleBlockRealize>()
                            ->schedule_block.As<ir::ScheduleBlock>()
                            ->iter_vars;
diff --git a/paddle/cinn/ir/test/ir_copy_test.cc b/paddle/cinn/ir/test/ir_copy_test.cc
index cd3199d4947dd..570e1b12aa213 100644
--- a/paddle/cinn/ir/test/ir_copy_test.cc
+++ b/paddle/cinn/ir/test/ir_copy_test.cc
@@ -19,13 +19,14 @@
 #include "paddle/cinn/ir/utils/ir_printer.h"
 
 namespace cinn {
-namespace optim {
+namespace ir {
+namespace ir_utils {
 
 TEST(IrCopy, basic) {
   Expr a(1.f);
   auto aa = IRCopy(a);
   LOG(INFO) << "aa " << aa;
 }
-
-}  // namespace optim
+}  // namespace ir_utils
+}  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/test/schedule_desc_test.cc b/paddle/cinn/ir/test/schedule_desc_test.cc
index 66a01b14b08d7..a798101813868 100644
--- a/paddle/cinn/ir/test/schedule_desc_test.cc
+++ b/paddle/cinn/ir/test/schedule_desc_test.cc
@@ -95,7 +95,7 @@ std::vector<ir::LoweredFunc> LowerCompute(
 IRSchedule MakeIRSchedule(const std::vector<ir::LoweredFunc>& lowered_funcs) {
   std::vector<Expr> exprs;
   for (auto&& func : lowered_funcs) {
-    exprs.emplace_back(optim::IRCopy(func->body));
+    exprs.emplace_back(ir::ir_utils::IRCopy(func->body));
   }
   return ir::IRSchedule(ir::ModuleExpr(exprs));
 }
@@ -106,10 +106,11 @@ std::string SourceCodeGen(const ModuleExpr& module_expr,
                           const Target& target) {
   auto exprs = module_expr.GetExprs();
   CHECK_EQ(exprs.size(), lowered_funcs.size()) << "size of func is not euqal";
-  std::vector<ir::LoweredFunc> updated_funcs = optim::IRCopy(lowered_funcs);
+  std::vector<ir::LoweredFunc> updated_funcs =
+      ir::ir_utils::IRCopy(lowered_funcs);
   Module::Builder builder("test_module", target);
   for (auto i = 0; i < lowered_funcs.size(); ++i) {
-    updated_funcs[i]->body = optim::IRCopy(exprs.at(i));
+    updated_funcs[i]->body = ir::ir_utils::IRCopy(exprs.at(i));
     builder.AddFunction(updated_funcs[i]);
   }
   auto module = builder.Build();
@@ -839,12 +840,14 @@ TEST_F(TestScheduleDesc, StepKind_MergeExprs) {
   auto funcs_1 =
       LowerCompute({32, 32, 32}, target, true, "elementwise-add_const");
 
-  ir::IRSchedule ir_sch = ir::IRSchedule(ir::ModuleExpr(
-      {optim::IRCopy(funcs_0[0]->body), optim::IRCopy(funcs_0[0]->body)}));
+  ir::IRSchedule ir_sch =
+      ir::IRSchedule(ir::ModuleExpr({ir::ir_utils::IRCopy(funcs_0[0]->body),
+                                     ir::ir_utils::IRCopy(funcs_0[0]->body)}));
   ir_sch.MergeExprs();
   trace.Append(ScheduleDesc::Step("MergeExprs", {}, {}, {}));
-  ir::IRSchedule replay_sch = ir::IRSchedule(ir::ModuleExpr(
-      {optim::IRCopy(funcs_0[0]->body), optim::IRCopy(funcs_0[0]->body)}));
+  ir::IRSchedule replay_sch =
+      ir::IRSchedule(ir::ModuleExpr({ir::ir_utils::IRCopy(funcs_0[0]->body),
+                                     ir::ir_utils::IRCopy(funcs_0[0]->body)}));
   trace.Replay(&replay_sch);
 
   auto lhs_exprs = ir_sch.GetModule().GetExprs();
diff --git a/paddle/cinn/ir/utils/ir_copy.cc b/paddle/cinn/ir/utils/ir_copy.cc
index 22d7c99bcd322..b157f6030a5e6 100644
--- a/paddle/cinn/ir/utils/ir_copy.cc
+++ b/paddle/cinn/ir/utils/ir_copy.cc
@@ -27,9 +27,9 @@
 #include "paddle/cinn/ir/utils/ir_printer.h"
 
 namespace cinn {
-namespace optim {
-using namespace ir;  // NOLINT
-
+namespace ir {
+namespace ir_utils {
+namespace {
 struct IRCopyVisitor : public ir::IRVisitorRequireReImpl<Expr> {
   // Use maps to unify all the copied tensors and buffers.
   std::map<std::string, ir::_Tensor_*> tensor_map;
@@ -474,7 +474,7 @@ Expr IRCopyVisitor::Visit(const ir::intrinsics::BuiltinIntrin* op) {
   return intrinsics::BuiltinIntrin::Make(
       op->name, op->args, op->id, op->arg_nums, op->type());
 }
-
+}  // namespace
 Expr IRCopy(Expr x) {
   IRCopyVisitor visitor;
   auto copied = visitor.Visit(&x);
@@ -507,6 +507,6 @@ std::vector<ir::LoweredFunc> IRCopy(const std::vector<ir::LoweredFunc>& x) {
   }
   return res;
 }
-
-}  // namespace optim
+}  // namespace ir_utils
+}  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_copy.h b/paddle/cinn/ir/utils/ir_copy.h
index 726739394eba6..594f07e91cfa0 100644
--- a/paddle/cinn/ir/utils/ir_copy.h
+++ b/paddle/cinn/ir/utils/ir_copy.h
@@ -24,9 +24,8 @@ namespace cinn {
 
 namespace ir {
 class ModuleExpr;
-}  // namespace ir
 
-namespace optim {
+namespace ir_utils {
 
 //! Shallow copy an expression.
 Expr IRCopy(Expr x);
@@ -39,5 +38,6 @@ ir::LoweredFunc IRCopy(const ir::LoweredFunc& x);
 
 std::vector<ir::LoweredFunc> IRCopy(const std::vector<ir::LoweredFunc>& x);
 
-}  // namespace optim
+}  // namespace ir_utils
+}  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_replace.cc b/paddle/cinn/ir/utils/ir_replace.cc
index da2305359c5e9..a1b3138291023 100644
--- a/paddle/cinn/ir/utils/ir_replace.cc
+++ b/paddle/cinn/ir/utils/ir_replace.cc
@@ -43,14 +43,14 @@ struct IrReplaceMutator : ir::IRMutator<Expr*> {
   void Visit(const ir::_Var_* op, Expr* expr) override {
     if (op->node_type() == from_->node_type() &&
         from_repr_ == GetStreamCnt(*expr)) {
-      *expr = optim::IRCopy(to_);
+      *expr = ir::ir_utils::IRCopy(to_);
     }
   }
 
   void Visit(const ir::Broadcast* op, Expr* expr) override {
     if (op->node_type() == from_->node_type() &&
         from_repr_ == GetStreamCnt(*expr)) {
-      *expr = optim::IRCopy(to_);
+      *expr = ir::ir_utils::IRCopy(to_);
     }
   }
 
diff --git a/paddle/cinn/optim/compute_inline_expand.cc b/paddle/cinn/optim/compute_inline_expand.cc
index d4123729bc53f..f9563449fb128 100644
--- a/paddle/cinn/optim/compute_inline_expand.cc
+++ b/paddle/cinn/optim/compute_inline_expand.cc
@@ -150,7 +150,7 @@ struct TensorInlineExpandMutator : public ir::IRMutator<> {
         }
         ir::IRMutator<>::Visit(&node->tensor, &node->tensor);
         for (int i = 0; i < node->indices.size(); i++) {
-          auto temp = optim::IRCopy(node->indices[i]);
+          auto temp = ir::ir_utils::IRCopy(node->indices[i]);
           ir::IRMutator<>::Visit(&temp, &temp);
           node->indices[i] = temp;
         }
@@ -159,7 +159,7 @@ struct TensorInlineExpandMutator : public ir::IRMutator<> {
       } else {
         ir::IRMutator<>::Visit(&node->tensor, &node->tensor);
         for (int i = 0; i < node->indices.size(); i++) {
-          auto temp = optim::IRCopy(node->indices[i]);
+          auto temp = ir::ir_utils::IRCopy(node->indices[i]);
           ir::IRMutator<>::Visit(&temp, &temp);
           node->indices[i] = temp;
         }
@@ -167,7 +167,7 @@ struct TensorInlineExpandMutator : public ir::IRMutator<> {
     } else {
       ir::IRMutator<>::Visit(&node->tensor, &node->tensor);
       for (int i = 0; i < node->indices.size(); i++) {
-        auto temp = optim::IRCopy(node->indices[i]);
+        auto temp = ir::ir_utils::IRCopy(node->indices[i]);
         ir::IRMutator<>::Visit(&temp, &temp);
         node->indices[i] = temp;
       }
diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc
index 3764e1bd616e2..d38bc59c058ad 100644
--- a/paddle/cinn/optim/optimize.cc
+++ b/paddle/cinn/optim/optimize.cc
@@ -42,7 +42,7 @@ Expr Optimize(Expr e,
               bool runtime_debug_info,
               bool remove_gpu_for_loops) {
   CHECK(e.defined());
-  auto copied = IRCopy(e);
+  auto copied = ir::ir_utils::IRCopy(e);
 
   FoldCINNCallArguments(&copied);
   TransformPolyForToFor(&copied);
@@ -84,7 +84,7 @@ Expr Optimize(Expr e,
 }
 
 ir::Module Optimize(const ir::Module& module, const Target& target) {
-  auto copied = IRCopy(Expr(module));
+  auto copied = ir::ir_utils::IRCopy(Expr(module));
   UnrollLoop(&copied);
   VectorizeLoops(&copied, Target());
   VLOG(10) << "After VectorizeLoops:" << copied.as_module_ref();
diff --git a/paddle/cinn/optim/replace_call_with_expr.cc b/paddle/cinn/optim/replace_call_with_expr.cc
index d63210d1d28f1..62103aa341b59 100644
--- a/paddle/cinn/optim/replace_call_with_expr.cc
+++ b/paddle/cinn/optim/replace_call_with_expr.cc
@@ -36,7 +36,7 @@ struct ReplaceCallWithExprModifier : public ir::IRMutator<> {
     VLOG(3) << "Processing Call node " << *op;
     if (statement_ != node->name) return;
 
-    Expr expr_candidate = IRCopy(candidate_);
+    Expr expr_candidate = ir::ir_utils::IRCopy(candidate_);
     VLOG(3) << "Original candidate expr: " << candidate_;
     VLOG(3) << "Copied candidate expr: " << expr_candidate;
 
@@ -62,7 +62,7 @@ void ReplaceIslCallWithExpr(Expr *e,
                             const Expr &candidate,
                             const std::map<std::string, Expr> &axis_map) {
   VLOG(3) << "ReplaceCallWithExpr, original expression: " << candidate;
-  Expr copied = IRCopy(candidate);
+  Expr copied = ir::ir_utils::IRCopy(candidate);
   // update the axis in the copied expression.
 
   // we treat the Store node as the normal statement, the others like Call node
diff --git a/paddle/cinn/optim/replace_var_with_expr.cc b/paddle/cinn/optim/replace_var_with_expr.cc
index 57ab47d7c0250..7de44e3470e66 100644
--- a/paddle/cinn/optim/replace_var_with_expr.cc
+++ b/paddle/cinn/optim/replace_var_with_expr.cc
@@ -41,7 +41,7 @@ struct ReplaceVarWithExprMutator : public ir::IRMutator<> {
  private:
   void Visit(const ir::_Var_* expr, Expr* op) override {
     if (expr->name == var_->name && (do_replace_ || visit_all_)) {
-      auto copied = IRCopy(expr_);
+      auto copied = ir::ir_utils::IRCopy(expr_);
       *op = copied;
     }
   }
diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc
index 7b30f75bf9652..a62e24c539e5f 100644
--- a/paddle/cinn/optim/transform_gpu_forloop.cc
+++ b/paddle/cinn/optim/transform_gpu_forloop.cc
@@ -185,7 +185,7 @@ class RestructureVarNodes : public ir::IRMutator<> {
   void Visit(const ir::Load *load, Expr *op) override {
     std::vector<ir::Expr> indices_copied;
     for (const ir::Expr &indice : load->indices) {
-      indices_copied.push_back(IRCopy(indice));
+      indices_copied.push_back(ir::ir_utils::IRCopy(indice));
     }
     op->As<ir::Load>()->indices = indices_copied;
 
@@ -195,7 +195,7 @@ class RestructureVarNodes : public ir::IRMutator<> {
   void Visit(const ir::Store *store, Expr *op) override {
     std::vector<ir::Expr> indices_copied;
     for (const ir::Expr &indice : store->indices) {
-      indices_copied.push_back(IRCopy(indice));
+      indices_copied.push_back(ir::ir_utils::IRCopy(indice));
     }
     op->As<ir::Store>()->indices = indices_copied;
 
@@ -585,7 +585,7 @@ class ResizeBufferSizeVisitor : public ir::IRMutator<> {
   }
 
   int BufferSize(ir::Expr indice) {
-    auto copy = IRCopy(indice);
+    auto copy = ir::ir_utils::IRCopy(indice);
     auto vars = ir::ir_utils::CollectIRNodesInOrder(
         copy, [](const ir::Expr *expr) { return expr->As<ir::_Var_>(); });
 
@@ -598,7 +598,7 @@ class ResizeBufferSizeVisitor : public ir::IRMutator<> {
       auto extent = loop_2_extent_.find(var->name)->second;
 
       for (int idx = 0; idx < extent; ++idx) {
-        auto tmp = IRCopy(index);
+        auto tmp = ir::ir_utils::IRCopy(index);
         ReplaceVarWithExpr(&tmp, var, Expr(idx));
 
         if (deep == vars.size() - 1) {
diff --git a/paddle/cinn/optim/unroll_loops.cc b/paddle/cinn/optim/unroll_loops.cc
index 32d4037b83e3e..1131eb68d4d1b 100644
--- a/paddle/cinn/optim/unroll_loops.cc
+++ b/paddle/cinn/optim/unroll_loops.cc
@@ -94,7 +94,7 @@ struct UnrollMutator : public ir::IRMutator<Expr*> {
 
     for (int i = min->value; i < extent->value; i++) {
       Expr start = op->min + i;
-      body.push_back(optim::IRCopy(op->body));
+      body.push_back(ir::ir_utils::IRCopy(op->body));
       cinn::ir::ir_utils::IrReplace(&body.back(), op->loop_var, start);
     }
 
diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc
index 8ed13e9d5971b..4903a1466b98d 100644
--- a/paddle/cinn/optim/vectorize_loops.cc
+++ b/paddle/cinn/optim/vectorize_loops.cc
@@ -148,11 +148,11 @@ class TensorVectorizeTeller : public ir::IRMutator<const Expr *> {
     }
 
     // check tensor accessed sequentially by comparing index one by one
-    Expr first_idx = optim::IRCopy(indices.back());
+    Expr first_idx = ir::ir_utils::IRCopy(indices.back());
     cinn::ir::ir_utils::IrReplace(&first_idx, Expr(iter_var_), Expr(0));
     const auto &interval = var_intervals_->at(iter_var_->name);
     for (int i = 1; i < interval.r; ++i) {
-      Expr next_idx = optim::IRCopy(indices.back());
+      Expr next_idx = ir::ir_utils::IRCopy(indices.back());
       cinn::ir::ir_utils::IrReplace(&next_idx, Expr(iter_var_), Expr(i));
       auto gap = common::AutoSimplify(Expr(next_idx - first_idx));
       if (!gap.As<IntImm>() || gap.as_int32() != i) {
@@ -800,7 +800,7 @@ struct VectorizeLoops_ : public IRMutator<Expr *> {
         cuda_vectorizer.Visit(&new_forloop->body);
         // unroll the new forloop to compute each element of the vector
         // iteratively
-        auto copied_loop = optim::IRCopy(_new_forloop);
+        auto copied_loop = ir::ir_utils::IRCopy(_new_forloop);
         copied_loop.As<ir::For>()->set_unrolled();
         optim::UnrollLoop(&copied_loop);
         // add cast exprs of vector type in the front of vectorized forloop,
@@ -883,12 +883,13 @@ struct VectorizeLoops_ : public IRMutator<Expr *> {
           Var new_iterator_outer(
               common::UniqName(outer_for->loop_var->name + "_s"));
 
-          Expr inner_for_b = Block::Make({For::Make(new_iterator_inner,
-                                                    inner_for->min,
-                                                    b,
-                                                    ForType::Serial,
-                                                    DeviceAPI::UNK,
-                                                    IRCopy(inner_for->body))});
+          Expr inner_for_b =
+              Block::Make({For::Make(new_iterator_inner,
+                                     inner_for->min,
+                                     b,
+                                     ForType::Serial,
+                                     DeviceAPI::UNK,
+                                     ir::ir_utils::IRCopy(inner_for->body))});
           cinn::ir::ir_utils::IrReplace(
               &inner_for_b, inner_for->loop_var, Expr(new_iterator_inner));
 
diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc
index d74bce1404e5b..01fa3bdb38fd9 100644
--- a/paddle/cinn/poly/stage.cc
+++ b/paddle/cinn/poly/stage.cc
@@ -515,7 +515,7 @@ void Stage::EditTempTensor(Stage *other, int level) {
 
   std::vector<Expr> new_shape;
   for (auto &i : this->tensor()->new_indices) {
-    new_shape.push_back(optim::IRCopy(i));
+    new_shape.push_back(ir::ir_utils::IRCopy(i));
   }
   for (auto &i : new_shape) {
     for (auto &j : dim_to_range) {
diff --git a/paddle/cinn/pybind/optim.cc b/paddle/cinn/pybind/optim.cc
index 00219477e8f85..bb1a18a2c24fe 100755
--- a/paddle/cinn/pybind/optim.cc
+++ b/paddle/cinn/pybind/optim.cc
@@ -36,13 +36,13 @@ void BindSimplify(py::module* m) {
   m->def(
       "simplify",
       [](const Expr& expr) -> Expr {
-        auto copied = optim::IRCopy(expr);
+        auto copied = ir::ir_utils::IRCopy(expr);
         Simplify(&copied);
         return copied;
       },
       py::arg("expr"));
 
-  m->def("ir_copy", py::overload_cast<Expr>(&optim::IRCopy));
+  m->def("ir_copy", py::overload_cast<Expr>(&ir::ir_utils::IRCopy));
 }
 
 }  // namespace

From a83cfb60aec1eea2c48752e52717aa5d11762c40 Mon Sep 17 00:00:00 2001
From: gouzil <66515297+gouzil@users.noreply.github.com>
Date: Fri, 22 Sep 2023 11:48:20 +0800
Subject: [PATCH 063/115] [NewComm] No.13 compatiable upgrade for allreduce
 (#57233)

* [NewComm] update allreduce_op

* fix

* fix include

* fix AllReduce

* try fix

* try ci
---
 .../tensorrt/plugin/c_allreduce_op_plugin.cu  | 55 +++++++++++++++++--
 .../inference/test_trt_convert_c_allreduce.py | 15 +++++
 2 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu
index 8ec06071301c9..1033dc65f2dcc 100644
--- a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu
@@ -15,8 +15,15 @@
 #include <cstring>
 
 #include "glog/logging.h"
+#include "paddle/fluid/distributed/collective/utils.h"
 #include "paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.h"
 #include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+#include "paddle/phi/core/flags.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
+#endif
 
 namespace paddle {
 namespace inference {
@@ -175,12 +182,48 @@ int CAllReducePluginDynamic::enqueue(
       PADDLE_THROW(platform::errors::InvalidArgument("Invalid reduce type: %d",
                                                      red_type_));
   }
-
-  auto comm = platform::NCCLCommContext::Instance().Get(ring_id_);
-  cudaStream_t custream = use_calc_stream_ ? stream : comm->stream();
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-      sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
-
+  const auto& comm_context_manager =
+      phi::distributed::CommContextManager::GetInstance();
+  if (FLAGS_dynamic_static_unified_comm) {
+    PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id_)),
+                      true,
+                      platform::errors::InvalidArgument(
+                          "You choose to use new communication library by "
+                          "setting environment "
+                          "variable FLAGS_dynamic_static_unified_comm True. "
+                          "But ring_id(%d) is "
+                          "not found in comm_context_manager.",
+                          std::to_string(ring_id_)));
+    auto comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
+        comm_context_manager.Get(std::to_string(ring_id_)));
+    PADDLE_ENFORCE_NE(comm_ctx,
+                      nullptr,
+                      platform::errors::Unavailable(
+                          "NCCLCommContext is nullptr, collective op should "
+                          "has ring_id attr."));
+    auto stream = comm_ctx->GetStream();
+    ncclRedOp_t nccl_red_type = ncclSum;
+    // comm_ctx->AllReduce(&inputs[0], inputs[0], nccl_red_type, stream);
+    phi::dynload::ncclAllReduce(sendbuff,
+                                recvbuff,
+                                numel,
+                                dtype,
+                                nccl_red_type,
+                                comm_ctx->GetNcclComm(),
+                                stream);
+    VLOG(3) << "new NCCLCommContext has ring_id_ " << ring_id_;
+  } else {
+    auto comm = platform::NCCLCommContext::Instance().Get(ring_id_);
+    cudaStream_t custream = use_calc_stream_ ? stream : comm->stream();
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff,
+                                                                recvbuff,
+                                                                numel,
+                                                                dtype,
+                                                                nccl_red_type,
+                                                                comm->comm(),
+                                                                custream));
+    VLOG(3) << "old NCCLCommContext has ring_id_ " << ring_id_;
+  }
 #endif
   return (cudaGetLastError() != cudaSuccess);
 }
diff --git a/test/ir/inference/test_trt_convert_c_allreduce.py b/test/ir/inference/test_trt_convert_c_allreduce.py
index 6e3bc5ae9a894..0412ebb2099ef 100644
--- a/test/ir/inference/test_trt_convert_c_allreduce.py
+++ b/test/ir/inference/test_trt_convert_c_allreduce.py
@@ -43,6 +43,21 @@ def test_run(self):
             if len(results) == 2 and results[0] == "c_allreduce_out":
                 self.assertEqual(float(results[1]), self.target_value)
 
+    def test_allreduce_nccl_with_new_comm(self):
+        env = dict(os.environ)
+        env["CUDA_VISIBLE_DEVICES"] = "0,1"
+        env["FLAGS_dynamic_static_unified_comm"] = "1"
+        cmd = f"python -u -m paddle.distributed.fleet.launch --gpus 0,1 {self.script} {self.op_type} {self.precision}"
+        cmd = cmd.split(" ")
+
+        local_proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env)
+
+        local_out, local_err = local_proc.communicate()
+        for line in local_out.decode("utf-8").split("\n"):
+            results = line.split("=")
+            if len(results) == 2 and results[0] == "c_allreduce_out":
+                self.assertEqual(float(results[1]), self.target_value)
+
 
 class TestMin(TestDistTRT):
     def init_case(self):

From 4a15a2e44665da53b97efe6431dc479886aa4602 Mon Sep 17 00:00:00 2001
From: Ryan <44900829+DrRyanHuang@users.noreply.github.com>
Date: Fri, 22 Sep 2023 11:56:17 +0800
Subject: [PATCH 064/115] [PIR] Replace `*ir` names of files and directories on
 the Python side with `*pir` (#57209)

---
 paddle/fluid/pybind/CMakeLists.txt            |  2 +-
 paddle/fluid/pybind/{ir.cc => pir.cc}         |  8 +++---
 paddle/fluid/pybind/{ir.h => pir.h}           |  2 +-
 paddle/fluid/pybind/pybind.cc                 |  4 +--
 python/paddle/_C_ops.py                       |  4 +--
 python/paddle/__init__.py                     |  2 +-
 python/paddle/{_ir_ops.py => _pir_ops.py}     |  4 +--
 python/paddle/autograd/ir_backward.py         | 12 ++++-----
 python/paddle/base/data_feeder.py             |  2 +-
 python/paddle/base/executor.py                | 10 +++----
 python/paddle/base/framework.py               | 12 ++++-----
 python/paddle/base/layer_helper_base.py       |  2 +-
 python/paddle/decomposition/decomp.py         | 14 +++++-----
 python/paddle/decomposition/rules.py          |  4 +--
 python/paddle/incubate/passes/ir.py           |  2 +-
 python/paddle/jit/dy2static/function_spec.py  |  2 +-
 .../jit/dy2static/newir_partial_program.py    |  8 +++---
 .../jit/dy2static/program_translator.py       |  2 +-
 python/paddle/nn/functional/activation.py     |  2 +-
 python/paddle/nn/functional/common.py         |  8 +++---
 python/paddle/nn/initializer/constant.py      |  6 ++---
 python/paddle/nn/initializer/xavier.py        |  6 ++---
 python/paddle/nn/layer/layers.py              |  2 +-
 python/paddle/optimizer/optimizer.py          |  8 +++---
 python/paddle/{ir => pir}/__init__.py         |  4 +--
 python/paddle/{ir => pir}/core.py             |  4 +--
 python/paddle/pir_utils.py                    | 14 +++++-----
 python/paddle/static/input.py                 | 10 +++----
 python/paddle/tensor/manipulation.py          |  6 ++---
 python/paddle/tensor/math.py                  |  6 ++---
 python/setup.py.in                            |  6 ++---
 setup.py                                      |  2 +-
 test/ir/new_ir/test_build_op.py               | 16 ++++++------
 test/ir/new_ir/test_ir_backward.py            | 22 ++++++++--------
 test/ir/new_ir/test_ir_pybind.py              |  8 +++---
 test/ir/new_ir/test_ir_vjp.py                 | 18 ++++++-------
 test/ir/new_ir/test_pass_manager.py           |  6 ++---
 test/ir/new_ir/test_special_op_translator.py  | 26 +++++++++----------
 test/ir/new_ir/test_symbol_overload.py        | 20 +++++++-------
 test/ir/test_op_input_grad_semantic.py        |  6 ++---
 test/legacy_test/op_test.py                   | 20 +++++++-------
 test/legacy_test/test_fill_any_like_op.py     |  4 +--
 test/legacy_test/test_full_like_op.py         |  4 +--
 test/legacy_test/test_reduce_op.py            |  2 +-
 .../prim/new_ir_prim/test_custom_vjp_trait.py |  6 ++---
 test/prim/new_ir_prim/test_decomp_op.py       |  4 +--
 test/prim/new_ir_prim/test_prim_custom_vjp.py |  8 +++---
 test/prim/new_ir_prim/test_prim_simpnet.py    |  8 +++---
 test/prim/new_ir_prim/test_vjp_prim.py        | 14 +++++-----
 49 files changed, 186 insertions(+), 186 deletions(-)
 rename paddle/fluid/pybind/{ir.cc => pir.cc} (99%)
 rename paddle/fluid/pybind/{ir.h => pir.h} (95%)
 rename python/paddle/{_ir_ops.py => _pir_ops.py} (89%)
 rename python/paddle/{ir => pir}/__init__.py (92%)
 rename python/paddle/{ir => pir}/core.py (98%)

diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 34ec9ca0523ee..22e48b0b6a075 100755
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -116,7 +116,7 @@ set(PYBIND_SRCS
     inference_api.cc
     ops_api.cc
     static_op_function.cc
-    ir.cc
+    pir.cc
     graph.cc
     bind_fleet_executor.cc
     reader_py.cc
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/pir.cc
similarity index 99%
rename from paddle/fluid/pybind/ir.cc
rename to paddle/fluid/pybind/pir.cc
index e5e237bf7fe34..1eafe39eb19a5 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/pir.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/pybind/ir.h"
+#include "paddle/fluid/pybind/pir.h"
 #include <Python.h>
 #include <algorithm>
 #include <memory>
@@ -967,7 +967,7 @@ void BindUtils(pybind11::module *m) {
             .. code-block:: python
 
                 import paddle
-                from paddle import ir
+                from paddle import pir
                 paddle.enable_static()
 
                 x = paddle.randn([4, 4])
@@ -1049,8 +1049,8 @@ void BindPassManager(pybind11::module *m) {
       .def("empty", &PassManager::Empty);
 }
 
-void BindNewIR(pybind11::module *module) {
-  auto ir_module = module->def_submodule("ir");
+void BindPIR(pybind11::module *module) {
+  auto ir_module = module->def_submodule("pir");
   BindProgram(&ir_module);
   BindBlock(&ir_module);
   BindOperation(&ir_module);
diff --git a/paddle/fluid/pybind/ir.h b/paddle/fluid/pybind/pir.h
similarity index 95%
rename from paddle/fluid/pybind/ir.h
rename to paddle/fluid/pybind/pir.h
index 7732cccbd5ee3..b64de63452f40 100644
--- a/paddle/fluid/pybind/ir.h
+++ b/paddle/fluid/pybind/pir.h
@@ -18,6 +18,6 @@
 
 namespace paddle {
 namespace pybind {
-void BindNewIR(pybind11::module *m);
+void BindPIR(pybind11::module *m);
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 8e64d26b59b68..0d2c446349e8b 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -122,9 +122,9 @@ limitations under the License. */
 #include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/inference_api.h"
 #include "paddle/fluid/pybind/io.h"
-#include "paddle/fluid/pybind/ir.h"
 #include "paddle/fluid/pybind/jit.h"
 #include "paddle/fluid/pybind/metrics_py.h"
+#include "paddle/fluid/pybind/pir.h"
 #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
 #include "paddle/fluid/pybind/pybind_variant_caster.h"
 #include "paddle/fluid/pybind/xpu_streams_py.h"
@@ -2939,7 +2939,7 @@ All parameter, weight, gradient are variables in Paddle.
   GetAllWorkerInfos(&m);
 #endif
 
-  BindNewIR(&m);
+  BindPIR(&m);
   BindVjp(&m);
 }
 }  // namespace pybind
diff --git a/python/paddle/_C_ops.py b/python/paddle/_C_ops.py
index 987a38d36b9a6..194de126cdc89 100644
--- a/python/paddle/_C_ops.py
+++ b/python/paddle/_C_ops.py
@@ -21,7 +21,7 @@
     globals()[name] = getattr(core.eager.ops, name)
     __all__.append(name)
 
-for name in dir(core.ir.ops):
-    globals()[name] = getattr(core.ir.ops, name)
+for name in dir(core.pir.ops):
+    globals()[name] = getattr(core.pir.ops, name)
     if name not in __all__:
         __all__.append(name)
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 0c168b44c4a64..f0800fbaf44c6 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -464,7 +464,7 @@
 from . import linalg  # noqa: F401
 from . import fft  # noqa: F401
 from . import signal  # noqa: F401
-from . import _ir_ops  # noqa: F401
+from . import _pir_ops  # noqa: F401
 
 import paddle.text  # noqa: F401
 import paddle.vision  # noqa: F401
diff --git a/python/paddle/_ir_ops.py b/python/paddle/_pir_ops.py
similarity index 89%
rename from python/paddle/_ir_ops.py
rename to python/paddle/_pir_ops.py
index b29784505d1f2..b48d89b369e38 100644
--- a/python/paddle/_ir_ops.py
+++ b/python/paddle/_pir_ops.py
@@ -16,6 +16,6 @@
 
 __all__ = []
 
-for name in dir(core.ir.ops):
-    globals()[name] = getattr(core.ir.ops, name)
+for name in dir(core.pir.ops):
+    globals()[name] = getattr(core.pir.ops, name)
     __all__.append(name)
diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py
index bb73c1d670cee..97a315c101056 100644
--- a/python/paddle/autograd/ir_backward.py
+++ b/python/paddle/autograd/ir_backward.py
@@ -15,7 +15,7 @@
 import collections
 from collections.abc import Sequence
 
-import paddle.ir
+import paddle.pir
 from paddle.autograd.backward_utils import State
 
 """
@@ -158,7 +158,7 @@ def some_in_set(value_list, value_set):
     def operand2value(values):
         value_set = set()
         for item in values:
-            if isinstance(item, paddle.ir.OpOperand):
+            if isinstance(item, paddle.pir.OpOperand):
                 value_set.add(item.source())
             else:
                 value_set.add(item)
@@ -747,26 +747,26 @@ def grad(
     check_type(
         outputs,
         'outputs',
-        ((paddle.ir.Value, paddle.ir.OpResult), list, tuple),
+        ((paddle.pir.Value, paddle.pir.OpResult), list, tuple),
         'paddle.autograd.ir_backward.grad',
     )
     check_type(
         inputs,
         'inputs',
-        ((paddle.ir.Value, paddle.ir.OpResult), list, tuple),
+        ((paddle.pir.Value, paddle.pir.OpResult), list, tuple),
         'paddle.autograd.ir_backward.grad',
     )
     check_type(
         grad_outputs,
         'grad_outputs',
-        ((paddle.ir.Value, paddle.ir.OpResult), list, tuple, type(None)),
+        ((paddle.pir.Value, paddle.pir.OpResult), list, tuple, type(None)),
         'paddle.autograd.ir_backward.grad',
     )
 
     check_type(
         no_grad_vars,
         'no_grad_vars',
-        ((paddle.ir.Value, paddle.ir.OpResult), list, tuple, set, type(None)),
+        ((paddle.pir.Value, paddle.pir.OpResult), list, tuple, set, type(None)),
         'paddle.autograd.ir_backward.grad',
     )
     outputs = _as_list(outputs)
diff --git a/python/paddle/base/data_feeder.py b/python/paddle/base/data_feeder.py
index 52ed983ffa729..a641e8be1275f 100644
--- a/python/paddle/base/data_feeder.py
+++ b/python/paddle/base/data_feeder.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 
-from ..ir import OpResult
+from ..pir import OpResult
 from . import core
 from .framework import (
     Variable,
diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py
index 9fd421f54cb4c..17f119d2e8532 100755
--- a/python/paddle/base/executor.py
+++ b/python/paddle/base/executor.py
@@ -21,7 +21,7 @@
 
 import numpy as np
 
-from ..ir import OpResult
+from ..pir import OpResult
 from . import compiler, core, framework, get_flags, set_flags, unique_name
 from .data_feeder import convert_dtype
 from .framework import (
@@ -511,7 +511,7 @@ def _add_pir_fetch_ops(program, fetch_list, fetch_var_name):
                 assert isinstance(
                     fetch_input, OpResult
                 ), "Wrong type for fetch_list[%s]: %s" % (i, type(fetch_input))
-                paddle._ir_ops.fetch(fetch_input, fetch_var_name + str(i), i)
+                paddle._pir_ops.fetch(fetch_input, fetch_var_name + str(i), i)
 
 
 def _merge_tensors(tensor, micro_batch_num):
@@ -1246,7 +1246,7 @@ def _pir_feed_data(self, program, feed, scope):
                 pir_check_feed_shape_type(
                     cur_feed, feed_target_name, var_shape, var_type
                 )
-                # the last arg of set_feed_variable has no effect in new ir, we pass 0 by default.
+                # the last arg of set_feed_variable has no effect in pir, we pass 0 by default.
                 core.set_feed_variable(scope, cur_feed, feed_target_name, 0)
             else:
                 break
@@ -1869,8 +1869,8 @@ def _run_pir_impl(
     ):
         import paddle
 
-        Program = paddle.ir.Program
-        default_main_program = paddle.ir.core.default_main_program
+        Program = paddle.pir.Program
+        default_main_program = paddle.pir.core.default_main_program
 
         if self._closed:
             raise RuntimeError("Attempted to use a closed Executor")
diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py
index 3c99297c20875..12fd05ca8d8c7 100644
--- a/python/paddle/base/framework.py
+++ b/python/paddle/base/framework.py
@@ -32,7 +32,7 @@
 
 from . import core
 from . import unique_name
-from .. import ir
+from .. import pir
 from paddle.base.libpaddle import DataType
 import paddle.version as fluid_version
 import warnings
@@ -294,10 +294,10 @@ def in_dygraph_mode():
 def in_pir_mode():
     """
 
-    This API checks whether paddle runs in static graph mode and use new ir api.
+    This API checks whether paddle runs in static graph mode and use pir api.
 
     Returns:
-        bool: Whether paddle runs in static graph mode and use new ir api.
+        bool: Whether paddle runs in static graph mode and use pir api.
 
     Examples:
         .. code-block:: python
@@ -323,10 +323,10 @@ def use_pir_api():
 def in_dynamic_or_pir_mode():
     """
 
-    This API checks whether paddle runs in dynamic graph or new ir mode.
+    This API checks whether paddle runs in dynamic graph or pir mode.
 
     Returns:
-        bool: Whether paddle runs in static graph mode and use new ir api.
+        bool: Whether paddle runs in static graph mode and use pir api.
 
     Examples:
         .. code-block:: python
@@ -1162,7 +1162,7 @@ def convert_np_dtype_to_dtype_(np_dtype):
 
     """
     if in_pir_mode():
-        return ir.core.convert_np_dtype_to_dtype_(np_dtype)
+        return pir.core.convert_np_dtype_to_dtype_(np_dtype)
 
     # Convert the data type string to numpy data type.
     if isinstance(np_dtype, str) and np_dtype == "bfloat16":
diff --git a/python/paddle/base/layer_helper_base.py b/python/paddle/base/layer_helper_base.py
index 51680a1abbc4e..87e90787931d4 100644
--- a/python/paddle/base/layer_helper_base.py
+++ b/python/paddle/base/layer_helper_base.py
@@ -434,7 +434,7 @@ def create_parameter(
             )
         else:
             if in_pir_mode():
-                return paddle.ir.core.create_parameter(
+                return paddle.pir.core.create_parameter(
                     dtype=dtype,
                     shape=shape,
                     **attr._to_kwargs(with_initializer=True),
diff --git a/python/paddle/decomposition/decomp.py b/python/paddle/decomposition/decomp.py
index 9ae9178f45268..57e641e9c36a8 100644
--- a/python/paddle/decomposition/decomp.py
+++ b/python/paddle/decomposition/decomp.py
@@ -15,15 +15,15 @@
 import logging
 import typing
 
-from paddle import ir
-from paddle.base.libpaddle.ir import Block, Program
+from paddle import pir
+from paddle.base.libpaddle.pir import Block, Program
 from paddle.framework import core
 
 from . import register
 
 
 def _build_tensor_tuple(xs):
-    if isinstance(xs, ir.OpResult):
+    if isinstance(xs, pir.OpResult):
         return (xs,)
     elif isinstance(xs, typing.Sequence):
         return tuple(xs)
@@ -157,12 +157,12 @@ def decompose(
     dst_vars = [None] * len(src_vars)
     dst_vars_dct = {}
     for idx, item in enumerate(src_vars):
-        if not isinstance(item, ir.OpResult):
+        if not isinstance(item, pir.OpResult):
             raise TypeError(
                 f"Each var in dst_vars should map corresponding var in src_vars, but got type {type(item)} in {src_vars}."
             )
         dst_vars_dct[item] = idx
-    with ir.core.program_guard(program):
+    with pir.core.program_guard(program):
         _decompose_subgraph(
             block,
             dst_vars_dct,
@@ -170,7 +170,7 @@ def decompose(
             op_filter,
         )
     for idx, item in enumerate(dst_vars):
-        if not isinstance(item, ir.OpResult):
+        if not isinstance(item, pir.OpResult):
             if item is None:
                 dst_vars[idx] = src_vars[idx]
             else:
@@ -206,7 +206,7 @@ def _decompose_subgraph(block, orig_vars, dst_vars, op_filter):
             if lower:
                 core.prim_config["composite_ops_record"].add(op_name)
                 input_args = _prepare_python_api_arguments(op)
-                ir.set_insertion_point(op)
+                pir.set_insertion_point(op)
                 orig_outs = op.results()
                 new_outs = _build_tensor_tuple(decom_rule(*input_args))
 
diff --git a/python/paddle/decomposition/rules.py b/python/paddle/decomposition/rules.py
index 26a4ae73debd0..173e9fe87f23f 100644
--- a/python/paddle/decomposition/rules.py
+++ b/python/paddle/decomposition/rules.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle import _ir_ops
+from paddle import _pir_ops
 
 from .primitives import *  # noqa: F403
 from .register import register_decomp
@@ -60,7 +60,7 @@ def gelu_composite(x, approximate):
     else:
         # gelu(x) = 0.5 * x *  (1 + erf(x / sqrt(2)))
 
-        cdf = half * (one + _ir_ops.erf(x * full(x.shape, M_SQRT1_2, x.dtype)))
+        cdf = half * (one + _pir_ops.erf(x * full(x.shape, M_SQRT1_2, x.dtype)))
         out = x * cdf
         return out
 
diff --git a/python/paddle/incubate/passes/ir.py b/python/paddle/incubate/passes/ir.py
index 1b960855ef46c..f46cd9851c9de 100644
--- a/python/paddle/incubate/passes/ir.py
+++ b/python/paddle/incubate/passes/ir.py
@@ -459,7 +459,7 @@ def RegisterPass(function=None, input_specs={}):
         .. code-block:: python
 
             >>> import paddle
-            >>> from paddle.base.ir import RegisterPass
+            >>> from paddle.incubate.passes.ir import RegisterPass
 
             >>> @RegisterPass
             >>> def multi_add_to_addn():
diff --git a/python/paddle/jit/dy2static/function_spec.py b/python/paddle/jit/dy2static/function_spec.py
index ec835ee6e9540..f302dd4e2c6ca 100644
--- a/python/paddle/jit/dy2static/function_spec.py
+++ b/python/paddle/jit/dy2static/function_spec.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 import paddle
-import paddle.ir.core as ir_static
+import paddle.pir.core as ir_static
 from paddle.base import core
 from paddle.base.data_feeder import convert_dtype
 from paddle.base.dygraph.base import switch_to_static_graph
diff --git a/python/paddle/jit/dy2static/newir_partial_program.py b/python/paddle/jit/dy2static/newir_partial_program.py
index c0da8f35c822a..ba73811e68ee9 100644
--- a/python/paddle/jit/dy2static/newir_partial_program.py
+++ b/python/paddle/jit/dy2static/newir_partial_program.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 import paddle
-import paddle.ir.core as ir_static
+import paddle.pir.core as ir_static
 from paddle import _legacy_C_ops
 from paddle.amp.auto_cast import _in_amp_guard, _in_pure_fp16_guard
 from paddle.autograd.ir_backward import grad
@@ -27,7 +27,7 @@
 from paddle.base.data_feeder import check_type, convert_dtype
 from paddle.base.dygraph.base import switch_to_static_graph
 from paddle.base.framework import _apply_pass
-from paddle.base.libpaddle.ir import OpResult, fake_op_result
+from paddle.base.libpaddle.pir import OpResult, fake_op_result
 from paddle.framework import use_pir_api
 from paddle.optimizer.lr import LRScheduler
 
@@ -823,7 +823,7 @@ def _get_forward_backward_program_form(
         (
             forward_program,
             backward_program,
-        ), program_attr = paddle.base.libpaddle.ir.program_split(
+        ), program_attr = paddle.base.libpaddle.pir.program_split(
             whole_program,
             forward_inputs,
             forward_outputs,
@@ -1140,7 +1140,7 @@ def partial_program_from(concrete_program, from_method=False):
 def add_build_strategy_for(
     program, start_op_index, end_op_index, build_strategy=None, skip_vars=None
 ):
-    paddle.base.libpaddle.ir.program_split(
+    paddle.base.libpaddle.pir.program_split(
         program,
     )
     if start_op_index < end_op_index:
diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py
index 8eb118852a764..5f8756ce7f150 100644
--- a/python/paddle/jit/dy2static/program_translator.py
+++ b/python/paddle/jit/dy2static/program_translator.py
@@ -20,7 +20,7 @@
 import warnings
 import weakref
 
-import paddle.ir.core as ir_static
+import paddle.pir.core as ir_static
 from paddle import decomposition
 from paddle.base import core, framework
 from paddle.base.data_feeder import check_type
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index c9bc9cb0ebebc..dcb0dfdded2b5 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -759,7 +759,7 @@ def relu(x, name=None):
     else:
         if paddle.framework.in_dynamic_or_pir_mode():
             # Below code will be removed after we can generate IR api automatically
-            return paddle._ir_ops.relu(x)
+            return paddle._pir_ops.relu(x)
 
         check_variable_and_dtype(
             x, 'x', ['float16', 'uint16', 'float32', 'float64'], 'relu'
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index de606db3c39d1..ea327e17e838d 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -15,7 +15,7 @@
 import numpy
 
 import paddle
-from paddle import _C_ops, ir
+from paddle import _C_ops, pir
 from paddle.base.layer_helper import LayerHelper
 from paddle.common_ops_import import Variable, default_main_program
 from paddle.framework import (
@@ -1099,7 +1099,7 @@ def dropout(
             [[0., 0., 6.],
              [0., 0., 0.]])
     """
-    if not isinstance(p, (float, int, Variable, ir.OpResult)):
+    if not isinstance(p, (float, int, Variable, pir.OpResult)):
         raise TypeError("p argument should be a number or Variable")
 
     if isinstance(p, (int, float)):
@@ -1944,9 +1944,9 @@ def linear(x, weight, bias=None, name=None):
         return _C_ops.linear(x, weight, bias)
 
     elif in_pir_mode():
-        out = paddle._ir_ops.matmul(x, weight, False, False)
+        out = paddle._pir_ops.matmul(x, weight, False, False)
         if bias is not None:
-            return paddle._ir_ops.add(out, bias)
+            return paddle._pir_ops.add(out, bias)
         else:
             return out
     else:
diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py
index b4e9ee1df266a..5391142d503c3 100644
--- a/python/paddle/nn/initializer/constant.py
+++ b/python/paddle/nn/initializer/constant.py
@@ -61,11 +61,11 @@ def forward(self, var, block=None):
             (
                 framework.Variable,
                 framework.EagerParamBase,
-                paddle.ir.OpResult,
-                paddle.ir.core.ParameterMeta,
+                paddle.pir.OpResult,
+                paddle.pir.core.ParameterMeta,
             ),
         )
-        assert isinstance(block, (framework.Block, paddle.ir.Block))
+        assert isinstance(block, (framework.Block, paddle.pir.Block))
 
         if in_dynamic_or_pir_mode():
             place = _current_expected_place()
diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py
index 40eb6a874c9da..97ff5312f7deb 100644
--- a/python/paddle/nn/initializer/xavier.py
+++ b/python/paddle/nn/initializer/xavier.py
@@ -87,8 +87,8 @@ def forward(self, var, block=None):
         import paddle
 
         block = self._check_block(block)
-        assert isinstance(block, (framework.Block, paddle.ir.Block))
-        if not isinstance(var, paddle.ir.core.ParameterMeta):
+        assert isinstance(block, (framework.Block, paddle.pir.Block))
+        if not isinstance(var, paddle.pir.core.ParameterMeta):
             check_variable_and_dtype(
                 var,
                 "Out",
@@ -153,7 +153,7 @@ def forward(self, var, block=None):
         elif in_pir_mode():
             if self._uniform:
                 limit = math.sqrt(6.0 / float(fan_in + fan_out))
-                return paddle._ir_ops.uniform(
+                return paddle._pir_ops.uniform(
                     var.shape,
                     var.dtype,
                     -limit,
diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py
index 9f4d7d037cf7f..204023378b5d3 100644
--- a/python/paddle/nn/layer/layers.py
+++ b/python/paddle/nn/layer/layers.py
@@ -1591,7 +1591,7 @@ def _remove_if_exist(*dicts):
 
             _remove_if_exist(self.__dict__, self._buffers, self._sub_layers)
             params[name] = value
-        elif isinstance(value, paddle.ir.OpResult) and value.is_persistable:
+        elif isinstance(value, paddle.pir.OpResult) and value.is_persistable:
             if params is None:
                 raise ValueError("super().__init__() should be called first")
             _remove_if_exist(self.__dict__, self._buffers, self._sub_layers)
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index cc758a4d2159d..d529b605d8447 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -468,19 +468,19 @@ def do_create():
                 # only create global lr_var once
                 lr = self._global_learning_rate()
                 if in_pir_mode():
-                    if isinstance(lr, paddle.ir.OpResult):
+                    if isinstance(lr, paddle.pir.OpResult):
                         return
                     else:
                         place = _current_expected_place()
                         if not isinstance(_lr_dtype, paddle.base.core.DataType):
                             lr_dtype = (
-                                paddle.ir.core.convert_np_dtype_to_dtype_(
+                                paddle.pir.core.convert_np_dtype_to_dtype_(
                                     _lr_dtype
                                 )
                             )
                         self._learning_rate_map[
                             framework.default_main_program()
-                        ] = paddle._ir_ops.full(
+                        ] = paddle._pir_ops.full(
                             [],
                             self._learning_rate,
                             _lr_dtype,
@@ -1640,7 +1640,7 @@ def minimize(
 
         """
         assert isinstance(
-            loss, (Variable, paddle.ir.OpResult)
+            loss, (Variable, paddle.pir.OpResult)
         ), "The loss should be an Tensor."
 
         parameter_list = parameters if parameters else self._parameter_list
diff --git a/python/paddle/ir/__init__.py b/python/paddle/pir/__init__.py
similarity index 92%
rename from python/paddle/ir/__init__.py
rename to python/paddle/pir/__init__.py
index df98fd79fe2a3..07588983d64e4 100644
--- a/python/paddle/ir/__init__.py
+++ b/python/paddle/pir/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.base.libpaddle.ir import (
+from paddle.base.libpaddle.pir import (
     Program,
     Block,
     Operation,
@@ -21,7 +21,7 @@
     OpResult,
     Type,
 )  # noqa: F401
-from paddle.base.libpaddle.ir import (
+from paddle.base.libpaddle.pir import (
     translate_to_new_ir,
     set_global_program,
     set_insertion_point,
diff --git a/python/paddle/ir/core.py b/python/paddle/pir/core.py
similarity index 98%
rename from python/paddle/ir/core.py
rename to python/paddle/pir/core.py
index 908319458ed39..51a661186cf4f 100644
--- a/python/paddle/ir/core.py
+++ b/python/paddle/pir/core.py
@@ -17,9 +17,9 @@
 
 from paddle.base.core import VarDesc
 from paddle.base.libpaddle import DataType
-from paddle.base.libpaddle.ir import Program, set_global_program
+from paddle.base.libpaddle.pir import Program, set_global_program
 
-from .._ir_ops import get_parameter, set_parameter
+from .._pir_ops import get_parameter, set_parameter
 from ..base import unique_name
 from ..base.wrapped_decorator import signature_safe_contextmanager
 
diff --git a/python/paddle/pir_utils.py b/python/paddle/pir_utils.py
index 9af825cfcd88b..28d261b0155fc 100644
--- a/python/paddle/pir_utils.py
+++ b/python/paddle/pir_utils.py
@@ -63,16 +63,16 @@ def _switch_to_pir(self):
             paddle.framework.set_flags(
                 {"FLAGS_enable_new_ir_in_executor": True}
             )
-            paddle.ir.register_paddle_dialect()
-            paddle.static.Program = paddle.ir.Program
-            paddle.base.Program = paddle.ir.Program
-            paddle.base.program_guard = paddle.ir.core.program_guard
-            paddle.static.program_guard = paddle.ir.core.program_guard
+            paddle.pir.register_paddle_dialect()
+            paddle.static.Program = paddle.pir.Program
+            paddle.base.Program = paddle.pir.Program
+            paddle.base.program_guard = paddle.pir.core.program_guard
+            paddle.static.program_guard = paddle.pir.core.program_guard
             paddle.static.default_main_program = (
-                paddle.ir.core.default_main_program
+                paddle.pir.core.default_main_program
             )
             paddle.static.default_startup_program = (
-                paddle.ir.core.default_startup_program
+                paddle.pir.core.default_startup_program
             )
 
     def _switch_to_old_ir(self):
diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py
index 15694fe34f147..412defb7d6ea2 100644
--- a/python/paddle/static/input.py
+++ b/python/paddle/static/input.py
@@ -104,13 +104,13 @@ def data(name, shape, dtype=None, lod_level=0):
     """
 
     def _reset_data_op_insertion_point():
-        default_main_program = paddle.ir.core.default_main_program()
+        default_main_program = paddle.pir.core.default_main_program()
         ops = default_main_program.global_block().ops
         if len(ops) == 0:
             return
         for op in ops:
             if op.name() != 'pd_op.data':
-                paddle.ir.set_insertion_point(op)
+                paddle.pir.set_insertion_point(op)
                 return
 
     helper = LayerHelper('data', **locals())
@@ -126,10 +126,10 @@ def _reset_data_op_insertion_point():
         dtype = paddle.get_default_dtype()
 
     if in_pir_mode():
-        ir_dtype = paddle.ir.core.convert_np_dtype_to_dtype_(dtype)
+        ir_dtype = paddle.pir.core.convert_np_dtype_to_dtype_(dtype)
         _reset_data_op_insertion_point()
-        out = paddle._ir_ops.data(name, shape, ir_dtype, core.Place())
-        paddle.ir.reset_insertion_point_to_end()
+        out = paddle._pir_ops.data(name, shape, ir_dtype, core.Place())
+        paddle.pir.reset_insertion_point_to_end()
         return out
 
     out = helper.create_global_variable(
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 777a47968e591..7fc8dc49b8c3a 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -1125,7 +1125,7 @@ def concat(x, axis=0, name=None):
     if in_dynamic_or_pir_mode():
         if isinstance(axis, Variable):
             axis = axis.item(0)
-        if not isinstance(input, (Variable, paddle.ir.Value)):
+        if not isinstance(input, (Variable, paddle.pir.Value)):
             input = [t for t in input if t.shape.count(0) == 0]
         return _C_ops.concat(input, axis)
     else:
@@ -3586,7 +3586,7 @@ def reshape(x, shape, name=None):
             for ele in shape:
                 if isinstance(ele, core.eager.Tensor):
                     new_shape.append(ele.item())
-                elif isinstance(ele, paddle.ir.OpResult):
+                elif isinstance(ele, paddle.pir.OpResult):
                     new_shape.append(-1)
                 else:
                     new_shape.append(ele)
@@ -3595,7 +3595,7 @@ def reshape(x, shape, name=None):
                 out = x
             else:
                 out = _C_ops.reshape(x, new_shape)
-        elif isinstance(shape, (core.eager.Tensor, paddle.ir.OpResult)):
+        elif isinstance(shape, (core.eager.Tensor, paddle.pir.OpResult)):
             shape.stop_gradient = True
             out = _C_ops.reshape(x, shape)
         else:
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index d772c890fbb8c..fd11afc85935c 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -270,7 +270,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None):
     elif in_pir_mode():
         if act is None:
             return _C_ops.scale(x, scale, float(bias), bias_after_scale)
-        raise ValueError("act is not implement in new ir of scale api.")
+        raise ValueError("act is not implement in pir of scale api.")
     else:
         check_variable_and_dtype(
             x,
@@ -503,7 +503,7 @@ def pow(x, y, name=None):
     if in_dynamic_or_pir_mode():
         if isinstance(y, (int, float)):
             return _C_ops.pow(x, y)
-        elif isinstance(y, (paddle.Tensor, Variable, paddle.ir.OpResult)):
+        elif isinstance(y, (paddle.Tensor, Variable, paddle.pir.OpResult)):
             return _C_ops.elementwise_pow(x, y)
         else:
             raise TypeError(
@@ -691,7 +691,7 @@ def add(x, y, name=None):
         return _C_ops.add(x, y)
     else:
         if in_pir_mode():
-            return paddle._ir_ops.add(x, y)
+            return paddle._pir_ops.add(x, y)
         return _elementwise_op(LayerHelper('elementwise_add', **locals()))
 
 
diff --git a/python/setup.py.in b/python/setup.py.in
index a90efeabbdccc..39d256306bf9a 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -509,7 +509,7 @@ packages=['paddle',
           'paddle.geometric',
           'paddle.geometric.message_passing',
           'paddle.geometric.sampling',
-          'paddle.ir',
+          'paddle.pir',
           'paddle.decomposition',
           ]
 
@@ -728,7 +728,7 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
                 # change rpath of phi.ext for loading 3rd party libb
                 commands.append("install_name_tool -add_rpath '@loader_path' ${PADDLE_BINARY_DIR}/python/paddle/libs/${PHI_NAME}")
             if('${WITH_SHARED_IR}' == 'ON'):
-                # change rpath of ir.ext for loading 3rd party libb
+                # change rpath of pir.ext for loading 3rd party libb
                 commands.append("install_name_tool -add_rpath '@loader_path' ${PADDLE_BINARY_DIR}/python/paddle/libs/${IR_NAME}")
         else:
             commands = ["patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/base/${FLUID_CORE_NAME}" + '.so']
@@ -736,7 +736,7 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
                 # change rpath of phi.ext for loading 3rd party lib
                 commands.append("patchelf --set-rpath '$ORIGIN' ${PADDLE_BINARY_DIR}/python/paddle/libs/${PHI_NAME}")
             if('${WITH_SHARED_IR}' == 'ON'):
-                # change rpath of ir.ext for loading 3rd party lib
+                # change rpath of pir.ext for loading 3rd party lib
                 commands.append("patchelf --set-rpath '$ORIGIN' ${PADDLE_BINARY_DIR}/python/paddle/libs/${IR_NAME}")
         # The sw_64 not suppot patchelf, so we just disable that.
         if platform.machine() != 'sw_64' and platform.machine() != 'mips64':
diff --git a/setup.py b/setup.py
index 12cd54428d2e4..a507f3271a9d6 100644
--- a/setup.py
+++ b/setup.py
@@ -1504,7 +1504,7 @@ def get_setup_parameters():
         'paddle.geometric',
         'paddle.geometric.message_passing',
         'paddle.geometric.sampling',
-        'paddle.ir',
+        'paddle.pir',
         'paddle.decomposition',
     ]
 
diff --git a/test/ir/new_ir/test_build_op.py b/test/ir/new_ir/test_build_op.py
index 72eec04749703..68a6ce35c2bb7 100644
--- a/test/ir/new_ir/test_build_op.py
+++ b/test/ir/new_ir/test_build_op.py
@@ -15,7 +15,7 @@
 import unittest
 
 import paddle
-from paddle import ir
+from paddle import pir
 
 paddle.enable_static()
 
@@ -33,7 +33,7 @@ def get_ir_program():
         y_s = paddle.matmul(x_s, x_s)
         y_s = paddle.add(x_s, y_s)
         y_s = paddle.tanh(y_s)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
@@ -41,7 +41,7 @@ class TestBuildOp(unittest.TestCase):
     def test_build_mean_op(self):
         newir_program = get_ir_program()
         tanh_out = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             out = paddle.mean(tanh_out)
@@ -60,7 +60,7 @@ class TestBuildOp2(unittest.TestCase):
     def test_build_add_n_op(self):
         newir_program = get_ir_program()
         tanh_out = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             out1 = paddle.mean(tanh_out)
@@ -86,8 +86,8 @@ def test_insertion_point(self):
             add_out = add_op.result(0)
             tanh_operand = tanh_op.operands()[0]
 
-            with paddle.ir.core.program_guard(newir_program):
-                ir.set_insertion_point(tanh_op)
+            with paddle.pir.core.program_guard(newir_program):
+                pir.set_insertion_point(tanh_op)
                 full_out = paddle.tensor.fill_constant(
                     shape=[4, 4], dtype="float", value=2
                 )
@@ -106,7 +106,7 @@ class TestBuildOp4(unittest.TestCase):
     def test_build_concat_op(self):
         newir_program = get_ir_program()
         tanh_out = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             out = paddle.concat([tanh_out, tanh_out], 0)
@@ -125,7 +125,7 @@ class TestBuildOp5(unittest.TestCase):
     def test_build_split_op(self):
         newir_program = get_ir_program()
         tanh_out = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             out = paddle.split(tanh_out, [2, 2], 0)
diff --git a/test/ir/new_ir/test_ir_backward.py b/test/ir/new_ir/test_ir_backward.py
index c604290d34cad..e6994d1b0dbef 100644
--- a/test/ir/new_ir/test_ir_backward.py
+++ b/test/ir/new_ir/test_ir_backward.py
@@ -15,7 +15,7 @@
 import unittest
 
 import paddle
-from paddle import ir
+from paddle import pir
 from paddle.autograd.ir_backward import grad
 
 paddle.enable_static()
@@ -32,7 +32,7 @@ def get_ir_program_0():
         x_s = paddle.static.data('x', [4, 4], x.dtype)
         x_s.stop_gradient = False
         k_s = paddle.tanh(x_s)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
@@ -44,7 +44,7 @@ def test_grad(self):
         newir_program = get_ir_program_0()
         input = newir_program.global_block().ops[-1].operand(0).source()
         tanh_out = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             out = paddle.mean(tanh_out)
@@ -69,7 +69,7 @@ def test_full(self):
         newir_program = get_ir_program_0()
         input = newir_program.global_block().ops[-1].operand(0).source()
         tanh_out = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             out = paddle.mean(tanh_out)
@@ -95,7 +95,7 @@ def test_no_grad_set(self):
         newir_program = get_ir_program_0()
         input = newir_program.global_block().ops[-1].operand(0).source()
         tanh_out = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             out = paddle.mean(tanh_out)
@@ -109,7 +109,7 @@ def test_split(self):
         newir_program = get_ir_program_0()
         input = newir_program.global_block().ops[-1].operand(0).source()
         tanh_out = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             out = paddle.split(tanh_out, [2, 2], 0)
@@ -149,7 +149,7 @@ def get_ir_program_1():
         k_s = paddle.tanh(x_s)
         z_x = paddle.tanh(x_s)
         out = paddle.add(z_x, k_s)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
@@ -162,7 +162,7 @@ def test_add_n(self):
         input_x = newir_program.global_block().ops[-3].operand(0).source()
 
         add_out = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             out = paddle.mean(add_out)
@@ -183,7 +183,7 @@ def test_concat(self):
         input_x = newir_program.global_block().ops[-3].operand(0).source()
 
         add_out = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             out = paddle.concat([add_out, add_out])
@@ -225,7 +225,7 @@ def get_ir_program_2():
         x_s = paddle.static.data('x', [4, 4], x.dtype)
         x_s.stop_gradient = False
         k_s = paddle.sum(x_s, axis=(-1,), keepdim=False)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
@@ -237,7 +237,7 @@ def test_basic_network(self):
         newir_program = get_ir_program_2()
         x = newir_program.global_block().ops[-1].operand(0).source()
         sum_x = newir_program.global_block().ops[-1].result(0)
-        with paddle.pir_utils.IrGuard(), paddle.ir.core.program_guard(
+        with paddle.pir_utils.IrGuard(), paddle.pir.core.program_guard(
             newir_program
         ):
             norm = paddle.tensor.fill_constant(
diff --git a/test/ir/new_ir/test_ir_pybind.py b/test/ir/new_ir/test_ir_pybind.py
index b9a6fb92ac548..2aaad209d36f7 100644
--- a/test/ir/new_ir/test_ir_pybind.py
+++ b/test/ir/new_ir/test_ir_pybind.py
@@ -15,7 +15,7 @@
 import unittest
 
 import paddle
-from paddle import ir
+from paddle import pir
 
 paddle.enable_static()
 
@@ -32,7 +32,7 @@ def get_ir_program():
         y_s = paddle.matmul(x_s, x_s)
         z_s = paddle.add(y_s, y_s)
         k_s = paddle.tanh(z_s)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
@@ -52,7 +52,7 @@ def test_block(self):
         ops = block.ops
         self.assertEqual(
             len(ops), 4
-        )  # ir program add "builtin.get_parameter" by default, so size is 4
+        )  # pir program add "builtin.get_parameter" by default, so size is 4
         block.remove_op(ops[3])
         self.assertEqual(len(block.ops), 3)
 
@@ -159,7 +159,7 @@ def test_attr(self):
                 shape=[4, 4], dtype="float32", value=2
             )
 
-        newir_program = ir.translate_to_new_ir(main_program.desc)
+        newir_program = pir.translate_to_new_ir(main_program.desc)
         print(newir_program)
         conv_attr = newir_program.global_block().ops[3].attrs()
         full_attr = newir_program.global_block().ops[8].attrs()
diff --git a/test/ir/new_ir/test_ir_vjp.py b/test/ir/new_ir/test_ir_vjp.py
index e3d2fc4d1446a..d0e630fccff72 100644
--- a/test/ir/new_ir/test_ir_vjp.py
+++ b/test/ir/new_ir/test_ir_vjp.py
@@ -15,7 +15,7 @@
 import unittest
 
 import paddle
-from paddle import ir
+from paddle import pir
 from paddle.base.core import call_vjp, has_vjp
 
 paddle.enable_static()
@@ -31,7 +31,7 @@ def get_ir_program():
         x.stop_gradient = False
         paddle.tanh(x)
         paddle.tensor.fill_constant(shape=[4, 4], dtype='float32', value=2.0)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
@@ -42,7 +42,7 @@ def test_tanh_vjp1(self):
         fill_constant_op = newir_program.global_block().ops[-1]
         out_grads = [[fill_constant_op.result(0)]]
         stop_gradients = [[False]]
-        with paddle.ir.core.program_guard(newir_program):
+        with paddle.pir.core.program_guard(newir_program):
             grad_outs = call_vjp(tanh_op, out_grads, stop_gradients)
         self.assertEqual(
             grad_outs[0][0].get_defining_op().name(), "pd_op.tanh_grad"
@@ -73,7 +73,7 @@ def test_tanh_vjp2(self):
         fill_constant_op = newir_program.global_block().ops[-1]
         out_grads = [[fill_constant_op.result(0)]]
         stop_gradients = [[True]]
-        with paddle.ir.core.program_guard(newir_program):
+        with paddle.pir.core.program_guard(newir_program):
             grad_outs = call_vjp(tanh_op, out_grads, stop_gradients)
         self.assertEqual(grad_outs[0][0], None)
 
@@ -89,12 +89,12 @@ def test_mean_vjp1(self):
             x.stop_gradient = False
             paddle.mean(x, axis=[0, 1])
             paddle.tensor.fill_constant(shape=[1], dtype='float32', value=2.0)
-        newir_program = ir.translate_to_new_ir(main_program.desc)
+        newir_program = pir.translate_to_new_ir(main_program.desc)
         fill_constant_op = newir_program.global_block().ops[-1]
         mean_op = newir_program.global_block().ops[-2]
         out_grads = [[fill_constant_op.result(0)]]
         stop_gradients = [[False]]
-        with paddle.ir.core.program_guard(newir_program):
+        with paddle.pir.core.program_guard(newir_program):
             grad_outs = call_vjp(mean_op, out_grads, stop_gradients)
             self.assertEqual(
                 grad_outs[0][0].get_defining_op().name(), "pd_op.mean_grad"
@@ -129,12 +129,12 @@ def test_mean_vjp2(self):
             x.stop_gradient = False
             paddle.mean(x, axis=[0, 1])
             paddle.tensor.fill_constant(shape=[1], dtype='float32', value=2.0)
-        newir_program = ir.translate_to_new_ir(main_program.desc)
+        newir_program = pir.translate_to_new_ir(main_program.desc)
         fill_constant_op = newir_program.global_block().ops[-1]
         mean_op = newir_program.global_block().ops[-2]
         out_grads = [[fill_constant_op.result(0)]]
         stop_gradients = [[True]]
-        with paddle.ir.core.program_guard(newir_program):
+        with paddle.pir.core.program_guard(newir_program):
             grad_outs = call_vjp(mean_op, out_grads, stop_gradients)
             self.assertEqual(grad_outs[0][0], None)
 
@@ -150,7 +150,7 @@ def test_has_vjp(self):
             x.stop_gradient = False
             paddle.mean(x, axis=[0, 1])
             paddle.tensor.fill_constant(shape=[1], dtype='float32', value=2.0)
-        newir_program = ir.translate_to_new_ir(main_program.desc)
+        newir_program = pir.translate_to_new_ir(main_program.desc)
         fill_constant_op = newir_program.global_block().ops[-1]
         mean_op = newir_program.global_block().ops[-2]
         self.assertEqual(has_vjp(fill_constant_op), False)
diff --git a/test/ir/new_ir/test_pass_manager.py b/test/ir/new_ir/test_pass_manager.py
index 32428627d8b9e..5849b0bbdfeff 100644
--- a/test/ir/new_ir/test_pass_manager.py
+++ b/test/ir/new_ir/test_pass_manager.py
@@ -15,7 +15,7 @@
 import unittest
 
 import paddle
-from paddle import ir
+from paddle import pir
 from paddle.base import core
 from paddle.framework import LayerHelper
 
@@ -45,11 +45,11 @@ def test_op(self):
                     attrs={"name": out.name},
                 )
 
-        new_program = ir.translate_to_new_ir(main_program.desc)
+        new_program = pir.translate_to_new_ir(main_program.desc)
         op_names = [op.name() for op in new_program.global_block().ops]
         # print(op_names)
         self.assertTrue('pd_op.uniform' in op_names)
-        pm = ir.PassManager()
+        pm = pir.PassManager()
         pm.add_pass(
             'dead_code_elimination'
         )  # apply pass to elimitate dead code
diff --git a/test/ir/new_ir/test_special_op_translator.py b/test/ir/new_ir/test_special_op_translator.py
index 24eadccf034bd..a2a17feb1275f 100644
--- a/test/ir/new_ir/test_special_op_translator.py
+++ b/test/ir/new_ir/test_special_op_translator.py
@@ -17,7 +17,7 @@
 import numpy as np
 
 import paddle
-from paddle import ir
+from paddle import pir
 from paddle.base import core
 from paddle.framework import LayerHelper
 
@@ -35,7 +35,7 @@ def test_op(self):
                 x = paddle.to_tensor([2, 3, 4], 'float64')
                 y = paddle.cast(x, 'uint8')
 
-        _ = ir.translate_to_new_ir(main_program.desc)
+        _ = pir.translate_to_new_ir(main_program.desc)
 
 
 class TestElementwiseOpTranscriber(unittest.TestCase):
@@ -115,7 +115,7 @@ def test_op(self):
                 )
                 output = embedding(x)
 
-        _ = ir.translate_to_new_ir(main_program.desc)
+        _ = pir.translate_to_new_ir(main_program.desc)
 
 
 class TestIncrementOpTranscriber(unittest.TestCase):
@@ -129,7 +129,7 @@ def test_op(self):
                 data = paddle.zeros(shape=[1], dtype='float32')
                 counter = paddle.increment(data)
 
-        _ = ir.translate_to_new_ir(main_program.desc)
+        _ = pir.translate_to_new_ir(main_program.desc)
 
 
 class TestAssignValueOpTranscriber(unittest.TestCase):
@@ -146,7 +146,7 @@ def test_op(self):
                     stop_gradient=False,
                 )
 
-        _ = ir.translate_to_new_ir(main_program.desc)
+        _ = pir.translate_to_new_ir(main_program.desc)
 
 
 class TestRnnOpTranscriber(unittest.TestCase):
@@ -163,7 +163,7 @@ def test_op(self):
                 cell = paddle.nn.SimpleRNNCell(16, 32)
                 y, h = cell(x, prev_h)
 
-        _ = ir.translate_to_new_ir(main_program.desc)
+        _ = pir.translate_to_new_ir(main_program.desc)
 
 
 class TestEmptyVarTranslate(unittest.TestCase):
@@ -185,7 +185,7 @@ def test_op(self):
                 out2 = paddle.mean(out1)
                 sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.1)
                 sgd_optimizer.minimize(out2)
-        _ = ir.translate_to_new_ir(main_program.desc)
+        _ = pir.translate_to_new_ir(main_program.desc)
 
 
 class TestOneHotOpTranscriber(unittest.TestCase):
@@ -204,7 +204,7 @@ def test_mutable_attribute(self):
                     x=label, num_classes=depth
                 )
 
-        _ = ir.translate_to_new_ir(main_program.desc)
+        _ = pir.translate_to_new_ir(main_program.desc)
 
     def test_normal_attribute(self):
         place = core.Place()
@@ -221,7 +221,7 @@ def test_normal_attribute(self):
                     x=label, num_classes=depth
                 )
 
-        _ = ir.translate_to_new_ir(main_program.desc)
+        _ = pir.translate_to_new_ir(main_program.desc)
 
 
 class TestReduceOpTranscriber(unittest.TestCase):
@@ -271,7 +271,7 @@ def test_op(self):
                 value = paddle.randn([2])
                 y = paddle.index_put(x, indices, value, False)
 
-        _ = ir.translate_to_new_ir(main_program.desc)
+        _ = pir.translate_to_new_ir(main_program.desc)
 
 
 class TestGradAddOpTranscriber(unittest.TestCase):
@@ -297,7 +297,7 @@ def test_op(self):
                     attrs={"axis": -1},
                 )
 
-        _ = ir.translate_to_new_ir(main_program.desc)
+        _ = pir.translate_to_new_ir(main_program.desc)
 
 
 class TestShadowOutputSlice(unittest.TestCase):
@@ -322,7 +322,7 @@ def test_op(self):
                     attrs={"name": out.name},
                 )
 
-        l = ir.translate_to_new_ir(main_program.desc)
+        l = pir.translate_to_new_ir(main_program.desc)
 
 
 class TestSetValueOp(unittest.TestCase):
@@ -418,7 +418,7 @@ def test_program(self):
             cell = paddle.nn.SimpleRNNCell(16, 32)
             y, h = cell(x, prev_h)
 
-        ops = ir.check_unregistered_ops(main_program.desc)
+        ops = pir.check_unregistered_ops(main_program.desc)
         assert len(ops) == 0
 
 
diff --git a/test/ir/new_ir/test_symbol_overload.py b/test/ir/new_ir/test_symbol_overload.py
index 160ba78b6582c..7847ac5508555 100644
--- a/test/ir/new_ir/test_symbol_overload.py
+++ b/test/ir/new_ir/test_symbol_overload.py
@@ -17,7 +17,7 @@
 import numpy as np
 
 import paddle
-from paddle import _ir_ops, nn
+from paddle import _pir_ops, nn
 from paddle.autograd.ir_backward import grad
 
 paddle.enable_static()
@@ -28,11 +28,11 @@ def __init__(self):
         super().__init__()
 
     def forward(self, x, y):
-        z1 = _ir_ops.add(x, y)
-        z2 = _ir_ops.multiply(x, y)
-        z3 = _ir_ops.subtract(z1, z2)
-        z4 = _ir_ops.scale(z3, -1, 0, True)
-        res = _ir_ops.divide(z3, z4)
+        z1 = _pir_ops.add(x, y)
+        z2 = _pir_ops.multiply(x, y)
+        z3 = _pir_ops.subtract(z1, z2)
+        z4 = _pir_ops.scale(z3, -1, 0, True)
+        res = _pir_ops.divide(z3, z4)
         return res
 
 
@@ -54,10 +54,10 @@ def __init__(self):
         super().__init__()
 
     def forward(self, x, y):
-        z1 = _ir_ops.less_equal(x, y)
-        z2 = _ir_ops.greater_equal(x, y)
-        z3 = _ir_ops.less_than(x, y)
-        z4 = _ir_ops.greater_than(x, y)
+        z1 = _pir_ops.less_equal(x, y)
+        z2 = _pir_ops.greater_equal(x, y)
+        z3 = _pir_ops.less_than(x, y)
+        z4 = _pir_ops.greater_than(x, y)
         return z1, z2, z3, z4
 
 
diff --git a/test/ir/test_op_input_grad_semantic.py b/test/ir/test_op_input_grad_semantic.py
index 24bf506461e54..ab1ca4f61d191 100644
--- a/test/ir/test_op_input_grad_semantic.py
+++ b/test/ir/test_op_input_grad_semantic.py
@@ -15,7 +15,7 @@
 import unittest
 
 import paddle
-from paddle import ir
+from paddle import pir
 
 paddle.enable_static()
 
@@ -32,7 +32,7 @@ def get_gather_program_new_ir():
         index = paddle.tensor.fill_constant(shape=[1], dtype='int32', value=1.0)
         axis = paddle.tensor.fill_constant(shape=[1], dtype='int32', value=2.0)
         out = paddle.gather(x, index, axis)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
@@ -49,7 +49,7 @@ def get_multiply_program_new_ir():
             shape=[3, 4], dtype='float32', value=3.0
         )
         out = paddle.multiply(x, y)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py
index 71e41b92e96dd..961820ff00b29 100644
--- a/test/legacy_test/op_test.py
+++ b/test/legacy_test/op_test.py
@@ -1348,7 +1348,7 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
                             else:
                                 fetch_list.append(var)
                     elif isinstance(
-                        ret_tuple, paddle.base.libpaddle.ir.OpResult
+                        ret_tuple, paddle.base.libpaddle.pir.OpResult
                     ):
                         fetch_list.append(ret_tuple)
                     else:
@@ -1411,7 +1411,7 @@ def _check_ir_output(self, place, program, feed_map, fetch_list, outs):
             )
             assert len(outs) == len(
                 ir_outs
-            ), "Fetch result should have same length when executed in new ir"
+            ), "Fetch result should have same length when executed in pir"
 
             check_method = np.testing.assert_array_equal
             if os.getenv("FLAGS_NEW_IR_OPTEST_RELAX_CHECK", None):
@@ -2307,7 +2307,7 @@ def _is_skip_name(self, name):
 
         class NewIRChecker(Checker):
             def init(self):
-                self.checker_name = "new ir checker"
+                self.checker_name = "pir checker"
 
             def calculate_output(self):
                 self.is_python_api_test = True
@@ -2406,8 +2406,8 @@ def find_imperative_expect(target_name, new_ir_outs, place):
                 )
 
             def find_actual_value(self, target_name):
-                with paddle.ir.core.program_guard(
-                    paddle.ir.core.default_main_program()
+                with paddle.pir.core.program_guard(
+                    paddle.pir.core.default_main_program()
                 ):
                     actual = find_imperative_actual(
                         target_name, self.outputs, place
@@ -2416,8 +2416,8 @@ def find_actual_value(self, target_name):
                     return actual, actual_t
 
             def find_expect_value(self, target_name):
-                with paddle.ir.core.program_guard(
-                    paddle.ir.core.default_main_program()
+                with paddle.pir.core.program_guard(
+                    paddle.pir.core.default_main_program()
                 ):
                     expect = find_imperative_expect(
                         target_name, self.ref_outputs, place
@@ -2427,7 +2427,7 @@ def find_expect_value(self, target_name):
 
             def _compare_list(self, name, actual, expect):
                 """if expect is a tuple, we need to compare list."""
-                with paddle.ir.core.program_guard(place=place):
+                with paddle.pir.core.program_guard(place=place):
                     self.op_test.assertListEqual(
                         actual.value()
                         .get_tensor()
@@ -3108,7 +3108,7 @@ def check_grad_with_place(
                     atol=atol,
                 )
 
-        # get new ir gradient
+        # get pir gradient
         if check_new_ir:
             if (
                 type(place) is paddle.base.libpaddle.CPUPlace
@@ -3593,7 +3593,7 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
                     cast_outputs = []
                     for cast_input in cast_inputs:
                         if isinstance(
-                            cast_input, paddle.base.libpaddle.ir.OpResult
+                            cast_input, paddle.base.libpaddle.pir.OpResult
                         ):
                             cast_outputs.append(
                                 paddle.cast(
diff --git a/test/legacy_test/test_fill_any_like_op.py b/test/legacy_test/test_fill_any_like_op.py
index 375f7e2a103b8..ebcbd57538421 100644
--- a/test/legacy_test/test_fill_any_like_op.py
+++ b/test/legacy_test/test_fill_any_like_op.py
@@ -30,13 +30,13 @@ def fill_any_like_wrapper(x, value, out_dtype=None, name=None):
         else:
             from paddle.base.libpaddle import DataType
 
-            tmp_dtype = DataType(paddle.ir.core.vartype_to_datatype[out_dtype])
+            tmp_dtype = DataType(paddle.pir.core.vartype_to_datatype[out_dtype])
     else:
         tmp_dtype = out_dtype
         if in_pir_mode() and isinstance(
             out_dtype, paddle.framework.core.VarDesc.VarType
         ):
-            tmp_dtype = paddle.ir.core.vartype_to_datatype[tmp_dtype]
+            tmp_dtype = paddle.pir.core.vartype_to_datatype[tmp_dtype]
     return paddle.full_like(x, value, tmp_dtype, name)
 
 
diff --git a/test/legacy_test/test_full_like_op.py b/test/legacy_test/test_full_like_op.py
index 96c9db25c29ae..137e536126bb4 100644
--- a/test/legacy_test/test_full_like_op.py
+++ b/test/legacy_test/test_full_like_op.py
@@ -32,13 +32,13 @@ def fill_any_like_wrapper(x, value, out_dtype=None, name=None):
         else:
             from paddle.base.libpaddle import DataType
 
-            tmp_dtype = DataType(paddle.ir.core.vartype_to_datatype[out_dtype])
+            tmp_dtype = DataType(paddle.pir.core.vartype_to_datatype[out_dtype])
     else:
         tmp_dtype = out_dtype
         if in_pir_mode() and isinstance(
             out_dtype, paddle.framework.core.VarDesc.VarType
         ):
-            tmp_dtype = paddle.ir.core.vartype_to_datatype[tmp_dtype]
+            tmp_dtype = paddle.pir.core.vartype_to_datatype[tmp_dtype]
     return paddle.full_like(x, value, tmp_dtype, name)
 
 
diff --git a/test/legacy_test/test_reduce_op.py b/test/legacy_test/test_reduce_op.py
index 6164d86bab140..d0bc79a3bd48a 100644
--- a/test/legacy_test/test_reduce_op.py
+++ b/test/legacy_test/test_reduce_op.py
@@ -1205,7 +1205,7 @@ def reduce_sum_wrapper2(x, axis=[0], dtype=None, keepdim=False):
         return paddle._C_ops.sum(x, axis, dtype, keepdim)
     else:
         if in_pir_mode():
-            return paddle._ir_ops.sum(x, axis, dtype, keepdim)
+            return paddle._pir_ops.sum(x, axis, dtype, keepdim)
 
 
 class Test8DReduce0(Test1DReduce):
diff --git a/test/prim/new_ir_prim/test_custom_vjp_trait.py b/test/prim/new_ir_prim/test_custom_vjp_trait.py
index 0d99d73cedd0b..273bd02a2ba76 100644
--- a/test/prim/new_ir_prim/test_custom_vjp_trait.py
+++ b/test/prim/new_ir_prim/test_custom_vjp_trait.py
@@ -15,7 +15,7 @@
 import unittest
 
 import paddle
-from paddle import ir, nn
+from paddle import nn, pir
 from paddle.base.core import has_custom_vjp
 
 paddle.enable_static()
@@ -30,7 +30,7 @@ def get_gelu_program_new_ir():
         x = paddle.static.data('x', [2, 3, 3], dtype='float32')
         net = nn.GELU()
         out = net(x)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
@@ -43,7 +43,7 @@ def get_multiply_program_new_ir():
         x = paddle.static.data('x', [2, 3, 3], dtype='float32')
         y = paddle.static.data('y', [2, 3, 3], dtype='float32')
         out = paddle.multiply(x, y)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
diff --git a/test/prim/new_ir_prim/test_decomp_op.py b/test/prim/new_ir_prim/test_decomp_op.py
index 19e4d16423ee2..3a70ea3389272 100644
--- a/test/prim/new_ir_prim/test_decomp_op.py
+++ b/test/prim/new_ir_prim/test_decomp_op.py
@@ -15,7 +15,7 @@
 import unittest
 
 import paddle
-from paddle import ir
+from paddle import pir
 from paddle.decomposition import decompose
 from paddle.framework import core
 
@@ -36,7 +36,7 @@ def get_ir_program():
         y_s = paddle.add(x_s, y_s)
         y_s = paddle.mean(y_s)
         y_s = paddle.tanh(y_s)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
diff --git a/test/prim/new_ir_prim/test_prim_custom_vjp.py b/test/prim/new_ir_prim/test_prim_custom_vjp.py
index 4ca737fa380e3..1091f6b4ec9ff 100644
--- a/test/prim/new_ir_prim/test_prim_custom_vjp.py
+++ b/test/prim/new_ir_prim/test_prim_custom_vjp.py
@@ -17,7 +17,7 @@
 import numpy as np
 
 import paddle
-from paddle import _ir_ops, nn
+from paddle import _pir_ops, nn
 from paddle.autograd.ir_backward import grad
 from paddle.decomposition import decompose
 from paddle.framework import core
@@ -30,9 +30,9 @@ def __init__(self):
         super().__init__()
 
     def forward(self, x, linear1_weight, linear2_weight):
-        x2 = _ir_ops.matmul(x, linear1_weight, False, False)
-        x3 = _ir_ops.gelu(x2, False)
-        res = _ir_ops.matmul(x3, linear2_weight, False, False)
+        x2 = _pir_ops.matmul(x, linear1_weight, False, False)
+        x3 = _pir_ops.gelu(x2, False)
+        res = _pir_ops.matmul(x3, linear2_weight, False, False)
         return res
 
 
diff --git a/test/prim/new_ir_prim/test_prim_simpnet.py b/test/prim/new_ir_prim/test_prim_simpnet.py
index ffef0766f3d32..85051a26c350e 100644
--- a/test/prim/new_ir_prim/test_prim_simpnet.py
+++ b/test/prim/new_ir_prim/test_prim_simpnet.py
@@ -17,7 +17,7 @@
 import numpy as np
 
 import paddle
-from paddle import _ir_ops, nn
+from paddle import _pir_ops, nn
 from paddle.autograd.ir_backward import grad
 from paddle.decomposition import decompose
 from paddle.framework import core
@@ -30,9 +30,9 @@ def __init__(self):
         super().__init__()
 
     def forward(self, x, linear1_weight, linear2_weight):
-        x2 = _ir_ops.matmul(x, linear1_weight, False, False)
-        x3 = _ir_ops.gelu(x2, False)
-        res = _ir_ops.matmul(x3, linear2_weight, False, False)
+        x2 = _pir_ops.matmul(x, linear1_weight, False, False)
+        x3 = _pir_ops.gelu(x2, False)
+        res = _pir_ops.matmul(x3, linear2_weight, False, False)
         return res
 
 
diff --git a/test/prim/new_ir_prim/test_vjp_prim.py b/test/prim/new_ir_prim/test_vjp_prim.py
index 798f1e2588b0b..2755f2854487f 100644
--- a/test/prim/new_ir_prim/test_vjp_prim.py
+++ b/test/prim/new_ir_prim/test_vjp_prim.py
@@ -15,7 +15,7 @@
 import unittest
 
 import paddle
-from paddle import ir
+from paddle import pir
 from paddle.base.core import call_vjp
 
 paddle.enable_static()
@@ -39,7 +39,7 @@ def get_ir_divide_program():
         )
         dout.stop_gradient = False
         out = paddle.divide(x, y)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
@@ -57,7 +57,7 @@ def get_ir_sum_program():
         dout = paddle.tensor.fill_constant(shape=[], dtype='float32', value=1.0)
         dout.stop_gradient = False
         out = paddle.sum(x)
-    newir_program = ir.translate_to_new_ir(main_program.desc)
+    newir_program = pir.translate_to_new_ir(main_program.desc)
     return newir_program
 
 
@@ -70,7 +70,7 @@ def test_divide_grad_prim_case1(self):
             out_grads = [[dout]]
             stop_gradients = [[False], [False]]
             divide_op = newir_program.global_block().ops[-1]
-            with paddle.ir.core.program_guard(newir_program):
+            with paddle.pir.core.program_guard(newir_program):
                 grad_outs = call_vjp(divide_op, out_grads, stop_gradients)
             reshape_op2 = newir_program.global_block().ops[-1]
             reshape_op1 = newir_program.global_block().ops[-8]
@@ -112,7 +112,7 @@ def test_divide_grad_no_prim(self):
         out_grads = [[dout]]
         stop_gradients = [[False], [False]]
         divide_op = newir_program.global_block().ops[-1]
-        with paddle.ir.core.program_guard(newir_program):
+        with paddle.pir.core.program_guard(newir_program):
             grad_outs = call_vjp(divide_op, out_grads, stop_gradients)
         self.assertEqual(len(grad_outs), 2)
         self.assertEqual(
@@ -131,7 +131,7 @@ def test_sum_grad_prim(self):
             out_grads = [[dout]]
             stop_gradients = [[False]]
             sum_op = newir_program.global_block().ops[-1]
-            with paddle.ir.core.program_guard(newir_program):
+            with paddle.pir.core.program_guard(newir_program):
                 grad_outs = call_vjp(sum_op, out_grads, stop_gradients)
             expand_op = newir_program.global_block().ops[-1]
             self.assertEqual(len(grad_outs), 1)
@@ -158,7 +158,7 @@ def test_sum_grad_no_prim(self):
         out_grads = [[dout]]
         stop_gradients = [[False]]
         sum_op = newir_program.global_block().ops[-1]
-        with paddle.ir.core.program_guard(newir_program):
+        with paddle.pir.core.program_guard(newir_program):
             grad_outs = call_vjp(sum_op, out_grads, stop_gradients)
         self.assertEqual(len(grad_outs), 1)
         self.assertEqual(

From 579103d91eda9daf27603e8cf4d550ad426a7fdc Mon Sep 17 00:00:00 2001
From: JYChen <zoooo0820@qq.com>
Date: Fri, 22 Sep 2023 12:32:23 +0800
Subject: [PATCH 065/115] remove all content in paddle.base __all__ list
 (#57596)

---
 python/paddle/base/__init__.py                | 35 +------------------
 python/paddle/base/backward.py                |  5 +--
 python/paddle/base/compiler.py                |  8 +----
 python/paddle/base/data_feed_desc.py          |  2 +-
 python/paddle/base/data_feeder.py             |  2 +-
 python/paddle/base/dataset.py                 |  2 +-
 python/paddle/base/default_scope_funcs.py     |  9 +----
 python/paddle/base/device_worker.py           |  9 +----
 python/paddle/base/dygraph/__init__.py        |  1 -
 python/paddle/base/dygraph/base.py            | 11 +-----
 python/paddle/base/executor.py                |  2 +-
 python/paddle/base/framework.py               | 27 +-------------
 python/paddle/base/initializer.py             |  2 +-
 python/paddle/base/io.py                      |  9 ++---
 python/paddle/base/layer_helper_base.py       |  2 +-
 python/paddle/base/layers/__init__.py         |  3 +-
 .../base/layers/layer_function_generator.py   |  8 +----
 python/paddle/base/lod_tensor.py              |  2 +-
 python/paddle/base/log_helper.py              |  2 +-
 python/paddle/base/param_attr.py              |  5 +--
 python/paddle/base/reader.py                  |  2 +-
 python/paddle/base/trainer_desc.py            |  9 +----
 python/paddle/base/trainer_factory.py         |  2 +-
 python/paddle/base/unique_name.py             |  2 +-
 python/paddle/base/wrapped_decorator.py       |  2 +-
 25 files changed, 26 insertions(+), 137 deletions(-)

diff --git a/python/paddle/base/__init__.py b/python/paddle/base/__init__.py
index 857d6d58e4718..4acf21c465776 100644
--- a/python/paddle/base/__init__.py
+++ b/python/paddle/base/__init__.py
@@ -128,40 +128,7 @@
 enable_imperative = enable_dygraph
 disable_imperative = disable_dygraph
 
-__all__ = (
-    framework.__all__
-    + executor.__all__
-    + trainer_desc.__all__
-    + lod_tensor.__all__
-    + data_feed_desc.__all__
-    + compiler.__all__
-    + backward.__all__
-    + [
-        'io',
-        'initializer',
-        'layers',
-        'dygraph',
-        'enable_dygraph',
-        'disable_dygraph',
-        'enable_imperative',
-        'disable_imperative',
-        'backward',
-        'LoDTensor',
-        'LoDTensorArray',
-        'CPUPlace',
-        'XPUPlace',
-        'CUDAPlace',
-        'CUDAPinnedPlace',
-        'IPUPlace',
-        'Tensor',
-        'ParamAttr',
-        'WeightNormParamAttr',
-        'DataFeeder',
-        'unique_name',
-        'Scope',
-        '_cuda_synchronize',
-    ]
-)
+__all__ = []
 
 
 def __bootstrap__():
diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py
index a6786848ddb85..1c3c3a3f202ed 100755
--- a/python/paddle/base/backward.py
+++ b/python/paddle/base/backward.py
@@ -27,10 +27,7 @@
 from .data_feeder import check_type
 from .proto import framework_pb2
 
-__all__ = [
-    'append_backward',
-    'gradients',
-]
+__all__ = []
 
 _logger = log_helper.get_logger(
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s'
diff --git a/python/paddle/base/compiler.py b/python/paddle/base/compiler.py
index 3ee939920dc2b..3f90f34af286e 100644
--- a/python/paddle/base/compiler.py
+++ b/python/paddle/base/compiler.py
@@ -18,13 +18,7 @@
 from . import core, framework
 from .framework import cpu_places, cuda_places, xpu_places
 
-__all__ = [
-    'CompiledProgram',
-    'ExecutionStrategy',
-    'BuildStrategy',
-    'IpuCompiledProgram',
-    'IpuStrategy',
-]
+__all__ = []
 
 ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
 BuildStrategy = core.ParallelExecutor.BuildStrategy
diff --git a/python/paddle/base/data_feed_desc.py b/python/paddle/base/data_feed_desc.py
index de1b00d090bb1..933d4dd1c6619 100644
--- a/python/paddle/base/data_feed_desc.py
+++ b/python/paddle/base/data_feed_desc.py
@@ -16,7 +16,7 @@
 
 from paddle.base.proto import data_feed_pb2
 
-__all__ = ['DataFeedDesc']
+__all__ = []
 
 
 class DataFeedDesc:
diff --git a/python/paddle/base/data_feeder.py b/python/paddle/base/data_feeder.py
index a641e8be1275f..6efb86ffcc9ab 100644
--- a/python/paddle/base/data_feeder.py
+++ b/python/paddle/base/data_feeder.py
@@ -28,7 +28,7 @@
     in_pir_mode,
 )
 
-__all__ = ['DataFeeder']
+__all__ = []
 
 _PADDLE_DTYPE_2_NUMPY_DTYPE = {
     core.VarDesc.VarType.BOOL: 'bool',
diff --git a/python/paddle/base/dataset.py b/python/paddle/base/dataset.py
index 961a392349707..c15f6e8e6e68a 100644
--- a/python/paddle/base/dataset.py
+++ b/python/paddle/base/dataset.py
@@ -20,7 +20,7 @@
 from ..utils import deprecated
 from . import core
 
-__all__ = ['DatasetFactory', 'InMemoryDataset', 'QueueDataset']
+__all__ = []
 
 
 class DatasetFactory:
diff --git a/python/paddle/base/default_scope_funcs.py b/python/paddle/base/default_scope_funcs.py
index 992714e6cd409..dd820572e5edc 100644
--- a/python/paddle/base/default_scope_funcs.py
+++ b/python/paddle/base/default_scope_funcs.py
@@ -32,14 +32,7 @@
 
 __tl_scope__ = threading.local()
 
-__all__ = [
-    'get_cur_scope',
-    'enter_local_scope',
-    'leave_local_scope',
-    'var',
-    'find_var',
-    'scoped_function',
-]
+__all__ = []
 
 
 def get_cur_scope():
diff --git a/python/paddle/base/device_worker.py b/python/paddle/base/device_worker.py
index 706febd44ba0e..755f7257b735a 100644
--- a/python/paddle/base/device_worker.py
+++ b/python/paddle/base/device_worker.py
@@ -14,14 +14,7 @@
 """Definition of device workers."""
 import sys
 
-__all__ = [
-    'DeviceWorker',
-    'Hogwild',
-    'DownpourSGD',
-    'Section',
-    'DownpourSGDOPT',
-    'HeterSection',
-]
+__all__ = []
 
 
 class DeviceWorker:
diff --git a/python/paddle/base/dygraph/__init__.py b/python/paddle/base/dygraph/__init__.py
index 2ac4df711681c..1bfc0f2c8295d 100644
--- a/python/paddle/base/dygraph/__init__.py
+++ b/python/paddle/base/dygraph/__init__.py
@@ -29,4 +29,3 @@
 
 
 __all__ = []
-__all__ += base.__all__
diff --git a/python/paddle/base/dygraph/base.py b/python/paddle/base/dygraph/base.py
index 22a63ff3c0190..0997d24ad4db4 100644
--- a/python/paddle/base/dygraph/base.py
+++ b/python/paddle/base/dygraph/base.py
@@ -28,16 +28,7 @@
 from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator
 from .tracer import Tracer
 
-__all__ = [
-    'no_grad',
-    'no_grad_',
-    'grad',
-    'guard',
-    'enable_dygraph',
-    'disable_dygraph',
-    'enabled',
-    'to_variable',
-]
+__all__ = []
 
 NON_PERSISTABLE_VAR_NAME_SUFFIX = "__non_persistable"
 
diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py
index 17f119d2e8532..2d2b53e71f1e6 100755
--- a/python/paddle/base/executor.py
+++ b/python/paddle/base/executor.py
@@ -38,7 +38,7 @@
 from .trainer_factory import FetchHandlerMonitor, TrainerFactory
 from .wrapped_decorator import signature_safe_contextmanager
 
-__all__ = ['Executor', 'global_scope', 'scope_guard']
+__all__ = []
 
 g_scope = core.Scope()
 InferNativeConfig = core.NativeConfig
diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py
index 12fd05ca8d8c7..26cfadb44216d 100644
--- a/python/paddle/base/framework.py
+++ b/python/paddle/base/framework.py
@@ -40,32 +40,7 @@
 from .variable_index import _getitem_static, _setitem_static, _setitem_impl_
 import threading
 
-__all__ = [
-    'Program',
-    'default_startup_program',
-    'default_main_program',
-    'program_guard',
-    'name_scope',
-    'ipu_shard_guard',
-    'set_ipu_shard',
-    'cuda_places',
-    'cpu_places',
-    'xpu_places',
-    'cuda_pinned_places',
-    'in_dygraph_mode',
-    'in_pir_mode',
-    'in_dynamic_or_pir_mode',
-    'is_compiled_with_cinn',
-    'is_compiled_with_cuda',
-    'is_compiled_with_rocm',
-    'is_compiled_with_xpu',
-    'Variable',
-    'require_version',
-    'device_guard',
-    'set_flags',
-    'get_flags',
-    '_stride_in_no_check_dy2st_diff',
-]
+__all__ = []
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
 TEMP_VAR_NAME = core.kTempVarName()
diff --git a/python/paddle/base/initializer.py b/python/paddle/base/initializer.py
index 7443e63b13e52..19cf79e1b407f 100644
--- a/python/paddle/base/initializer.py
+++ b/python/paddle/base/initializer.py
@@ -16,7 +16,7 @@
 
 from .data_feeder import check_type
 
-__all__ = ['set_global_initializer']
+__all__ = []
 
 _global_weight_initializer_ = None
 _global_bias_initializer_ = None
diff --git a/python/paddle/base/io.py b/python/paddle/base/io.py
index 55f5c072f4e27..cb3ce3f2070cc 100644
--- a/python/paddle/base/io.py
+++ b/python/paddle/base/io.py
@@ -16,13 +16,10 @@
 
 from paddle.base.log_helper import get_logger
 
-from . import reader
-from .reader import (  # noqa: F401
-    PyReader,
-    DataLoader,
-)
+from . import reader  # noqa: F401
+from .reader import DataLoader, PyReader  # noqa: F401
 
-__all__ = reader.__all__
+__all__ = []
 
 
 _logger = get_logger(
diff --git a/python/paddle/base/layer_helper_base.py b/python/paddle/base/layer_helper_base.py
index 87e90787931d4..6b506d6f192b9 100644
--- a/python/paddle/base/layer_helper_base.py
+++ b/python/paddle/base/layer_helper_base.py
@@ -30,7 +30,7 @@
 from .initializer import _global_bias_initializer, _global_weight_initializer
 from .param_attr import ParamAttr, WeightNormParamAttr
 
-__all__ = ['LayerHelperBase']
+__all__ = []
 
 
 class LayerHelperBase:
diff --git a/python/paddle/base/layers/__init__.py b/python/paddle/base/layers/__init__.py
index 60e0d1c922c75..002fd068930d9 100644
--- a/python/paddle/base/layers/__init__.py
+++ b/python/paddle/base/layers/__init__.py
@@ -12,9 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from . import io
+from . import io  # noqa: F401
 from . import math_op_patch
 
 
 __all__ = []
-__all__ += io.__all__
diff --git a/python/paddle/base/layers/layer_function_generator.py b/python/paddle/base/layers/layer_function_generator.py
index 82db72f7c4ce5..390096123ac4e 100644
--- a/python/paddle/base/layers/layer_function_generator.py
+++ b/python/paddle/base/layers/layer_function_generator.py
@@ -30,13 +30,7 @@
 from ..layer_helper import LayerHelper
 from ..proto import framework_pb2
 
-__all__ = [
-    'generate_layer_fn',
-    'generate_activation_fn',
-    'generate_inplace_fn',
-    'autodoc',
-    'templatedoc',
-]
+__all__ = []
 
 
 def _convert_(name):
diff --git a/python/paddle/base/lod_tensor.py b/python/paddle/base/lod_tensor.py
index 4be41d5cc6adc..7817b7fa3e759 100644
--- a/python/paddle/base/lod_tensor.py
+++ b/python/paddle/base/lod_tensor.py
@@ -17,7 +17,7 @@
 from . import core
 from .data_feeder import DataToLoDTensorConverter
 
-__all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
+__all__ = []
 
 
 def create_lod_tensor(data, recursive_seq_lens, place):
diff --git a/python/paddle/base/log_helper.py b/python/paddle/base/log_helper.py
index 3d6fd9b1f289d..abd815d369a3d 100644
--- a/python/paddle/base/log_helper.py
+++ b/python/paddle/base/log_helper.py
@@ -14,7 +14,7 @@
 
 import logging
 
-__all__ = ['get_logger']
+__all__ = []
 
 
 def get_logger(name, level, fmt=None):
diff --git a/python/paddle/base/param_attr.py b/python/paddle/base/param_attr.py
index 75064a449db38..33cbafa98f170 100644
--- a/python/paddle/base/param_attr.py
+++ b/python/paddle/base/param_attr.py
@@ -16,10 +16,7 @@
 from paddle.base.data_feeder import check_type
 from paddle.regularizer import WeightDecayRegularizer
 
-__all__ = [
-    'ParamAttr',
-    'WeightNormParamAttr',
-]
+__all__ = []
 
 
 class ParamAttr:
diff --git a/python/paddle/base/reader.py b/python/paddle/base/reader.py
index 4ec5d3c4a9607..e749e707b65c6 100644
--- a/python/paddle/base/reader.py
+++ b/python/paddle/base/reader.py
@@ -54,7 +54,7 @@
 # NOTE: [ avoid hanging & failed quickly ] These value is used in getting data from another process
 QUEUE_GET_TIMEOUT = 60
 
-__all__ = ['PyReader', 'DataLoader']
+__all__ = []
 
 data_loader_unique_name_generator = UniqueNameGenerator()
 
diff --git a/python/paddle/base/trainer_desc.py b/python/paddle/base/trainer_desc.py
index f64530ec02353..1533647270dfb 100644
--- a/python/paddle/base/trainer_desc.py
+++ b/python/paddle/base/trainer_desc.py
@@ -16,14 +16,7 @@
 import os
 import sys
 
-__all__ = [
-    'TrainerDesc',
-    'MultiTrainer',
-    'DistMultiTrainer',
-    'PipelineTrainer',
-    'HeterXpuTrainer',
-    'HeterPipelineTrainer',
-]
+__all__ = []
 
 
 class TrainerDesc:
diff --git a/python/paddle/base/trainer_factory.py b/python/paddle/base/trainer_factory.py
index e5c5fa48b7155..c5743ca22a29e 100644
--- a/python/paddle/base/trainer_factory.py
+++ b/python/paddle/base/trainer_factory.py
@@ -43,7 +43,7 @@
     PSGPUTrainer,
 )
 
-__all__ = ["TrainerFactory", "FetchHandlerMonitor"]
+__all__ = []
 
 
 class TrainerFactory:
diff --git a/python/paddle/base/unique_name.py b/python/paddle/base/unique_name.py
index c240273da890d..d3f6d41d3624d 100644
--- a/python/paddle/base/unique_name.py
+++ b/python/paddle/base/unique_name.py
@@ -16,7 +16,7 @@
 
 from .wrapped_decorator import signature_safe_contextmanager
 
-__all__ = ['generate', 'switch', 'guard']
+__all__ = []
 
 
 class UniqueNameGenerator:
diff --git a/python/paddle/base/wrapped_decorator.py b/python/paddle/base/wrapped_decorator.py
index 1567bb0d4c55c..354e9760c62c4 100644
--- a/python/paddle/base/wrapped_decorator.py
+++ b/python/paddle/base/wrapped_decorator.py
@@ -16,7 +16,7 @@
 
 import decorator
 
-__all__ = ['wrap_decorator', 'signature_safe_contextmanager']
+__all__ = []
 
 
 def wrap_decorator(decorator_func):

From aa692f97df376a12702172511d27eaf6ef0f9271 Mon Sep 17 00:00:00 2001
From: zhanglirong1999 <56445728+zhanglirong1999@users.noreply.github.com>
Date: Fri, 22 Sep 2023 14:30:19 +0800
Subject: [PATCH 066/115] [OneDNN] fix fast_rcnn bug (#57598)

---
 paddle/phi/kernels/funcs/data_layout_transform.cc | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/kernels/funcs/data_layout_transform.cc b/paddle/phi/kernels/funcs/data_layout_transform.cc
index 05a1ffa4f3053..b949b7945a046 100644
--- a/paddle/phi/kernels/funcs/data_layout_transform.cc
+++ b/paddle/phi/kernels/funcs/data_layout_transform.cc
@@ -84,6 +84,14 @@ void TransDataLayoutFromOneDNN(DataLayout in_layout,
   auto& pool = DeviceContextPool::Instance();
   auto* dev_ctx = dynamic_cast<OneDNNContext*>(pool.Get(place));
   auto& cpu_engine = dev_ctx->GetEngine();
+  auto in_dims = vectorize<int64_t>(in.dims());
+
+  auto md_dims = !in_dims.empty() ? in_dims : std::vector<int64_t>{1};
+  const auto src_mem_desc =
+      !in_dims.empty() ? in.mem_desc()
+                       : dnnl::memory::desc(md_dims,
+                                            ToOneDNNDataType(in.dtype()),
+                                            dnnl::memory::format_tag::x);
 
   dnnl::memory::desc out_mem_desc = make_memory_desc(in, out_layout);
 
@@ -100,8 +108,7 @@ void TransDataLayoutFromOneDNN(DataLayout in_layout,
 
     ReorderOneDNNHandler handler(in_tz, in.dtype(), in_type, cpu_engine);
 
-    auto reorder_src_memory_p =
-        handler.AcquireSrcMemory(in.mem_desc(), in_data);
+    auto reorder_src_memory_p = handler.AcquireSrcMemory(src_mem_desc, in_data);
     auto reorder_dst_memory_p =
         handler.AcquireDstMemory(out, out->mem_desc(), place);
     auto reorder_p =

From 571f32c7199d72abb70e350c86ff8d71c051cc13 Mon Sep 17 00:00:00 2001
From: Difer <707065510@qq.com>
Date: Fri, 22 Sep 2023 14:31:08 +0800
Subject: [PATCH 067/115] [NewIR] No.7 Migrate paddle.softmaxinto pir  (#57415)

---
 python/paddle/nn/functional/activation.py |  8 ++++++--
 test/legacy_test/test_softmax_op.py       | 14 ++++++++------
 test/mkldnn/test_softmax_mkldnn_op.py     |  3 +++
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index dcb0dfdded2b5..6cd2679aa075e 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -1196,9 +1196,13 @@ def softmax(x, axis=-1, dtype=None, name=None):
               [0.03205860, 0.08714432, 0.23688282, 0.64391426]]])
     """
 
-    if (dtype is not None) and (not isinstance(dtype, core.VarDesc.VarType)):
+    if (
+        (dtype is not None)
+        and (not isinstance(dtype, core.VarDesc.VarType))
+        and (not isinstance(dtype, core.DataType))
+    ):
         dtype = convert_np_dtype_to_dtype_(dtype)
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         outs_cast = x if dtype is None else _C_ops.cast(x, dtype)
         return _C_ops.softmax(outs_cast, axis)
     else:
diff --git a/test/legacy_test/test_softmax_op.py b/test/legacy_test/test_softmax_op.py
index e78f0da054576..ec5b15d70cd74 100644
--- a/test/legacy_test/test_softmax_op.py
+++ b/test/legacy_test/test_softmax_op.py
@@ -84,9 +84,9 @@ def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.use_cudnn:
             place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
+            self.check_output_with_place(place, atol=1e-5, check_new_ir=True)
         else:
-            self.check_output(check_prim=True)
+            self.check_output(check_prim=True, check_new_ir=True)
 
     def test_check_grad(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
@@ -99,6 +99,7 @@ def test_check_grad(self):
                     "Out",
                     max_relative_error=0.01,
                     check_dygraph=(not self.use_mkldnn),
+                    check_new_ir=True,
                 )
         else:
             self.check_grad(
@@ -107,6 +108,7 @@ def test_check_grad(self):
                 max_relative_error=0.01,
                 check_dygraph=(not self.use_mkldnn),
                 check_prim=True,
+                check_new_ir=True,
             )
 
 
@@ -144,9 +146,9 @@ def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.use_cudnn:
             place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
+            self.check_output_with_place(place, atol=1e-5, check_new_ir=True)
         else:
-            self.check_output(check_prim=True)
+            self.check_output(check_prim=True, check_new_ir=True)
 
 
 @unittest.skipIf(
@@ -178,9 +180,9 @@ def test_check_output(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
         if self.use_cudnn:
             place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=1e-5)
+            self.check_output_with_place(place, atol=1e-5, check_new_ir=True)
         else:
-            self.check_output(check_prim=True)
+            self.check_output(check_prim=True, check_new_ir=True)
 
 
 class TestSoftmaxOp2(TestSoftmaxOp):
diff --git a/test/mkldnn/test_softmax_mkldnn_op.py b/test/mkldnn/test_softmax_mkldnn_op.py
index d66ff320d9f26..4f1d4978a197e 100644
--- a/test/mkldnn/test_softmax_mkldnn_op.py
+++ b/test/mkldnn/test_softmax_mkldnn_op.py
@@ -27,8 +27,11 @@
     TestSoftmaxOp_ZeroDim1,
 )
 
+import paddle
 from paddle.base import core
 
+paddle.enable_static()
+
 
 def stable_softmax(x):
     """Compute the softmax of vector x in a numerically stable way."""

From 5838bff4504200a0af667edf3f7e8de1bb942b66 Mon Sep 17 00:00:00 2001
From: LiYuRio <63526175+LiYuRio@users.noreply.github.com>
Date: Fri, 22 Sep 2023 15:24:54 +0800
Subject: [PATCH 068/115] support reshard with different mesh (#57568)

---
 paddle/fluid/pybind/auto_parallel_py.cc       |   5 +
 .../distributed/auto_parallel/CMakeLists.txt  |   3 +-
 .../distributed/auto_parallel/dist_tensor.cc  |  34 ++--
 .../auto_parallel/reshard_utils.cc            |  28 ++-
 .../distributed/auto_parallel/reshard_utils.h |   5 +
 .../same_status_reshard_function.cc           | 121 ++++++++++++
 .../same_status_reshard_function.h            |  34 ++++
 .../phi/core/distributed/store/store_utils.cc |  14 +-
 paddle/phi/kernels/cpu/p_recv_kernel.cc       |   2 +
 paddle/phi/kernels/cpu/p_send_kernel.cc       |   2 +
 paddle/phi/kernels/gpu/p_recv_kernel.cu       |   2 +
 paddle/phi/kernels/gpu/p_send_kernel.cu       |   2 +
 paddle/phi/kernels/p_recv_kernel.h            |  14 ++
 test/auto_parallel/CMakeLists.txt             |   5 +
 test/auto_parallel/reshard_same_status.py     | 174 ++++++++++++++++++
 .../auto_parallel/test_reshard_same_status.py |  44 +++++
 test/cpp/auto_parallel/dist_tensor_test.cc    |   6 +
 17 files changed, 466 insertions(+), 29 deletions(-)
 create mode 100644 paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.cc
 create mode 100644 paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.h
 create mode 100644 test/auto_parallel/reshard_same_status.py
 create mode 100644 test/auto_parallel/test_reshard_same_status.py

diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc
index fe555cacb3e18..f6596f3db31d5 100644
--- a/paddle/fluid/pybind/auto_parallel_py.cc
+++ b/paddle/fluid/pybind/auto_parallel_py.cc
@@ -45,6 +45,7 @@
 #include "paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h"
+#include "paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.h"
 #include "paddle/phi/core/enforce.h"
 
 #ifdef PADDLE_WITH_DISTRIBUTE
@@ -200,6 +201,10 @@ void BindAutoParallel(py::module *m) {
       *m, "SameNdMeshReshardFunction", ReshardFunction)
       .def(py::init<>());
 
+  py::class_<phi::distributed::SameStatusReshardFunction>(
+      *m, "SameStatusReshardFunction", ReshardFunction)
+      .def(py::init<>());
+
   py::class_<ProcessMesh>(*m, "ProcessMesh")
       .def(py::init<>())
       .def(py::init<const std::vector<int64_t> &,
diff --git a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
index 038888196f09f..92e69e0dc7657 100644
--- a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
+++ b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
@@ -17,4 +17,5 @@ collect_srcs(
   r_to_p_reshard_function.cc
   p_to_r_reshard_function.cc
   s_to_s_reshard_function.cc
-  nd_mesh_reshard_function.cc)
+  nd_mesh_reshard_function.cc
+  same_status_reshard_function.cc)
diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
index 6edc0bf188ee5..94d611e8043aa 100644
--- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
+++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
@@ -18,6 +18,7 @@
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
+#include "paddle/phi/core/distributed/store/store_utils.h"
 
 namespace phi {
 namespace distributed {
@@ -35,20 +36,27 @@ inline void check_defined(const DistTensor& dist_tensor,
 DistTensor::DistTensor(const phi::DenseTensor& global_value,
                        const TensorDistAttr& dist_attr)
     : dims_(global_value.dims()), dist_attr_(dist_attr), value_(global_value) {
-  if (value_.initialized() && !dist_attr.is_replicated()) {
-    // 1. create replicated global tensor
-    int64_t dims_size = global_value.dims().size();
-    std::vector<int64_t> dims_mapping(dims_size, -1);
-    dist_attr_.set_dims_mapping(dims_mapping);
-    if (dist_attr_.is_partial()) {
-      dist_attr_.clean_partial_status();
+  // TODO(liyurui): This is a temporary solution. We need to support only infer
+  // meta when the input dense_tensor is empty.
+  // Support the value in DistTensor only has DenseTensor meta
+  // but without actual data. So we can visit its meta attr even if it is
+  // undefined.
+  if (IsCurRankInMesh(dist_attr.process_mesh())) {
+    if (value_.initialized() && !dist_attr.is_replicated()) {
+      // 1. create replicated global tensor
+      int64_t dims_size = global_value.dims().size();
+      std::vector<int64_t> dims_mapping(dims_size, -1);
+      dist_attr_.set_dims_mapping(dims_mapping);
+      if (dist_attr_.is_partial()) {
+        dist_attr_.clean_partial_status();
+      }
+      dist_attr_.set_dims_mapping(dims_mapping);
+
+      // 2. reshard from replicated to other state
+      auto* func = ChooseProperReshardFunction(*this, dist_attr);
+      auto* dev_ctx = DeviceContextPool::Instance().Get(global_value.place());
+      func->Eval(dev_ctx, *this, dist_attr, this);
     }
-    dist_attr_.set_dims_mapping(dims_mapping);
-
-    // 2. reshard from replicated to other state
-    auto* func = ChooseProperReshardFunction(*this, dist_attr);
-    auto* dev_ctx = DeviceContextPool::Instance().Get(global_value.place());
-    func->Eval(dev_ctx, *this, dist_attr, this);
   }
 }
 
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc
index 2767dfa836394..60c9cbdda3b67 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc
@@ -24,13 +24,6 @@ namespace phi {
 namespace distributed {
 
 namespace {
-int64_t GetLocalRankInParticipate(const std::vector<int64_t>& process_ids) {
-  int64_t cur_global_rank = GetCurGlobalRank();
-  auto iter =
-      std::find(process_ids.begin(), process_ids.end(), cur_global_rank);
-  return iter - process_ids.begin();
-}
-
 std::string GenUniqueCommKey(const std::vector<int64_t>& process_ids) {
   std::string unique_comm_key = "ReshardGroup";
   for (const auto& id : process_ids) {
@@ -40,6 +33,20 @@ std::string GenUniqueCommKey(const std::vector<int64_t>& process_ids) {
 }
 }  // namespace
 
+int64_t GetLocalRankInParticipate(const std::vector<int64_t>& process_ids,
+                                  int64_t global_rank) {
+  if (global_rank == -1) {
+    global_rank = GetCurGlobalRank();
+  }
+  auto iter = std::find(process_ids.begin(), process_ids.end(), global_rank);
+  PADDLE_ENFORCE_NE(
+      iter,
+      process_ids.end(),
+      phi::errors::NotFound("Global rank %lld cannot be found in process_mesh",
+                            global_rank));
+  return iter - process_ids.begin();
+}
+
 std::vector<int64_t> GetCurRankCoordInMesh(const ProcessMesh& process_mesh) {
   const auto& process_shape = process_mesh.shape();
   const auto& process_ids = process_mesh.process_ids();
@@ -132,5 +139,12 @@ std::vector<int64_t> BalancedSplit(int64_t total_nums, int64_t num_of_pieces) {
   return result;
 }
 
+bool IsCurRankInMesh(const ProcessMesh& process_mesh) {
+  int64_t cur_global_rank = GetCurGlobalRank();
+  const auto& process_ids = process_mesh.process_ids();
+  return (std::find(process_ids.begin(), process_ids.end(), cur_global_rank) !=
+          process_ids.end());
+}
+
 }  // namespace distributed
 }  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
index 831a4c6e0d2af..652840976194f 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
@@ -30,6 +30,11 @@ class DeviceContext;
 namespace distributed {
 class ProcessMesh;
 
+bool IsCurRankInMesh(const ProcessMesh& process_mesh);
+
+int64_t GetLocalRankInParticipate(const std::vector<int64_t>& process_ids,
+                                  int64_t global_rank = -1);
+
 // Get the coordinate of cur rank in process mesh. For example, the process mesh
 // is [[0, 1], [2, 3], [4, 5], [6, 7]], if the current rank is 4, then will
 // return [2, 0]; if the current rank is 3, then will return [1, 1].
diff --git a/paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.cc
new file mode 100644
index 0000000000000..a6f49268c5612
--- /dev/null
+++ b/paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.h"
+
+#include <algorithm>
+
+#include "glog/logging.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
+#include "paddle/phi/core/distributed/store/store_utils.h"
+#include "paddle/phi/kernels/p_recv_kernel.h"
+#include "paddle/phi/kernels/p_send_kernel.h"
+
+namespace phi {
+namespace distributed {
+
+namespace {
+
+std::vector<int64_t> GetUnionProcessIds(std::vector<int64_t> in_process_ids,
+                                        std::vector<int64_t> out_process_ids) {
+  std::vector<int64_t> result;
+  std::sort(in_process_ids.begin(), in_process_ids.end());
+  std::sort(out_process_ids.begin(), out_process_ids.end());
+  std::set_union(in_process_ids.begin(),
+                 in_process_ids.end(),
+                 out_process_ids.begin(),
+                 out_process_ids.end(),
+                 std::back_inserter(result));
+  return result;
+}
+
+}  // namespace
+
+bool SameStatusReshardFunction::IsSuitable(
+    const DistTensor& in, const TensorDistAttr& out_dist_attr) {
+  bool flag = true;
+  const auto& in_dist_attr = in.dist_attr();
+
+  flag &= (in_dist_attr.dims_mapping() == out_dist_attr.dims_mapping());
+  flag &= (in_dist_attr.partial_dims() == out_dist_attr.partial_dims());
+
+  const auto& in_process_mesh = in_dist_attr.process_mesh();
+  const auto& out_process_mesh = out_dist_attr.process_mesh();
+  flag &= (in_process_mesh != out_process_mesh);
+  flag &= (in_process_mesh.shape() == out_process_mesh.shape());
+
+  return flag;
+}
+
+void SameStatusReshardFunction::Eval(phi::DeviceContext* dev_ctx,
+                                     const DistTensor& in,
+                                     const TensorDistAttr& out_dist_attr,
+                                     DistTensor* out) {
+  const auto& in_dist_attr = in.dist_attr();
+  const auto& in_process_mesh = in_dist_attr.process_mesh();
+  const auto& in_process_ids = in_process_mesh.process_ids();
+  const auto& out_process_mesh = out_dist_attr.process_mesh();
+  const auto& out_process_ids = out_process_mesh.process_ids();
+  auto all_process_ids = GetUnionProcessIds(in_process_ids, out_process_ids);
+  auto dtype = in.dtype();
+  // TODO(liyurui): Use dynamic shape will lead to poor performance, but we
+  // don't have any other good idea now. For the following reasons:
+  // 1. We can not ensure the meta being right deduce by the infermeta.
+  // 2. The meta of some kernels can't decide in compile time.
+  // 3. DenseTensor with empty value only need infermeta and skip the real
+  // kernel execution.
+  bool dynamic_shape = true;
+
+  std::vector<std::pair<int64_t, int64_t>> p2p_pair;
+  for (size_t i = 0; i < out_process_ids.size(); ++i) {
+    p2p_pair.emplace_back(
+        std::make_pair(in_process_ids[i], out_process_ids[i]));
+  }
+
+  int64_t cur_global_rank = GetCurGlobalRank();
+  for (const auto& iter : p2p_pair) {
+    int64_t src = iter.first;
+    int64_t dst = iter.second;
+    VLOG(3) << "Send/Recv from src " << src << " to dst " << dst;
+    if (src == cur_global_rank) {
+      int64_t dst_local_rank = GetLocalRankInParticipate(all_process_ids, dst);
+      // Sice send kernel only has input, so we don't need to infermeta
+      // actually. According to this reason, just use the kernel directly.
+      RESHARD_FUNCTOR_WITH_COMM(dev_ctx,
+                                PSendKernel,
+                                dtype,
+                                all_process_ids,
+                                in.value(),
+                                dst_local_rank,
+                                dynamic_shape);
+    } else if (dst == cur_global_rank) {
+      int64_t src_local_rank = GetLocalRankInParticipate(all_process_ids, src);
+      RESHARD_FUNCTOR_WITH_COMM(dev_ctx,
+                                PRecv,
+                                dtype,
+                                all_process_ids,
+                                src_local_rank,
+                                dynamic_shape,
+                                GetMutableTensor(out));
+    }
+  }
+  SetDistProps(out, in.dims(), out_dist_attr);
+}
+
+REGISTER_RESHARD_FUNC(SameStatusReshardFunction);
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.h b/paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.h
new file mode 100644
index 0000000000000..38c044e083a09
--- /dev/null
+++ b/paddle/phi/core/distributed/auto_parallel/same_status_reshard_function.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/distributed/auto_parallel/reshard_function.h"
+
+namespace phi {
+namespace distributed {
+
+class SameStatusReshardFunction final : public ReshardFunction {
+ public:
+  bool IsSuitable(const DistTensor& in,
+                  const TensorDistAttr& out_dist_attr) override;
+
+  void Eval(DeviceContext* dev_ctx,
+            const DistTensor& in,
+            const TensorDistAttr& out_dist_attr,
+            DistTensor* out) override;
+};
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/core/distributed/store/store_utils.cc b/paddle/phi/core/distributed/store/store_utils.cc
index c2679ef2192a3..7730b23301af3 100644
--- a/paddle/phi/core/distributed/store/store_utils.cc
+++ b/paddle/phi/core/distributed/store/store_utils.cc
@@ -48,19 +48,17 @@ std::string GetMasterEndpoint() {
 
 int64_t GetCurGlobalRank() {
   const char* cur_rank = std::getenv("PADDLE_TRAINER_ID");
-  PADDLE_ENFORCE_NOT_NULL(
-      cur_rank,
-      phi::errors::NotFound(
-          "The environment variable 'PADDLE_TRAINER_ID' cannot be found."));
+  if (cur_rank == nullptr) {
+    return 0;
+  }
   return std::atoi(cur_rank);
 }
 
 int64_t GetGlobalWorldSize() {
   const char* world_size = std::getenv("PADDLE_TRAINERS_NUM");
-  PADDLE_ENFORCE_NOT_NULL(
-      world_size,
-      phi::errors::NotFound(
-          "The environment variable 'PADDLE_TRAINERS_NUM' cannot be found."));
+  if (world_size == nullptr) {
+    return 1;
+  }
   return std::atoi(world_size);
 }
 
diff --git a/paddle/phi/kernels/cpu/p_recv_kernel.cc b/paddle/phi/kernels/cpu/p_recv_kernel.cc
index 10526e6935e1e..425335e3ce8a3 100644
--- a/paddle/phi/kernels/cpu/p_recv_kernel.cc
+++ b/paddle/phi/kernels/cpu/p_recv_kernel.cc
@@ -54,6 +54,7 @@ PD_REGISTER_KERNEL(p_recv,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::float16) {}
 
@@ -67,5 +68,6 @@ PD_REGISTER_KERNEL(p_recv_array,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/cpu/p_send_kernel.cc b/paddle/phi/kernels/cpu/p_send_kernel.cc
index a786de7ecaf3b..d417f19314423 100644
--- a/paddle/phi/kernels/cpu/p_send_kernel.cc
+++ b/paddle/phi/kernels/cpu/p_send_kernel.cc
@@ -53,6 +53,7 @@ PD_REGISTER_KERNEL(p_send,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::float16) {}
 
@@ -66,5 +67,6 @@ PD_REGISTER_KERNEL(p_send_array,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/p_recv_kernel.cu b/paddle/phi/kernels/gpu/p_recv_kernel.cu
index 6f737eece9f54..1e413797b6b89 100644
--- a/paddle/phi/kernels/gpu/p_recv_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_recv_kernel.cu
@@ -190,6 +190,7 @@ PD_REGISTER_KERNEL(p_recv,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::bfloat16,
                    phi::dtype::float16) {}
@@ -218,6 +219,7 @@ PD_REGISTER_KERNEL(p_recv,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::float16) {}
 
diff --git a/paddle/phi/kernels/gpu/p_send_kernel.cu b/paddle/phi/kernels/gpu/p_send_kernel.cu
index 6bf5bcb8155f8..520adcf730a1d 100644
--- a/paddle/phi/kernels/gpu/p_send_kernel.cu
+++ b/paddle/phi/kernels/gpu/p_send_kernel.cu
@@ -178,6 +178,7 @@ PD_REGISTER_KERNEL(p_send,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::bfloat16,
                    phi::dtype::float16) {}
@@ -206,6 +207,7 @@ PD_REGISTER_KERNEL(p_send,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::float16) {}
 
diff --git a/paddle/phi/kernels/p_recv_kernel.h b/paddle/phi/kernels/p_recv_kernel.h
index 4478c838a61ff..8a013c9e653fb 100644
--- a/paddle/phi/kernels/p_recv_kernel.h
+++ b/paddle/phi/kernels/p_recv_kernel.h
@@ -16,6 +16,7 @@
 
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/tensor_array.h"
+#include "paddle/phi/infermeta/nullary.h"
 
 namespace phi {
 
@@ -26,6 +27,19 @@ void PRecvKernel(const Context& dev_ctx,
                  bool dynamic_shape,
                  DenseTensor* out);
 
+template <typename T, typename Context>
+void PRecv(const Context& dev_ctx,
+           int peer,
+           bool dynamic_shape,
+           DenseTensor* out) {
+  MetaTensor out_meta(*out);
+  MetaTensor* out_meta_ptr = &out_meta;
+  DataType dtype = phi::CppTypeToDataType<T>::Type();
+
+  PRecvInferMeta(peer, dtype, out_meta_ptr);
+  PRecvKernel<T, Context>(dev_ctx, peer, dtype, dynamic_shape, out);
+}
+
 template <typename T, typename Context>
 void PRecvArrayKernel(const Context& dev_ctx,
                       int peer,
diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt
index 48df714387854..8efa6f6a5e400 100644
--- a/test/auto_parallel/CMakeLists.txt
+++ b/test/auto_parallel/CMakeLists.txt
@@ -106,6 +106,11 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_reshard_nd_mesh MODULES test_reshard_nd_mesh)
   set_tests_properties(test_reshard_nd_mesh
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
+
+  py_test_modules(test_reshard_same_status MODULES test_reshard_same_status)
+  set_tests_properties(test_reshard_same_status
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
+
   py_test_modules(test_semi_auto_parallel_basic MODULES
                   test_semi_auto_parallel_basic)
   set_tests_properties(test_semi_auto_parallel_basic
diff --git a/test/auto_parallel/reshard_same_status.py b/test/auto_parallel/reshard_same_status.py
new file mode 100644
index 0000000000000..f6c7c6eaff166
--- /dev/null
+++ b/test/auto_parallel/reshard_same_status.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+
+import paddle
+import paddle.distributed as dist
+from paddle.framework import core
+
+
+def get_coord(mesh_list, rank):
+    x = 0
+    y = 0
+    for sub_list in mesh_list:
+        if rank in sub_list:
+            y = sub_list.index(rank)
+            return x, y
+        x += 1
+    return -1, -1
+
+
+class TestReshardSameStatus:
+    def __init__(self):
+        self._shape = eval(os.getenv("shape"))
+        self._dtype = os.getenv("dtype")
+        self._seeds = eval(os.getenv("seeds"))
+        self._backend = os.getenv("backend")
+
+    def test_diff_1d_mesh_shard(self, dev_ctx):
+        paddle.seed(self._seeds)
+
+        in_mesh_list = [0]
+        out_mesh_list = [1]
+        in_mesh = dist.ProcessMesh(in_mesh_list, dim_names=["x"])
+        value = paddle.uniform(self._shape, self._dtype)
+
+        in_shard_specs = [None for i in range(len(self._shape))]
+        in_shard_specs[0] = "x"
+        dist_attr = dist.DistAttr(mesh=in_mesh, sharding_specs=in_shard_specs)
+
+        in_expected_local_tensor_list = paddle.split(
+            value, num_or_sections=in_mesh.shape[0], axis=0
+        )
+        if dist.get_rank() in in_mesh_list:
+            index = in_mesh_list.index(dist.get_rank()) % in_mesh.shape[0]
+        elif dist.get_rank() in out_mesh_list:
+            index = out_mesh_list.index(dist.get_rank()) % in_mesh.shape[0]
+
+        input_tensor = dist.shard_tensor(value, dist_attr=dist_attr)
+
+        if dist.get_rank() in in_mesh_list:
+            # check the value of input tensor
+            in_expected_local_tensor_list = paddle.split(
+                value, num_or_sections=in_mesh.shape[0], axis=0
+            )
+            np.testing.assert_equal(
+                input_tensor._local_value().numpy(),
+                in_expected_local_tensor_list[index].numpy(),
+            )
+
+        out_mesh = dist.ProcessMesh(out_mesh_list, dim_names=["x"])
+        out_shard_specs = [None for i in range(len(self._shape))]
+        out_shard_specs[0] = "x"
+        out_dist_attr = dist.DistAttr(
+            mesh=out_mesh, sharding_specs=out_shard_specs
+        )
+
+        reshard_func = core.SameStatusReshardFunction()
+        assert reshard_func.is_suitable(input_tensor, out_dist_attr)
+
+        out = reshard_func.eval(dev_ctx, input_tensor, out_dist_attr)
+
+        if dist.get_rank() in out_mesh_list:
+            np.testing.assert_equal(
+                out._local_value().numpy(),
+                in_expected_local_tensor_list[index].numpy(),
+            )
+
+    def test_diff_nd_mesh_shard_partial(self, dev_ctx):
+        paddle.seed(self._seeds)
+
+        in_mesh_list = [[0], [1]]
+        out_mesh_list = [[1], [0]]
+        in_mesh = dist.ProcessMesh(in_mesh_list, dim_names=["x", "y"])
+        value = paddle.uniform(self._shape, self._dtype)
+
+        in_shard_specs = [None for i in range(len(self._shape))]
+        in_shard_specs[0] = "x"
+        dist_attr = dist.DistAttr(mesh=in_mesh, sharding_specs=in_shard_specs)
+        dist_attr._set_partial_dims([1])
+
+        input_tensor = dist.shard_tensor(value, dist_attr=dist_attr)
+
+        in_expected_local_tensor_list = paddle.split(
+            value, num_or_sections=in_mesh.shape[0], axis=0
+        )
+
+        in_flatten_list = [
+            item for sub_list in in_mesh_list for item in sub_list
+        ]
+        out_flatten_list = [
+            item for sub_list in out_mesh_list for item in sub_list
+        ]
+
+        in_x, in_y = get_coord(in_mesh_list, dist.get_rank())
+        out_x, out_y = get_coord(out_mesh_list, dist.get_rank())
+
+        if dist.get_rank() in in_flatten_list:
+            if in_y == 0:
+                np.testing.assert_equal(
+                    input_tensor._local_value().numpy(),
+                    in_expected_local_tensor_list[in_x].numpy(),
+                )
+            else:
+                zeros = paddle.zeros(input_tensor._local_shape)
+                np.testing.assert_equal(
+                    input_tensor._local_value().numpy(),
+                    zeros.numpy(),
+                )
+
+        out_mesh = dist.ProcessMesh(out_mesh_list, dim_names=["x", "y"])
+        out_shard_specs = [None for i in range(len(self._shape))]
+        out_shard_specs[0] = "x"
+        out_dist_attr = dist.DistAttr(
+            mesh=out_mesh, sharding_specs=out_shard_specs
+        )
+        out_dist_attr._set_partial_dims([1])
+
+        reshard_func = core.SameStatusReshardFunction()
+        assert reshard_func.is_suitable(input_tensor, out_dist_attr)
+
+        out = reshard_func.eval(dev_ctx, input_tensor, out_dist_attr)
+
+        if dist.get_rank() in out_flatten_list:
+            if out_y == 0:
+                np.testing.assert_equal(
+                    out._local_value().numpy(),
+                    in_expected_local_tensor_list[out_x].numpy(),
+                )
+            else:
+                zeros = paddle.zeros(out._local_shape)
+                np.testing.assert_equal(
+                    out._local_value().numpy(),
+                    zeros.numpy(),
+                )
+
+    def run_test_case(self):
+        if self._backend == "cpu":
+            paddle.set_device("cpu")
+            place = paddle.CPUPlace()
+        elif self._backend == "gpu":
+            place = paddle.CUDAPlace(dist.get_rank())
+
+        dev_ctx = core.DeviceContext.create(place)
+
+        self.test_diff_1d_mesh_shard(dev_ctx)
+        self.test_diff_nd_mesh_shard_partial(dev_ctx)
+
+
+if __name__ == '__main__':
+    TestReshardSameStatus().run_test_case()
diff --git a/test/auto_parallel/test_reshard_same_status.py b/test/auto_parallel/test_reshard_same_status.py
new file mode 100644
index 0000000000000..795c5b0e67520
--- /dev/null
+++ b/test/auto_parallel/test_reshard_same_status.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import collective.test_communication_api_base as test_base
+
+
+class TestReshardSameStatus(test_base.CommunicationTestDistBase):
+    def setUp(self):
+        super().setUp(num_of_devices=2, timeout=120)
+        self._default_envs = {
+            "shape": "(6, 10, 20, 12)",
+            "dtype": "float32",
+            "seeds": "100",
+        }
+        self._changeable_envs = {
+            "backend": ["gpu"],
+        }
+
+    def test_reshard_same_status(self):
+        envs_list = test_base.gen_product_envs_list(
+            self._default_envs, self._changeable_envs
+        )
+        for envs in envs_list:
+            self.run_test_case(
+                "reshard_same_status.py",
+                user_defined_envs=envs,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/cpp/auto_parallel/dist_tensor_test.cc b/test/cpp/auto_parallel/dist_tensor_test.cc
index c190c0e7b17ca..9882a4b831bb5 100644
--- a/test/cpp/auto_parallel/dist_tensor_test.cc
+++ b/test/cpp/auto_parallel/dist_tensor_test.cc
@@ -36,6 +36,12 @@ TEST(dist_tensor, constructor) {
 
   auto dist_attr = TensorDistAttr(phi::vectorize(dims));
 
+  std::vector<int64_t> mesh_shape = {1};
+  std::vector<int64_t> process_ids = {0};
+  std::vector<std::string> dim_names = {"x"};
+  ProcessMesh mesh(mesh_shape, process_ids, dim_names);
+  dist_attr.set_process_mesh(mesh);
+
   // copy construct
   DenseTensor x1(alloc, meta);
   DistTensor dist_x1(x1, dist_attr);

From 548854588f05b014667bb6c3faf4d1936c112f48 Mon Sep 17 00:00:00 2001
From: PommesPeter <54879512+PommesPeter@users.noreply.github.com>
Date: Fri, 22 Sep 2023 15:27:21 +0800
Subject: [PATCH 069/115] [NewIR] No.17 Migrate paddle.exp into pir (#57122)

---
 .../op_generator/vjp_interface_gen_op_list.py |  2 ++
 paddle/fluid/primitive/codegen/gen.py         |  2 ++
 python/paddle/tensor/ops.py                   |  4 ++--
 test/legacy_test/test_activation_op.py        | 22 +++++++++++++------
 4 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
index b454d3b961035..34b1c15afd946 100644
--- a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
+++ b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
@@ -42,6 +42,7 @@
     "square",
     "dropout",
     'exp',
+    'expm1',
     'expand',
     'layer_norm',
     'reshape',
@@ -77,6 +78,7 @@
     "square",
     "dropout",
     'exp',
+    'expm1',
     'expand',
     'layer_norm',
     'reshape',
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index 200b6a05b493f..48ef5e1a800bd 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -54,6 +54,7 @@
     'erf_grad',
     'expand_grad',
     'exp_grad',
+    'expm1_grad',
     'elementwise_pow_grad',
     'fused_softmax_mask_upper_triangle_grad',
     'matmul_grad',
@@ -121,6 +122,7 @@
     'erf_grad',
     'expand_grad',
     'exp_grad',
+    'expm1_grad',
     'multiply',
     'exp',
     'erf',
diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py
index b61620d998b43..9fbc9d16baa66 100644
--- a/python/paddle/tensor/ops.py
+++ b/python/paddle/tensor/ops.py
@@ -654,7 +654,7 @@ def exp(x, name=None):
             Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
             [0.67032003, 0.81873077, 1.10517097, 1.34985888])
     """
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.exp(x)
     else:
         check_variable_and_dtype(
@@ -704,7 +704,7 @@ def expm1(x, name=None):
             Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True,
             [-0.32967997, -0.18126924,  0.10517092,  0.34985882])
     """
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.expm1(x)
     else:
         check_variable_and_dtype(
diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py
index 8d1ee1ac5091a..915a10fdc180b 100644
--- a/test/legacy_test/test_activation_op.py
+++ b/test/legacy_test/test_activation_op.py
@@ -127,10 +127,10 @@ def setUp(self):
         self.convert_input_output()
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
 
     def init_dtype(self):
         self.dtype = np.float32
@@ -174,10 +174,12 @@ def setUp(self):
         self.convert_input_output()
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=False)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.006)
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.006, check_new_ir=False
+        )
 
     def init_dtype(self):
         self.dtype = np.complex64
@@ -247,18 +249,24 @@ def setUp(self):
         self.convert_input_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_new_ir=True)
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
 
 class TestExpm1_Complex64(TestExpm1):
     def init_dtype(self):
         self.dtype = np.complex64
 
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', check_new_ir=False)
+
+    def test_check_output(self):
+        self.check_output(check_new_ir=False)
+
 
-class TestExpm1_Complex128(TestExpm1):
+class TestExpm1_Complex128(TestExpm1_Complex64):
     def init_dtype(self):
         self.dtype = np.complex128
 

From 3b65af2883d063a5fa92cfee1ba91477a91b871a Mon Sep 17 00:00:00 2001
From: WangZhen <23097963+0x45f@users.noreply.github.com>
Date: Fri, 22 Sep 2023 16:11:53 +0800
Subject: [PATCH 070/115] [PIR]Support optional input and output for pir api
 (#57492)

---
 .../fluid/pir/dialect/op_generator/api_gen.py | 199 +++++++++++++++---
 .../pir/dialect/op_generator/python_c_gen.py  |  28 ++-
 .../pir/dialect/operator/ir/CMakeLists.txt    |   6 +-
 .../fluid/pir/dialect/operator/utils/utils.cc |   4 +
 .../fluid/pir/dialect/operator/utils/utils.h  |   2 +
 paddle/fluid/pybind/eager_utils.cc            |  29 ++-
 paddle/fluid/pybind/eager_utils.h             |  11 +-
 .../fluid/pybind/manual_static_op_function.h  |   6 +-
 paddle/pir/core/builtin_op.cc                 |  14 +-
 9 files changed, 235 insertions(+), 64 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py
index 851f318e9bc47..b83794cc1af70 100644
--- a/paddle/fluid/pir/dialect/op_generator/api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py
@@ -30,6 +30,7 @@
 
 #include <vector>
 
+#include "paddle/utils/optional.h"
 #include "paddle/pir/core/value.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/place.h"
@@ -47,6 +48,7 @@
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/pir/core/builder.h"
 #include "paddle/pir/core/builtin_op.h"
+#include "paddle/fluid/pir/dialect/operator/utils/utils.h"
 
 {body}
 
@@ -67,14 +69,46 @@
 
 API_IMPL_TEMPLATE = """
 {ret_type} {api_name}({args}){{
+    {handle_optional_inputs}
     {in_combine}
     {compute_op}
+    {handle_optional_outputs}
     {out_split}
     {return_result}
 }}
 
 """
 
+OPTIONAL_VECTOR_VALUE_INPUT_TEMPLATE = """
+    paddle::optional<pir::Value> optional_{name};
+    if (!{name}) {{
+        optional_{name} = paddle::make_optional<pir::Value>(pir::Value());
+    }} else {{
+        auto optional_{name}_combine_op = APIBuilder::Instance().GetBuilder()->Build<pir::CombineOp>({name}.get());
+        optional_{name} = paddle::make_optional<pir::Value>(optional_{name}_combine_op.out());
+    }}"""
+
+OPTIONAL_VALUE_INPUT_TEMPLATE = """
+    paddle::optional<pir::Value> optional_{name};
+    if (!{name}) {{
+        optional_{name} = paddle::make_optional<pir::Value>(pir::Value());
+    }} else {{
+        optional_{name} = {name};
+    }}"""
+
+OPTIONAL_OPRESULT_OUTPUT_TEMPLATE = """
+    paddle::optional<pir::OpResult> optional_{name};
+    if (!IsEmptyOpResult({op_name}_op.result({index}))) {{
+        optional_{name} = paddle::make_optional<pir::OpResult>({op_name}_op.result({index}));
+    }}"""
+
+OPTIONAL_VECTOR_OPRESULT_OUTPUT_TEMPLATE = """
+    paddle::optional<std::vector<pir::OpResult>> optional_{name};
+    if (!IsEmptyOpResult({op_name}_op.result({index}))) {{
+        auto optional_{name}_slice_op = APIBuilder::Instance().GetBuilder()->Build<pir::SplitOp>({op_name}_op.result({index}));
+        optional_{name} = paddle::make_optional<std::vector<pir::OpResult>>(optional_{name}_slice_op.outputs());
+    }}"""
+
 COMBINE_OP_TEMPLATE = """
     auto {op_name} = APIBuilder::Instance().GetBuilder()->Build<pir::CombineOp>({in_name});"""
 
@@ -88,6 +122,27 @@
 VECTOR_TYPE = 'pir::VectorType'
 INTARRAY_ATTRIBUTE = "paddle::dialect::IntArrayAttribute"
 
+INPUT_TYPE_MAP = {
+    'paddle::dialect::DenseTensorType': 'pir::Value',
+    'paddle::dialect::SelectedRowsType': 'pir::Value',
+    'pir::VectorType<paddle::dialect::DenseTensorType>': 'std::vector<pir::Value>',
+}
+OPTIONAL_INPUT_TYPE_MAP = {
+    'paddle::dialect::DenseTensorType': 'paddle::optional<pir::Value>',
+    'paddle::dialect::SelectedRowsType': 'paddle::optional<pir::Value>',
+    'pir::VectorType<paddle::dialect::DenseTensorType>': 'paddle::optional<std::vector<pir::Value>>',
+}
+OUTPUT_TYPE_MAP = {
+    'paddle::dialect::DenseTensorType': 'pir::OpResult',
+    'paddle::dialect::SelectedRowsType': 'pir::OpResult',
+    'pir::VectorType<paddle::dialect::DenseTensorType>': 'std::vector<pir::OpResult>',
+}
+OPTIONAL_OUTPUT_TYPE_MAP = {
+    'paddle::dialect::DenseTensorType': 'paddle::optional<pir::OpResult>',
+    'paddle::dialect::SelectedRowsType': 'paddle::optional<pir::OpResult>',
+    'pir::VectorType<paddle::dialect::DenseTensorType>': 'paddle::optional<std::vector<pir::OpResult>>',
+}
+
 
 def get_op_class_name(op_name):
     return to_pascal_case(op_name) + 'Op'
@@ -95,16 +150,7 @@ def get_op_class_name(op_name):
 
 class CodeGen:
     def __init__(self) -> None:
-        self._type_map = {
-            'paddle::dialect::DenseTensorType': 'pir::Value',
-            'paddle::dialect::SelectedRowsType': 'pir::Value',
-            'pir::VectorType<paddle::dialect::DenseTensorType>': 'std::vector<pir::Value>',
-        }
-        self._ret_type_map = {
-            'paddle::dialect::DenseTensorType': 'pir::OpResult',
-            'paddle::dialect::SelectedRowsType': 'pir::OpResult',
-            'pir::VectorType<paddle::dialect::DenseTensorType>': 'std::vector<pir::OpResult>',
-        }
+        pass
 
     def _parse_yaml(self, op_yaml_files, op_compat_yaml_file):
         op_compat_parser = OpCompatParser(op_compat_yaml_file)
@@ -141,16 +187,43 @@ def _need_skip(self, op_info, op_name):
             op_info.infer_meta_func is None and op_name not in PD_MANUAL_OP_LIST
         )
 
+    def _is_optional_input(self, op_info, input_name):
+        name_list = op_info.input_name_list
+        optional_list = op_info.input_optional_list
+        if (
+            input_name in name_list
+            and optional_list[name_list.index(input_name)] == 'true'
+        ):
+            return True
+        return False
+
+    def _is_optinonal_output(self, op_info, output_name):
+        inplace_map = op_info.inplace_map
+        input_optional_list = op_info.input_optional_list
+        input_name_list = op_info.input_name_list
+        if inplace_map is None:
+            return False
+
+        if output_name in inplace_map.keys():
+            input_index = input_name_list.index(inplace_map[output_name])
+            if input_optional_list[input_index] == 'true':
+                return True
+        return False
+
     # =====================================
     # Gen declare functions
     # =====================================
     def _gen_api_inputs(self, op_info):
         name_list = op_info.input_name_list
         type_list = op_info.input_type_list
-        assert len(name_list) == len(type_list)
+        optional_list = op_info.input_optional_list
+        assert len(name_list) == len(type_list) == len(optional_list)
         ret = []
-        for name, type in zip(name_list, type_list):
-            ret.append(f'const {self._type_map[type]}& {name}')
+        for name, type, optional in zip(name_list, type_list, optional_list):
+            if optional == 'true':
+                ret.append(f'const {OPTIONAL_INPUT_TYPE_MAP[type]}& {name}')
+            else:
+                ret.append(f'const {INPUT_TYPE_MAP[type]}& {name}')
         return ', '.join(ret)
 
     def _gen_api_attrs(
@@ -199,26 +272,31 @@ def _gen_api_args(
         return (inputs + ', ' + attrs).strip(', ')
 
     def _gen_ret_type(self, op_info):
+        name_list = op_info.output_name_list
         type_list = op_info.output_type_list
         intermediate_list = op_info.output_intermediate_list
-        assert len(type_list) == len(intermediate_list)
+        assert len(name_list) == len(type_list) == len(intermediate_list)
 
         output_num = len(type_list) - intermediate_list.count('true')
         if output_num > 1:
-            return 'std::tuple<{}>'.format(
-                ', '.join(
-                    [
-                        self._ret_type_map[type]
-                        for type, intermediate in zip(
-                            type_list, intermediate_list
-                        )
-                        if intermediate == 'false'
-                    ]
-                )
-            )
+            ret = []
+            for name, type, intermediate in zip(
+                name_list, type_list, intermediate_list
+            ):
+                if intermediate == 'true':
+                    continue
+                if self._is_optinonal_output(op_info, name):
+                    ret.append(OPTIONAL_OUTPUT_TYPE_MAP[type])
+                else:
+                    ret.append(OUTPUT_TYPE_MAP[type])
+            return 'std::tuple<{}>'.format(', '.join(ret))
         elif output_num == 1:
             index = intermediate_list.index('false')
-            return self._ret_type_map[type_list[index]]
+            name = name_list[index]
+            if self._is_optinonal_output(op_info, name):
+                return OPTIONAL_OUTPUT_TYPE_MAP[type_list[index]]
+            else:
+                return OUTPUT_TYPE_MAP[type_list[index]]
         elif output_num == 0:
             return 'void'
 
@@ -263,14 +341,56 @@ def _gen_h_file(self, op_info_items, namespaces, h_file_path):
     # =====================================
     # Gen impl functions
     # =====================================
+    def _gen_handle_optional_inputs(self, op_info):
+        name_list = op_info.input_name_list
+        optional_list = op_info.input_optional_list
+        type_list = op_info.input_type_list
+        assert len(name_list) == len(optional_list) == len(type_list)
+        ret = ''
+        for name, optional, type in zip(name_list, optional_list, type_list):
+            if optional == 'true':
+                if VECTOR_TYPE in type:
+                    ret += OPTIONAL_VECTOR_VALUE_INPUT_TEMPLATE.format(
+                        name=name
+                    )
+                else:
+                    ret += OPTIONAL_VALUE_INPUT_TEMPLATE.format(name=name)
+        return ret
+
+    def _gen_handle_optional_outputs(self, op_info, op_name):
+        name_list = op_info.output_name_list
+        type_list = op_info.output_type_list
+        intermediate_list = op_info.output_intermediate_list
+        ret = ''
+        for i, (name, type, intermediate) in enumerate(
+            zip(name_list, type_list, intermediate_list)
+        ):
+            if intermediate == 'true':
+                continue
+            if self._is_optinonal_output(op_info, name):
+                if VECTOR_TYPE in type:
+                    ret += OPTIONAL_VECTOR_OPRESULT_OUTPUT_TEMPLATE.format(
+                        name=name,
+                        op_name=op_name,
+                        index=i,
+                    )
+                else:
+                    ret += OPTIONAL_OPRESULT_OUTPUT_TEMPLATE.format(
+                        name=name,
+                        op_name=op_name,
+                        index=i,
+                    )
+        return ret
+
     def _gen_in_combine(self, op_info, is_mutable_attr, is_vector_mutable_attr):
         name_list = op_info.input_name_list
         type_list = op_info.input_type_list
-        assert len(name_list) == len(type_list)
+        optional_list = op_info.input_optional_list
+        assert len(name_list) == len(type_list) == len(optional_list)
         combine_op = ''
         combine_op_list = []
-        for name, type in zip(name_list, type_list):
-            if VECTOR_TYPE in type:
+        for name, type, optional in zip(name_list, type_list, optional_list):
+            if optional == 'false' and VECTOR_TYPE in type:
                 op_name = f'{name}_combine_op'
                 combine_op += COMBINE_OP_TEMPLATE.format(
                     op_name=op_name, in_name=name
@@ -313,7 +433,10 @@ def _gen_compute_op_args(
 
         for input_name, combine_op in zip(name_list, in_combine_op_list):
             if combine_op is None:
-                ret.append(input_name)
+                if self._is_optional_input(op_info, input_name):
+                    ret.append(f'optional_{input_name}.get()')
+                else:
+                    ret.append(input_name)
             else:
                 ret.append(f'{combine_op}.out()')
         if is_mutable_attr:
@@ -342,7 +465,13 @@ def _gen_out_split_and_ret_list(self, op_info, op_inst_name):
         name_list = op_info.output_name_list
         type_list = op_info.output_type_list
         intermediate_list = op_info.output_intermediate_list
-        assert len(name_list) == len(type_list) == len(intermediate_list)
+        optional_list = op_info.output_optional_list
+        assert (
+            len(name_list)
+            == len(type_list)
+            == len(intermediate_list)
+            == len(optional_list)
+        )
 
         split_op_str = ''
         ret_list = []
@@ -351,7 +480,9 @@ def _gen_out_split_and_ret_list(self, op_info, op_inst_name):
         ):
             if intermediate == 'true':
                 continue
-            if VECTOR_TYPE in type:
+            if self._is_optinonal_output(op_info, name):
+                ret_list.append(f'optional_{name}')
+            elif VECTOR_TYPE in type:
                 split_op_name = f'{name}_split_op'
                 split_op_str += SPLIT_OP_TEMPLATE.format(
                     op_name=split_op_name, in_name=f'{op_inst_name}.result({i})'
@@ -392,8 +523,12 @@ def _gen_one_impl(
             args=self._gen_api_args(
                 op_info, False, is_mutable_attr, is_vector_mutable_attr
             ),
+            handle_optional_inputs=self._gen_handle_optional_inputs(op_info),
             in_combine=in_combine,
             compute_op=compute_op,
+            handle_optional_outputs=self._gen_handle_optional_outputs(
+                op_info, op_name
+            ),
             out_split=out_split,
             return_result=self._gen_return_result(ret_list),
         )
diff --git a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py
index adb5270e975e6..5db36e3fb06e7 100644
--- a/paddle/fluid/pir/dialect/op_generator/python_c_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py
@@ -243,14 +243,24 @@ def _gen_h_file(self, op_info_items, namespaces, h_file_path):
     def _gen_inputs(self, op_info, op_name):
         name_list = op_info.input_name_list
         type_list = op_info.input_type_list
-        assert len(name_list) == len(type_list)
+        optional_list = op_info.input_optional_list
+        assert len(name_list) == len(type_list) == len(optional_list)
         ret = ''
-        for i, (name, type) in enumerate(zip(name_list, type_list)):
-            cast_func = (
-                'CastPyArg2VectorOfValue'
-                if VECTOR_TYPE in type
-                else 'CastPyArg2OpResult'
-            )
+        for i, (name, type, optional) in enumerate(
+            zip(name_list, type_list, optional_list)
+        ):
+            if optional == 'true':
+                cast_func = (
+                    'CastPyArg2OptionalVectorOfValue'
+                    if VECTOR_TYPE in type
+                    else 'CastPyArg2OptionalValue'
+                )
+            else:
+                cast_func = (
+                    'CastPyArg2VectorOfValue'
+                    if VECTOR_TYPE in type
+                    else 'CastPyArg2Value'
+                )
             ret += INPUT_TEMPLATE.format(
                 name=name, index=i, cast_func=cast_func, api_name=op_name
             )
@@ -316,7 +326,7 @@ def _gen_cast_attrs(self, op_info, op_name):
                         type='',
                         name_=name,
                         name=name,
-                        cast_func='CastPyArg2OpResult',
+                        cast_func='CastPyArg2Value',
                         api_name=op_name,
                         index=input_size + i,
                     )
@@ -338,7 +348,7 @@ def _gen_cast_attrs(self, op_info, op_name):
                         type='',
                         name_=name,
                         name=name,
-                        cast_func='CastPyArg2OpResult',
+                        cast_func='CastPyArg2Value',
                         api_name=op_name,
                         index=input_size + i,
                     )
diff --git a/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt b/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt
index 08a737f41fca2..befbb84a7117d 100644
--- a/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt
+++ b/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt
@@ -199,7 +199,11 @@ cc_library(
 cc_library(
   pd_op_dialect_api
   SRCS ${api_source_file} manual_api.cc
-  DEPS api_builder pd_op_dialect_op)
+  DEPS api_builder pd_op_dialect_op pd_op_dialect_utils)
+if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
+  set_target_properties(pd_op_dialect_api PROPERTIES COMPILE_FLAGS
+                                                     "-Wno-maybe-uninitialized")
+endif()
 
 target_include_directories(pd_op_dialect_api INTERFACE ${PD_DIALECT_BINARY_DIR})
 
diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc
index 29e2d7daf0ebe..843aa0ae44847 100644
--- a/paddle/fluid/pir/dialect/operator/utils/utils.cc
+++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc
@@ -197,5 +197,9 @@ VariantType GetAttributeData(const pir::Attribute& attr) {
 
 bool IsLegacyOp(const std::string& name) { return LegacyOpList.count(name); }
 
+bool IsEmptyOpResult(const pir::OpResult& op_result) {
+  return !op_result.impl() || op_result.type().isa<pir::Type>();
+}
+
 }  // namespace dialect
 }  // namespace paddle
diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.h b/paddle/fluid/pir/dialect/operator/utils/utils.h
index 0d24a03dc635d..791cc2bb005c3 100644
--- a/paddle/fluid/pir/dialect/operator/utils/utils.h
+++ b/paddle/fluid/pir/dialect/operator/utils/utils.h
@@ -170,5 +170,7 @@ VariantType GetAttributeData(const pir::Attribute& attr);
 
 bool IsLegacyOp(const std::string& name);
 
+bool IsEmptyOpResult(const pir::OpResult& op_result);
+
 }  // namespace dialect
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 87660d9fd88ca..1c32c8e9b6a94 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -1516,13 +1516,11 @@ paddle::experimental::Scalar CastNumpy2Scalar(PyObject* obj,
   }
 }
 
-pir::OpResult CastPyArg2OpResult(PyObject* obj,
-                                 const std::string& op_type,
-                                 size_t arg_pos) {
+pir::Value CastPyArg2Value(PyObject* obj,
+                           const std::string& op_type,
+                           size_t arg_pos) {
   if (PyObject_TypeCheck(obj, g_ir_opresult_pytype)) {
     return ::pybind11::handle(obj).cast<pir::OpResult>();
-  } else if (obj == nullptr || obj == Py_None) {
-    return pir::OpResult();
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "%s(): argument (position %d) must be "
@@ -1533,6 +1531,16 @@ pir::OpResult CastPyArg2OpResult(PyObject* obj,
   }
 }
 
+paddle::optional<pir::Value> CastPyArg2OptionalValue(PyObject* obj,
+                                                     const std::string& op_type,
+                                                     size_t arg_pos) {
+  if (obj == nullptr || obj == Py_None) {
+    return paddle::none;
+  }
+  return paddle::make_optional<pir::Value>(
+      CastPyArg2Value(obj, op_type, arg_pos));
+}
+
 std::vector<pir::Value> CastPyArg2VectorOfValue(PyObject* obj,
                                                 const std::string& op_type,
                                                 size_t arg_pos) {
@@ -1577,8 +1585,6 @@ std::vector<pir::Value> CastPyArg2VectorOfValue(PyObject* obj,
     }
   } else if (PyObject_TypeCheck(obj, g_ir_opresult_pytype)) {
     return {::pybind11::handle(obj).cast<pir::Value>()};
-  } else if (obj == Py_None) {
-    return {};
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "%s(): argument (position %d) must be "
@@ -1590,6 +1596,15 @@ std::vector<pir::Value> CastPyArg2VectorOfValue(PyObject* obj,
   return value_list;
 }
 
+paddle::optional<std::vector<pir::Value>> CastPyArg2OptionalVectorOfValue(
+    PyObject* obj, const std::string& op_type, size_t arg_pos) {
+  if (obj == nullptr || obj == Py_None) {
+    return paddle::none;
+  }
+  return paddle::make_optional<std::vector<pir::Value>>(
+      CastPyArg2VectorOfValue(obj, op_type, arg_pos));
+}
+
 paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj,
                                               const std::string& op_type,
                                               ssize_t arg_pos) {
diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
index 3cfeb1b5c37b3..b8d6a38546573 100644
--- a/paddle/fluid/pybind/eager_utils.h
+++ b/paddle/fluid/pybind/eager_utils.h
@@ -76,12 +76,17 @@ std::vector<int> CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos);
 std::vector<int64_t> CastPyArg2VectorOfInt64(PyObject* obj, size_t arg_pos);
 std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos);
 std::vector<float> CastPyArg2VectorOfFloat(PyObject* obj, size_t arg_pos);
-pir::OpResult CastPyArg2OpResult(PyObject* obj,
-                                 const std::string& op_type,
-                                 size_t arg_pos);
+pir::Value CastPyArg2Value(PyObject* obj,
+                           const std::string& op_type,
+                           size_t arg_pos);
+paddle::optional<pir::Value> CastPyArg2OptionalValue(PyObject* obj,
+                                                     const std::string& op_type,
+                                                     size_t arg_pos);
 std::vector<pir::Value> CastPyArg2VectorOfValue(PyObject* obj,
                                                 const std::string& op_type,
                                                 size_t arg_pos);
+paddle::optional<std::vector<pir::Value>> CastPyArg2OptionalVectorOfValue(
+    PyObject* obj, const std::string& op_type, size_t arg_pos);
 std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t(
     PyObject* obj, size_t arg_pos);
 framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj,
diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h
index 7c32b2ab1d4fa..789aad40d132e 100644
--- a/paddle/fluid/pybind/manual_static_op_function.h
+++ b/paddle/fluid/pybind/manual_static_op_function.h
@@ -54,7 +54,7 @@ static PyObject *static_api_set_parameter(PyObject *self,
 
     // Get OpResult from args
     PyObject *parameter_obj = PyTuple_GET_ITEM(args, 0);
-    auto parameter = CastPyArg2OpResult(parameter_obj, "parameter", 0);
+    auto parameter = CastPyArg2Value(parameter_obj, "parameter", 0);
 
     // Parse Attributes
     PyObject *name_obj = PyTuple_GET_ITEM(args, 1);
@@ -94,7 +94,7 @@ PyObject *static_api_full(PyObject *self, PyObject *args, PyObject *kwargs) {
       pir::Value shape, value;
 
       if (PyObject_CheckIROpResult(shape_obj)) {
-        shape = CastPyArg2OpResult(shape_obj, "full", 0);
+        shape = CastPyArg2Value(shape_obj, "full", 0);
       } else if (PyObject_CheckIRVectorOfOpResult(shape_obj)) {
         std::vector<pir::Value> shape_tmp =
             CastPyArg2VectorOfValue(shape_obj, "full", 0);
@@ -106,7 +106,7 @@ PyObject *static_api_full(PyObject *self, PyObject *args, PyObject *kwargs) {
       }
 
       if (PyObject_CheckIROpResult(value_obj)) {
-        value = CastPyArg2OpResult(value_obj, "full", 1);
+        value = CastPyArg2Value(value_obj, "full", 1);
       } else {
         float value_tmp = CastPyArg2Float(value_obj, "full", 1);
         value = paddle::dialect::full(std::vector<int64_t>{1},
diff --git a/paddle/pir/core/builtin_op.cc b/paddle/pir/core/builtin_op.cc
index c2e58fe385a2b..21e0357c700fc 100644
--- a/paddle/pir/core/builtin_op.cc
+++ b/paddle/pir/core/builtin_op.cc
@@ -163,16 +163,12 @@ void CombineOp::Build(Builder &builder,
                       OperationArgument &argument,
                       const std::vector<Value> &inputs) {
   argument.inputs = inputs;
-  if (inputs.size() == 0) {
-    argument.output_types.emplace_back(pir::Type());
-  } else {
-    std::vector<pir::Type> inputs_type(inputs.size());
-    for (size_t idx = 0; idx < inputs.size(); ++idx) {
-      inputs_type[idx] = inputs[idx].type();
-    }
-    argument.output_types.emplace_back(
-        pir::VectorType::get(builder.ir_context(), inputs_type));
+  std::vector<pir::Type> inputs_type(inputs.size());
+  for (size_t idx = 0; idx < inputs.size(); ++idx) {
+    inputs_type[idx] = inputs[idx].type();
   }
+  argument.output_types.emplace_back(
+      pir::VectorType::get(builder.ir_context(), inputs_type));
   PassStopGradientsDefaultly(argument);
 }
 

From 09b92cdc0debeb446b57e17bde7cb0cc0fb846b4 Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Fri, 22 Sep 2023 16:57:25 +0800
Subject: [PATCH 071/115] [Fix] Remove useless print in test_initializer.py
 (#57621)

* add float64 for API TruncatedNormal in GPU and CPU

* add float64 for API Assign in GPU and CPU

* remove _fp32 for 2 UT function

* add fp64 in static_ops.yaml

* remove TestAssignValueOp5

* add TestTruncatedNormalInitializerDygraph

* add unitest for Assign

* derived from unitest.TestCase

* update unitest

* add restore dtype code for unitest

* use dygraph_guard

* update fp64 for assign_value op maker

* update op_translator.cc

* update code

* update UT code

* remove reduncant code in paddle/fluid/ir_adaptor/translator/op_translator.cc

* remove redundant print
---
 test/legacy_test/test_initializer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/legacy_test/test_initializer.py b/test/legacy_test/test_initializer.py
index 903f47671549e..44c952fa38d74 100644
--- a/test/legacy_test/test_initializer.py
+++ b/test/legacy_test/test_initializer.py
@@ -1310,7 +1310,6 @@ def norm_cdf(x):
         _tensor = np.random.uniform(
             low=2 * l - 1, high=2 * u - 1, size=tensor.shape
         ).astype(paddle.get_default_dtype())
-        print(2 * l - 1, 2 * u - 1)
 
         # Use inverse cdf transform for normal distribution to get truncated
         # standard normal

From a14678933fb0b5f8badf8a095d22ee5aaae28f75 Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Fri, 22 Sep 2023 17:46:37 +0800
Subject: [PATCH 072/115] [PIR] Support int/float/double for Operator Override
 (#57553)

* [PIR] Support int/float/double for Operator Override
---
 paddle/fluid/pybind/pir.cc               | 143 +++++++++++++++--------
 test/ir/new_ir/CMakeLists.txt            |   2 +-
 test/ir/new_ir/test_override_operator.py | 128 ++++++++++++++++++++
 3 files changed, 224 insertions(+), 49 deletions(-)
 create mode 100644 test/ir/new_ir/test_override_operator.py

diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc
index 1eafe39eb19a5..56accc98a3fec 100644
--- a/paddle/fluid/pybind/pir.cc
+++ b/paddle/fluid/pybind/pir.cc
@@ -51,6 +51,7 @@
 namespace py = pybind11;
 using paddle::dialect::APIBuilder;
 using paddle::dialect::DenseTensorType;
+using paddle::dialect::SelectedRowsType;
 using pir::Block;
 using pir::Operation;
 using pir::OpOperand;
@@ -442,6 +443,70 @@ void SetOpResultBoolAttr(const OpResult &self,
       attr_name, pir::ArrayAttribute::get(pir::IrContext::Instance(), attrs));
 }
 
+phi::DataType GetOpResultDtype(const OpResult &result) {
+  if (result.type().isa<DenseTensorType>()) {
+    return paddle::dialect::TransToPhiDataType(
+        result.type().dyn_cast<DenseTensorType>().dtype());
+  } else if (result.type().isa<SelectedRowsType>()) {
+    return paddle::dialect::TransToPhiDataType(
+        result.type().dyn_cast<SelectedRowsType>().dtype());
+  } else {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "Currently, we can only get phi::DataType from DenseTensorType and "
+        "SelectedRowsType."));
+  }
+}
+
+#define OVERRIDE_OPERATOR(operator, api, other_type)              \
+  op_result.def(#operator, [](OpResult &self, other_type other) { \
+    return paddle::dialect::api(self, other);                     \
+  });
+
+#define OVERRIDE_OPERATOR_WITH_SCALE(operator,                    \
+                                     other_type,                  \
+                                     scale_value,                 \
+                                     bias_value,                  \
+                                     bias_after_scale)            \
+  op_result.def(#operator, [](OpResult &self, other_type other) { \
+    return paddle::dialect::scale(                                \
+        self, scale_value, bias_value, bias_after_scale);         \
+  });
+
+#define OVERRIDE_OPERATOR_FOR_EACH(operator,         \
+                                   api,              \
+                                   scale_value,      \
+                                   bias_value,       \
+                                   bias_after_scale) \
+  OVERRIDE_OPERATOR(operator, api, OpResult)         \
+  OVERRIDE_OPERATOR_WITH_SCALE(operator,             \
+                               int,                  \
+                               scale_value,          \
+                               bias_value,           \
+                               bias_after_scale)     \
+  OVERRIDE_OPERATOR_WITH_SCALE(operator,             \
+                               float,                \
+                               scale_value,          \
+                               bias_value,           \
+                               bias_after_scale)     \
+  OVERRIDE_OPERATOR_WITH_SCALE(operator,             \
+                               double,               \
+                               scale_value,          \
+                               bias_value,           \
+                               bias_after_scale)
+
+#define OVERRIDE_COMPARE_OP_WITH_FULL(operator, api, other_type)            \
+  op_result.def(#operator, [](OpResult &self, other_type other) {           \
+    auto rhs =                                                              \
+        paddle::dialect::full(/*shape=*/{}, other, GetOpResultDtype(self)); \
+    return paddle::dialect::api(self, rhs);                                 \
+  });
+
+#define OVERRIDE_COMPARE_OP_FOR_EACH(operator, api)   \
+  OVERRIDE_OPERATOR(operator, api, OpResult)          \
+  OVERRIDE_COMPARE_OP_WITH_FULL(operator, api, int)   \
+  OVERRIDE_COMPARE_OP_WITH_FULL(operator, api, float) \
+  OVERRIDE_COMPARE_OP_WITH_FULL(operator, api, double)
+
 void BindOpResult(py::module *m) {
   py::class_<OpResult> op_result(*m, "OpResult", R"DOC(
     OpResult class represents the value(output) defined by a result of operation.
@@ -451,6 +516,18 @@ void BindOpResult(py::module *m) {
         when build network.
   )DOC");
   g_ir_opresult_pytype = reinterpret_cast<PyTypeObject *>(op_result.ptr());
+
+  // For basaic operators
+  OVERRIDE_OPERATOR_FOR_EACH(__add__, add, 1.0, other, true);
+  OVERRIDE_OPERATOR_FOR_EACH(__sub__, subtract, 1.0, -1.0 * other, true);
+  OVERRIDE_OPERATOR_FOR_EACH(__mul__, multiply, other, 0.0, false);
+  OVERRIDE_OPERATOR_FOR_EACH(__truediv__, divide, 1.0 / other, 0.0, false);
+  // For compare opeartors
+  OVERRIDE_COMPARE_OP_FOR_EACH(__lt__, less_than);
+  OVERRIDE_COMPARE_OP_FOR_EACH(__le__, less_equal);
+  OVERRIDE_COMPARE_OP_FOR_EACH(__gt__, greater_than);
+  OVERRIDE_COMPARE_OP_FOR_EACH(__ge__, greater_equal);
+
   op_result.def("__eq__", &OpResult::operator==)
       .def("__eq__",
            [](OpResult &self, Value &other) {
@@ -460,42 +537,6 @@ void BindOpResult(py::module *m) {
            [](OpResult &self) {
              return paddle::dialect::scale(self, -1.0, 0.0, true);
            })
-      .def("__add__",
-           [](OpResult &self, OpResult &other) {
-             return paddle::dialect::add(self, other);
-           })
-      .def("__add__",
-           [](OpResult &self, float &bias) {
-             return paddle::dialect::scale(self, 1.0, bias, false);
-           })
-      .def("__sub__",
-           [](OpResult &self, OpResult &other) {
-             return paddle::dialect::subtract(self, other);
-           })
-      .def("__mul__",
-           [](OpResult &self, OpResult &other) {
-             return paddle::dialect::multiply(self, other);
-           })
-      .def("__truediv__",
-           [](OpResult &self, OpResult &other) {
-             return paddle::dialect::divide(self, other);
-           })
-      .def("__lt__",
-           [](OpResult &self, OpResult &other) {
-             return paddle::dialect::less_than(self, other);
-           })
-      .def("__le__",
-           [](OpResult &self, OpResult &other) {
-             return paddle::dialect::less_equal(self, other);
-           })
-      .def("__gt__",
-           [](OpResult &self, OpResult &other) {
-             return paddle::dialect::greater_than(self, other);
-           })
-      .def("__ge__",
-           [](OpResult &self, OpResult &other) {
-             return paddle::dialect::greater_equal(self, other);
-           })
       .def("__hash__",
            [](OpResult &self) { return std::hash<pir::Value>{}(self); })
       .def("__str__",
@@ -532,7 +573,8 @@ void BindOpResult(py::module *m) {
               return param_name;
             } else {
               PADDLE_THROW(phi::errors::InvalidArgument(
-                  "Currently, we can only get name of OpResult that is "
+                  "Currently, we can only get name of OpResult that "
+                  "is "
                   "persistable"));
             }
           })
@@ -554,8 +596,8 @@ void BindOpResult(py::module *m) {
             return GetOpResultBoolAttr(self, kAttrStopGradients);
           },
           [](OpResult &self, bool stop_gradient) {
-            // NOTE(Aurelius84): For other OpResult, set theirs stop_gradient
-            // default value as true.
+            // NOTE(Aurelius84): For other OpResult, set theirs
+            // stop_gradient default value as true.
             SetOpResultBoolAttr(self,
                                 kAttrStopGradients,
                                 stop_gradient,
@@ -567,8 +609,8 @@ void BindOpResult(py::module *m) {
             return GetOpResultBoolAttr(self, kAttrIsPersisable);
           },
           [](OpResult &self, bool is_persistable) {
-            // NOTE(Aurelius84): For other OpResult, set theirs is_persistable
-            // default value as false.
+            // NOTE(Aurelius84): For other OpResult, set theirs
+            // is_persistable default value as false.
             SetOpResultBoolAttr(self,
                                 kAttrIsPersisable,
                                 is_persistable,
@@ -582,7 +624,8 @@ void BindOpResult(py::module *m) {
                   self.type().dyn_cast<DenseTensorType>().dims());
             } else {
               PADDLE_THROW(phi::errors::InvalidArgument(
-                  "Currently, we can only get shape for dense tensor."));
+                  "Currently, we can only get shape for dense "
+                  "tensor."));
             }
           },
           [](OpResult &self, const std::vector<int> &shape) {
@@ -597,7 +640,8 @@ void BindOpResult(py::module *m) {
                   self.type().dyn_cast<DenseTensorType>().dtype());
             } else {
               PADDLE_THROW(phi::errors::InvalidArgument(
-                  "Currently, we can only get dtype for dense tensor."));
+                  "Currently, we can only get dtype for dense "
+                  "tensor."));
             }
           },
           [](OpResult &self, phi::DataType dtype) {
@@ -642,7 +686,8 @@ Operation *BuildOpFrom(
                  });
   auto *cloned_op = Operation::Create(std::move(to_create_argument));
 
-  // update the mapping of value_map. std::transform is a map(func, zip()).
+  // update the mapping of value_map. std::transform is a map(func,
+  // zip()).
   std::vector<int> tmp;
   std::transform(origin_results.begin(),
                  origin_results.end(),
@@ -905,10 +950,12 @@ SplitedResult ForwardBackwardSplit(
   mapping_value(forward_inputs, forward_value_map, fx);   // write 'fx'
   mapping_value(forward_inputs, backward_value_map, bx);  // write 'bx'
   mapping_value(forward_outputs, forward_value_map, fo);  // write 'fo'
-  mapping_value(
-      forward_inputs_grads, backward_value_map, bx_g);  // write 'fx_g'
-  mapping_value(
-      forward_outputs_grads, backward_value_map, bo_g);    // write 'bo_g'
+  mapping_value(forward_inputs_grads,
+                backward_value_map,
+                bx_g);  // write 'fx_g'
+  mapping_value(forward_outputs_grads,
+                backward_value_map,
+                bo_g);                                     // write 'bo_g'
   mapping_value(forward_outputs, backward_value_map, bo);  // write 'bo'
 
   std::map<std::string, std::vector<pir::Value>> attr = {{"fx", fx},
diff --git a/test/ir/new_ir/CMakeLists.txt b/test/ir/new_ir/CMakeLists.txt
index 75587db97c088..cad2633fb1aa4 100644
--- a/test/ir/new_ir/CMakeLists.txt
+++ b/test/ir/new_ir/CMakeLists.txt
@@ -6,7 +6,7 @@ string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}")
 
 set(TEST_IR_SYSTEM_CASES
     test_build_model test_pd_inplace_pass test_symbol_overload
-    test_new_ir_to_static test_stop_gradient)
+    test_new_ir_to_static test_stop_gradient test_override_operator)
 list(REMOVE_ITEM TEST_INTERP_CASES ${TEST_IR_SYSTEM_CASES})
 
 foreach(target ${TEST_INTERP_CASES})
diff --git a/test/ir/new_ir/test_override_operator.py b/test/ir/new_ir/test_override_operator.py
new file mode 100644
index 0000000000000..7d88a94b310e5
--- /dev/null
+++ b/test/ir/new_ir/test_override_operator.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+
+paddle.enable_static()
+
+
+class TestOperatorBase(unittest.TestCase):
+    def setUp(self):
+        self.shape = [4, 16]
+
+    def check_operator(self, operator_func, expected_out):
+        main_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program):
+            x = paddle.ones(self.shape, dtype='float32') * 2
+            out = operator_func(x)
+
+            exe = paddle.static.Executor(paddle.CPUPlace())
+            res = exe.run(main_program, fetch_list=[out])
+            np.testing.assert_almost_equal(res[0], expected_out)
+
+
+class TestOperator(TestOperatorBase):
+    def test_add(self):
+        operator_func = lambda x: x + x
+        expected_out = np.ones(self.shape, dtype='float32') * 4
+        self.check_operator(operator_func, expected_out)
+
+    def test_sub(self):
+        operator_func = lambda x: x - x
+        expected_out = np.ones(self.shape, dtype='float32') * 0
+        self.check_operator(operator_func, expected_out)
+
+    def test_mul(self):
+        operator_func = lambda x: x * x
+        expected_out = np.ones(self.shape, dtype='float32') * 4
+        self.check_operator(operator_func, expected_out)
+
+    def test_div(self):
+        operator_func = lambda x: x / x
+        expected_out = np.ones(self.shape, dtype='float32') * 1
+        self.check_operator(operator_func, expected_out)
+
+
+class TestOperatorWithScale(TestOperatorBase):
+    def test_add(self):
+        operator_func = lambda x: x + 1
+        expected_out = np.ones(self.shape, dtype='float32') * 3
+        self.check_operator(operator_func, expected_out)
+
+    def test_sub(self):
+        operator_func = lambda x: x - 1.0
+        expected_out = np.ones(self.shape, dtype='float32')
+        self.check_operator(operator_func, expected_out)
+
+    def test_mul(self):
+        operator_func = lambda x: x * 2
+        expected_out = np.ones(self.shape, dtype='float32') * 4
+        self.check_operator(operator_func, expected_out)
+
+    def test_div(self):
+        operator_func = lambda x: x / 2.0
+        expected_out = np.ones(self.shape, dtype='float32') * 1
+        self.check_operator(operator_func, expected_out)
+
+
+class TestCompareOperator(TestOperatorBase):
+    def test_lt(self):
+        operator_func = lambda x: x < x - 1
+        expected_out = np.zeros(self.shape, dtype='bool')
+        self.check_operator(operator_func, expected_out)
+
+    def test_gt(self):
+        operator_func = lambda x: x > x - 1
+        expected_out = np.ones(self.shape, dtype='bool')
+        self.check_operator(operator_func, expected_out)
+
+    def test_le(self):
+        operator_func = lambda x: x <= x
+        expected_out = np.ones(self.shape, dtype='bool')
+        self.check_operator(operator_func, expected_out)
+
+    def test_ge(self):
+        operator_func = lambda x: x >= x + 1
+        expected_out = np.zeros(self.shape, dtype='bool')
+        self.check_operator(operator_func, expected_out)
+
+
+class TestCompareOpWithFull(TestOperatorBase):
+    def test_lt(self):
+        operator_func = lambda x: x < 1
+        expected_out = np.zeros(self.shape, dtype='bool')
+        self.check_operator(operator_func, expected_out)
+
+    def test_gt(self):
+        operator_func = lambda x: x > 1.0
+        expected_out = np.ones(self.shape, dtype='bool')
+        self.check_operator(operator_func, expected_out)
+
+    def test_le(self):
+        operator_func = lambda x: x <= 2
+        expected_out = np.ones(self.shape, dtype='bool')
+        self.check_operator(operator_func, expected_out)
+
+    def test_ge(self):
+        operator_func = lambda x: x >= 3.0
+        expected_out = np.zeros(self.shape, dtype='bool')
+        self.check_operator(operator_func, expected_out)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 14c282cfb04aca8f5127217eae53956f64bc549e Mon Sep 17 00:00:00 2001
From: tianhaodongbd <137985359+tianhaodongbd@users.noreply.github.com>
Date: Fri, 22 Sep 2023 18:48:26 +0800
Subject: [PATCH 073/115] Compilation optimization for broadcast (#57118)

* Compilation optimization for broadcast

* replaced ReduceKernel with SumKernel
---
 .../phi/kernels/gpu/llm_int8_linear_kernel.cu |  6 ++--
 .../phi/kernels/gpu/reduce_amin_amax_common.h | 32 ++++++++-----------
 paddle/phi/kernels/gpu/reduce_kernel.cu       | 11 ++-----
 3 files changed, 18 insertions(+), 31 deletions(-)

diff --git a/paddle/phi/kernels/gpu/llm_int8_linear_kernel.cu b/paddle/phi/kernels/gpu/llm_int8_linear_kernel.cu
index 3435a450ffd43..7af12773fb3b2 100644
--- a/paddle/phi/kernels/gpu/llm_int8_linear_kernel.cu
+++ b/paddle/phi/kernels/gpu/llm_int8_linear_kernel.cu
@@ -16,6 +16,7 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/elementwise_add_kernel.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11020
@@ -54,10 +55,7 @@ void llm_int8_compute(const Context& dev_ctx,
                        k,
                        n);
   if (bias) {
-    std::vector<const phi::DenseTensor*> ins = {out, &(bias.get())};
-    std::vector<phi::DenseTensor*> outs = {out};
-    phi::funcs::BroadcastKernel<T>(
-        dev_ctx, ins, &outs, phi::funcs::AddFunctor<T>());
+    phi::AddKernel<T, Context>(dev_ctx, *out, bias.get(), out);
   }
 #else
   PADDLE_THROW(phi::errors::Unimplemented(
diff --git a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h
index fb0eace755ed5..8964c2547886b 100644
--- a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h
+++ b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h
@@ -14,11 +14,14 @@
 #pragma once
 
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/elementwise_divide_kernel.h"
+#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/compare_functors.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
+#include "paddle/phi/kernels/reduce_sum_kernel.h"
 
 namespace phi {
 
@@ -81,29 +84,20 @@ void ReduceCudaAMaxAMinGrad(const Context& dev_ctx,
   funcs::BroadcastKernel<T>(
       dev_ctx, equal_inputs, &equal_outputs, funcs::EqualFunctor<T>(), 0);
   // 2. equal_count = reduceSum(equal_out)
-  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  phi::funcs::
-      ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T, MPType>>(
-          dev_ctx,
-          equal_out_tensor,
-          equal_count,
-          kps::IdentityFunctor<T, MPType>(),
-          reduce_dims,
-          false);
+  phi::SumKernel<T, Context>(dev_ctx,
+                             equal_out_tensor,
+                             reduce_dims,
+                             equal_out_tensor.dtype(),
+                             false,
+                             equal_count);
 
   // 3. dx = dout * 1
-  std::vector<const phi::DenseTensor*> mul_inputs = {&new_dout,
-                                                     &equal_out_tensor};
-  std::vector<phi::DenseTensor*> mul_outputs = {&equal_out_tensor};
-  funcs::BroadcastKernel<T>(
-      dev_ctx, mul_inputs, &mul_outputs, funcs::MultiplyFunctor<T>(), 0);
+  phi::MultiplyKernel<T, Context>(
+      dev_ctx, new_dout, equal_out_tensor, &equal_out_tensor);
 
   // 4. dx = Div(dx, equal_out)
-  std::vector<const phi::DenseTensor*> grad_inputs = {&equal_out_tensor,
-                                                      equal_count};
-  std::vector<phi::DenseTensor*> grad_outputs = {new_dx_tensor};
-  funcs::BroadcastKernel<T>(
-      dev_ctx, grad_inputs, &grad_outputs, funcs::DivideFunctor<T>(), 0);
+  phi::DivideKernel<T, Context>(
+      dev_ctx, equal_out_tensor, *equal_count, new_dx_tensor);
   delete equal_out;
   delete equal_count;
 }
diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu
index 3b74ddb197024..969a3dd1d9ca5 100644
--- a/paddle/phi/kernels/gpu/reduce_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_kernel.cu
@@ -22,6 +22,7 @@
 #include "paddle/phi/kernels/reduce_sum_grad_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/compare_functors.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
@@ -115,10 +116,7 @@ void ReduceMinGradKernel(const Context& dev_ctx,
       dev_ctx, equal_inputs, &equal_outputs, funcs::EqualFunctor<T>(), 0);
 
   // 2. dx = dout * 1
-  std::vector<const phi::DenseTensor*> mul_inputs = {&new_out_grad, equal_out};
-  std::vector<phi::DenseTensor*> mul_outputs = {x_grad};
-  funcs::BroadcastKernel<T>(
-      dev_ctx, mul_inputs, &mul_outputs, funcs::MultiplyFunctor<T>(), 0);
+  phi::MultiplyKernel<T, Context>(dev_ctx, new_out_grad, *equal_out, x_grad);
   delete equal_out;
 }
 
@@ -201,10 +199,7 @@ void ReduceMaxGradKernel(const Context& dev_ctx,
       dev_ctx, equal_inputs, &equal_outputs, funcs::EqualFunctor<T>(), 0);
 
   // 2. dx = dout * 1
-  std::vector<const phi::DenseTensor*> mul_inputs = {&new_out_grad, equal_out};
-  std::vector<phi::DenseTensor*> mul_outputs = {x_grad};
-  funcs::BroadcastKernel<T>(
-      dev_ctx, mul_inputs, &mul_outputs, funcs::MultiplyFunctor<T>(), 0);
+  phi::MultiplyKernel<T, Context>(dev_ctx, new_out_grad, *equal_out, x_grad);
   delete equal_out;
 }
 

From f10dede4131953101c893d90ec0d3c230330b6f8 Mon Sep 17 00:00:00 2001
From: tianhaodongbd <137985359+tianhaodongbd@users.noreply.github.com>
Date: Fri, 22 Sep 2023 18:58:34 +0800
Subject: [PATCH 074/115] optimized dropout_kernel & unary_kernel (#57551)

---
 paddle/phi/kernels/funcs/dropout_impl.cu.h    | 17 ++++---------
 paddle/phi/kernels/sparse/gpu/unary_kernel.cu | 24 ++++---------------
 2 files changed, 9 insertions(+), 32 deletions(-)

diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h
index a1fc2c225ecf2..23756b3bdde96 100644
--- a/paddle/phi/kernels/funcs/dropout_impl.cu.h
+++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/functors.h"
 #include "paddle/phi/kernels/primitive/compute_primitives.h"
 #include "paddle/phi/kernels/primitive/datamover_primitives.h"
+#include "paddle/phi/kernels/scale_kernel.h"
 
 namespace phi {
 namespace funcs {
@@ -255,17 +256,6 @@ __global__ void VectorizedGeneratorMask(const size_t n,
   }
 }
 
-template <typename T, typename MT>
-void ScaleByDropoutFactor(const phi::GPUContext& dev_ctx,
-                          const phi::DenseTensor& x,
-                          phi::DenseTensor* y,
-                          MT factor) {
-  std::vector<const phi::DenseTensor*> ins = {&x};
-  std::vector<phi::DenseTensor*> outs = {y};
-  auto functor = phi::funcs::ScaleFunctor<T>(factor);
-  phi::funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
-}
-
 template <typename T>
 void DropoutFwGPUKernelDriver(
     const phi::GPUContext& dev_ctx,
@@ -389,7 +379,7 @@ void DropoutFwGPUKernelDriver(
       using MT = typename phi::dtype::MPTypeTrait<T>::Type;
       MT factor = static_cast<MT>(1.0f - dropout_prob);
       // y = factor * x
-      ScaleByDropoutFactor<T, MT>(dev_ctx, x, y, factor);
+      phi::ScaleKernel<T, phi::GPUContext>(dev_ctx, x, factor, 0.0f, false, y);
     }
   }
 }
@@ -425,7 +415,8 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
   if (is_test) {
     MT factor = static_cast<MT>(upscale_in_train ? 1.0f : 1.0f - dropout_prob);
     // y = factor * x
-    ScaleByDropoutFactor<T, MT>(dev_ctx, grad_y, grad_x, factor);
+    phi::ScaleKernel<T, phi::GPUContext>(
+        dev_ctx, grad_y, factor, 0.0f, false, grad_x);
   } else {
     if (upscale_in_train && dropout_prob == 1.0f) {
 #ifdef PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu
index 3b6e84664f93c..22392a9fea5d5 100644
--- a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu
@@ -17,22 +17,12 @@
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
+#include "paddle/phi/kernels/scale_kernel.h"
 #include "paddle/phi/kernels/sparse/impl/unary_kernel_impl.h"
 
 namespace phi {
 namespace sparse {
 
-template <typename T>
-struct DivScalarFunctor {
-  T value_;
-
-  explicit DivScalarFunctor(T value) : value_(value) {}
-
-  __device__ __forceinline__ T operator()(const T x) const {
-    return x / value_;
-  }
-};
-
 template <typename T, typename Context>
 void DivScalarCooKernel(const Context& dev_ctx,
                         const SparseCooTensor& x,
@@ -40,10 +30,8 @@ void DivScalarCooKernel(const Context& dev_ctx,
                         SparseCooTensor* out) {
   EmptyLikeCooKernel<T, Context>(dev_ctx, x, out);
 
-  std::vector<const DenseTensor*> ins = {&(x.values())};
-  std::vector<DenseTensor*> outs = {out->mutable_values()};
-  DivScalarFunctor<T> func(static_cast<T>(scalar));
-  funcs::ElementwiseKernel<T, DivScalarFunctor<T>>(dev_ctx, ins, &outs, func);
+  phi::ScaleKernel<T, Context>(
+      dev_ctx, x.values(), 1 / scalar, 0.0f, false, out->mutable_values());
 }
 
 template <typename T, typename Context>
@@ -53,10 +41,8 @@ void DivScalarCsrKernel(const Context& dev_ctx,
                         SparseCsrTensor* out) {
   EmptyLikeCsrKernel<T, Context>(dev_ctx, x, out);
 
-  std::vector<const DenseTensor*> ins = {&(x.values())};
-  std::vector<DenseTensor*> outs = {out->mutable_values()};
-  DivScalarFunctor<T> func(static_cast<T>(scalar));
-  funcs::ElementwiseKernel<T, DivScalarFunctor<T>>(dev_ctx, ins, &outs, func);
+  phi::ScaleKernel<T, Context>(
+      dev_ctx, x.values(), 1 / scalar, 0.0f, false, out->mutable_values());
 }
 
 }  // namespace sparse

From acad2717347d89c8e565703d9ebf5f013111d362 Mon Sep 17 00:00:00 2001
From: tianhaodongbd <137985359+tianhaodongbd@users.noreply.github.com>
Date: Fri, 22 Sep 2023 18:59:35 +0800
Subject: [PATCH 075/115] optimized expand_as_kernel (#57509)

---
 paddle/phi/kernels/gpu/expand_as_kernel.cu | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/paddle/phi/kernels/gpu/expand_as_kernel.cu b/paddle/phi/kernels/gpu/expand_as_kernel.cu
index 603e43482b929..b296b5b7e2014 100644
--- a/paddle/phi/kernels/gpu/expand_as_kernel.cu
+++ b/paddle/phi/kernels/gpu/expand_as_kernel.cu
@@ -18,6 +18,7 @@
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/expand_kernel.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 
 namespace phi {
@@ -70,11 +71,7 @@ void ExpandAsKernel(const Context& ctx,
     }
   }
 
-  out->Resize(phi::make_ddim(target_shape));
-  ctx.template Alloc<T>(out);
-  std::vector<const DenseTensor*> ins = {&x};
-  std::vector<DenseTensor*> outs = {out};
-  phi::funcs::BroadcastKernel<T>(ctx, ins, &outs, kps::IdentityFunctor<T>());
+  ExpandKernel<T, Context>(ctx, x, target_shape, out);
 }
 
 }  // namespace phi

From 2195859722b9422f3ae94ce03ace558e94d96637 Mon Sep 17 00:00:00 2001
From: Zhenghai Zhang <65210872+ccsuzzh@users.noreply.github.com>
Date: Fri, 22 Sep 2023 23:07:59 +0800
Subject: [PATCH 076/115] [NewIR] No.49 Migrate paddle.tril into pir (#57393)

---
 .../op_generator/vjp_interface_gen_op_list.py        |  4 ++++
 paddle/fluid/primitive/codegen/gen.py                |  4 ++++
 python/paddle/tensor/creation.py                     |  4 ++--
 test/legacy_test/test_tril_triu_op.py                | 12 ++++++++----
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
index 34b1c15afd946..62dd904dc7bf6 100644
--- a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
+++ b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
@@ -58,6 +58,8 @@
     'slice_double',
     'poisson',
     'gumbel_softmax',
+    'tril',
+    'triu',
 ]
 vjp_interface_implementation_gen_op_list = [
     "tanh",
@@ -94,4 +96,6 @@
     'slice_double',
     'poisson',
     'gumbel_softmax',
+    'tril',
+    'triu',
 ]
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index 48ef5e1a800bd..522fcda823ebb 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -38,6 +38,8 @@
 
 
 VJPS = [
+    'tril_grad',
+    'triu_grad',
     'tanh_grad',
     'mean_grad',
     'add_grad',
@@ -92,6 +94,8 @@
 VJP_COMPS = PRIM_VJP + CUSTOM_VJP
 
 BACKENDS = [
+    'tril_grad',
+    'triu_grad',
     'add_n',
     'mean',
     'sum',
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index e71b7ff65a63a..bb7d0f6142f4e 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -1504,7 +1504,7 @@ def tril(x, diagonal=0, name=None):
              [5 , 0 , 0 , 0 ],
              [9 , 10, 0 , 0 ]])
     """
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.tril(x, diagonal)
     else:
         return _tril_triu_op(LayerHelper('tril', **locals()))
@@ -1581,7 +1581,7 @@ def triu(x, diagonal=0, name=None):
              [0 , 10, 11, 12]])
 
     """
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.triu(x, diagonal)
     else:
         return _tril_triu_op(LayerHelper('triu', **locals()))
diff --git a/test/legacy_test/test_tril_triu_op.py b/test/legacy_test/test_tril_triu_op.py
index a3add39f00f3f..1c64288dabbe5 100644
--- a/test/legacy_test/test_tril_triu_op.py
+++ b/test/legacy_test/test_tril_triu_op.py
@@ -45,10 +45,10 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_new_ir=True)
 
     def init_dtype(self):
         self.dtype = np.float64
@@ -86,11 +86,15 @@ def initTestCase(self):
         self.X = np.arange(1, 101, dtype="float32").reshape([10, -1])
 
     def test_check_output(self):
-        self.check_output_with_place(core.CUDAPlace(0))
+        self.check_output_with_place(core.CUDAPlace(0), check_new_ir=True)
 
     def test_check_grad_normal(self):
         self.check_grad_with_place(
-            core.CUDAPlace(0), ['X'], 'Out', numeric_grad_delta=0.05
+            core.CUDAPlace(0),
+            ['X'],
+            'Out',
+            numeric_grad_delta=0.05,
+            check_new_ir=True,
         )
 
 

From 7a9669bb2fa71f18e3272e1e12b527d5f8c5a9e0 Mon Sep 17 00:00:00 2001
From: Ruibin Cheung <beinggod@foxmail.com>
Date: Fri, 22 Sep 2023 23:15:25 +0800
Subject: [PATCH 077/115] [PIR] No.31 Migrate paddle.arange into pir (#57344)

---
 python/paddle/tensor/attribute.py | 36 +++++++++++++++++++++++--------
 python/paddle/tensor/creation.py  | 22 +++++++++----------
 test/legacy_test/test_arange.py   | 11 ++++++----
 3 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py
index 3b733a5f25b4a..f3dcaf06cd9bf 100644
--- a/python/paddle/tensor/attribute.py
+++ b/python/paddle/tensor/attribute.py
@@ -20,7 +20,11 @@
 from paddle import _C_ops
 
 from ..base.data_feeder import check_type, check_variable_and_dtype
-from ..base.framework import in_dygraph_mode, in_dynamic_or_pir_mode
+from ..base.framework import (
+    in_dygraph_mode,
+    in_dynamic_or_pir_mode,
+    in_pir_mode,
+)
 from ..common_ops_import import Variable
 from ..framework import LayerHelper, core
 from .creation import _complex_to_real_dtype, assign
@@ -233,16 +237,30 @@ def is_integer(x):
             >>> print(paddle.is_integer(x))
             True
     """
-    if not isinstance(x, (paddle.Tensor, paddle.static.Variable)):
+    if not isinstance(
+        x, (paddle.Tensor, paddle.static.Variable, paddle.pir.OpResult)
+    ):
         raise TypeError(f"Expected Tensor, but received type of x: {type(x)}")
     dtype = x.dtype
-    is_int_dtype = (
-        dtype == core.VarDesc.VarType.UINT8
-        or dtype == core.VarDesc.VarType.INT8
-        or dtype == core.VarDesc.VarType.INT16
-        or dtype == core.VarDesc.VarType.INT32
-        or dtype == core.VarDesc.VarType.INT64
-    )
+
+    is_int_dtype = False
+    if not in_pir_mode():
+        is_int_dtype = (
+            dtype == core.VarDesc.VarType.UINT8
+            or dtype == core.VarDesc.VarType.INT8
+            or dtype == core.VarDesc.VarType.INT16
+            or dtype == core.VarDesc.VarType.INT32
+            or dtype == core.VarDesc.VarType.INT64
+        )
+    else:
+        is_int_dtype = (
+            dtype == core.DataType.INT8
+            or dtype == core.DataType.INT8
+            or dtype == core.DataType.INT16
+            or dtype == core.DataType.INT32
+            or dtype == core.DataType.INT64
+        )
+
     return is_int_dtype
 
 
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index bb7d0f6142f4e..573ff1bc433be 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -1343,8 +1343,8 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
 
     if dtype is None:
         for val in [start, end, step]:
-            if isinstance(val, Variable):
-                if not val.is_integer():
+            if isinstance(val, (Variable, paddle.pir.OpResult)):
+                if not paddle.is_integer(val):
                     dtype = paddle.get_default_dtype()
                     break
                 else:
@@ -1357,35 +1357,35 @@ def arange(start=0, end=None, step=1, dtype=None, name=None):
                     dtype = 'int64'
 
     out_shape = None
-    if not in_dynamic_mode() and (
-        not isinstance(start, Variable)
-        and not isinstance(end, Variable)
-        and not isinstance(step, Variable)
+    if not in_dynamic_or_pir_mode() and (
+        not isinstance(start, (Variable, paddle.pir.OpResult))
+        and not isinstance(end, (Variable, paddle.pir.OpResult))
+        and not isinstance(step, (Variable, paddle.pir.OpResult))
     ):
         out_shape = [int(math.ceil((end - start) / step))]
 
-    if not isinstance(dtype, core.VarDesc.VarType):
+    if not isinstance(dtype, (core.VarDesc.VarType, core.DataType)):
         dtype = convert_np_dtype_to_dtype_(dtype)
 
-    if not isinstance(start, Variable):
+    if not isinstance(start, (Variable, paddle.pir.OpResult)):
         with device_guard("cpu"):
             start = fill_constant([1], dtype, start, force_cpu=True)
     elif start.dtype != dtype:
         start = paddle.cast(start, dtype)
 
-    if not isinstance(end, Variable):
+    if not isinstance(end, (Variable, paddle.pir.OpResult)):
         with device_guard("cpu"):
             end = fill_constant([1], dtype, end, force_cpu=True)
     elif end.dtype != dtype:
         end = paddle.cast(end, dtype)
 
-    if not isinstance(step, Variable):
+    if not isinstance(step, (Variable, paddle.pir.OpResult)):
         with device_guard("cpu"):
             step = fill_constant([1], dtype, step, force_cpu=True)
     elif step.dtype != dtype:
         step = paddle.cast(step, dtype)
 
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.arange(start, end, step, dtype, _current_expected_place())
     else:
         check_dtype(
diff --git a/test/legacy_test/test_arange.py b/test/legacy_test/test_arange.py
index 9ca8eeee11277..d22ec561e0001 100644
--- a/test/legacy_test/test_arange.py
+++ b/test/legacy_test/test_arange.py
@@ -48,7 +48,7 @@ def init_config(self):
         self.case = (0, 1, 0.2)
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
 
 class TestFloatArangeOp(TestArangeOp):
@@ -65,7 +65,7 @@ def init_config(self):
         self.case = (0, 5, 1)
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
 
 @unittest.skipIf(
@@ -99,7 +99,7 @@ def init_config(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_new_ir=True)
 
 
 class TestInt32ArangeOp(TestArangeOp):
@@ -131,13 +131,15 @@ def init_config(self):
 
 
 class TestArangeOpError(unittest.TestCase):
-    def test_errors(self):
+    def test_static_errors(self):
         with program_guard(Program(), Program()):
+            paddle.enable_static()
             self.assertRaises(TypeError, paddle.arange, 10, dtype='int8')
 
 
 class TestArangeAPI(unittest.TestCase):
     def test_out(self):
+        paddle.enable_static()
         with program_guard(Program(), Program()):
             x1 = paddle.arange(0, 5, 1, 'float32')
 
@@ -152,6 +154,7 @@ def test_out(self):
         expected_data = np.arange(0, 5, 1).astype(np.float32)
         self.assertEqual((out == expected_data).all(), True)
         self.assertListEqual(list(x1.shape), [5])
+        paddle.disable_static(place)
 
 
 class TestArangeImperative(unittest.TestCase):

From 89c29520d1258943a779dd6b75060829bfce8d91 Mon Sep 17 00:00:00 2001
From: megemini <megemini@outlook.com>
Date: Sat, 23 Sep 2023 02:26:29 +0800
Subject: [PATCH 078/115] [Add] check bad req (#57578)

---
 tools/sampcd_processor.py       |  47 ++++++++--
 tools/sampcd_processor_utils.py |  16 ++--
 tools/test_sampcd_processor.py  | 147 +++++++++++++++++++++++++++++++-
 3 files changed, 193 insertions(+), 17 deletions(-)

diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index 3eb51371576d1..fdf2f42ad0bb6 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -224,6 +224,7 @@ class BadStatement:
     msg: str = ''
 
     def check(self, docstring: str) -> bool:
+        """Return `True` for bad statement detected."""
         raise NotImplementedError
 
 
@@ -276,6 +277,36 @@ def check(self, docstring):
         return False
 
 
+class DeprecatedRequired(BadStatement):
+    msg = 'Please use `# doctest: +REQUIRES({})` instead of `# {} {}`.'
+
+    _pattern = re.compile(
+        r"""
+        \#
+        \s*
+        (?P<directive>require[sd]?\s*:)
+        (?P<env>.+)
+        """,
+        re.X,
+    )
+
+    def check(self, docstring):
+        for match_obj in self._pattern.finditer(docstring):
+            dep_directive = match_obj.group('directive').strip()
+            dep_env = match_obj.group('env').strip()
+
+            if dep_env:
+                env = 'env:' + ', env:'.join(
+                    [e.strip().upper() for e in dep_env.split(',') if e.strip()]
+                )
+                self.msg = self.__class__.msg.format(
+                    env, dep_directive, dep_env
+                )
+                return True
+
+        return False
+
+
 class Xdoctester(DocTester):
     """A Xdoctest doctester."""
 
@@ -288,6 +319,7 @@ class Xdoctester(DocTester):
     ] = {
         'fluid': (Fluid,),
         'skip': (SkipNoReason,),
+        'require': (DeprecatedRequired,),
     }
 
     def __init__(
@@ -394,11 +426,12 @@ def prepare(self, test_capacity: set):
 
         self._test_capacity = test_capacity
 
-    def _check_bad_statements(self, docstring: str) -> typing.Set[str]:
+    def _check_bad_statements(self, docstring: str) -> typing.Set[BadStatement]:
         bad_results = set()
-        for name, statement_cls in self.bad_statements.items():
-            if statement_cls[0](*statement_cls[1:]).check(docstring):
-                bad_results.add(name)
+        for _, statement_cls in self.bad_statements.items():
+            bad_statement = statement_cls[0](*statement_cls[1:])
+            if bad_statement.check(docstring):
+                bad_results.add(bad_statement)
 
         return bad_results
 
@@ -407,10 +440,8 @@ def run(self, api_name: str, docstring: str) -> typing.List[TestResult]:
         # check bad statements
         bad_results = self._check_bad_statements(docstring)
         if bad_results:
-            for name in bad_results:
-                logger.warning(
-                    "%s >>> %s", api_name, str(self.bad_statements[name][0].msg)
-                )
+            for bad_statement in bad_results:
+                logger.warning("%s >>> %s", api_name, bad_statement.msg)
 
             return [
                 TestResult(
diff --git a/tools/sampcd_processor_utils.py b/tools/sampcd_processor_utils.py
index a9ac35c0a7336..209fca88bb635 100644
--- a/tools/sampcd_processor_utils.py
+++ b/tools/sampcd_processor_utils.py
@@ -108,7 +108,7 @@ def cls_map(mcs) -> typing.Dict[str, Result]:
         return mcs.__cls_map
 
 
-class Passed(Result, metaclass=MetaResult):
+class RPassed(Result, metaclass=MetaResult):
     name = 'passed'
     is_fail = False
 
@@ -117,7 +117,7 @@ def msg(cls, count, env):
         return f">>> {count} sample codes ran success in env: {env}"
 
 
-class Skipped(Result, metaclass=MetaResult):
+class RSkipped(Result, metaclass=MetaResult):
     name = 'skipped'
     is_fail = False
     logger = logger.warning
@@ -127,7 +127,7 @@ def msg(cls, count, env):
         return f">>> {count} sample codes skipped in env: {env}"
 
 
-class Failed(Result, metaclass=MetaResult):
+class RFailed(Result, metaclass=MetaResult):
     name = 'failed'
     is_fail = True
     logger = logger.error
@@ -137,7 +137,7 @@ def msg(cls, count, env):
         return f">>> {count} sample codes ran failed in env: {env}"
 
 
-class NoCode(Result, metaclass=MetaResult):
+class RNoCode(Result, metaclass=MetaResult):
     name = 'nocode'
     is_fail = True
     logger = logger.error
@@ -147,7 +147,7 @@ def msg(cls, count, env):
         return f">>> {count} apis don't have sample codes or could not run test in env: {env}"
 
 
-class Timeout(Result, metaclass=MetaResult):
+class RTimeout(Result, metaclass=MetaResult):
     name = 'timeout'
     is_fail = True
     logger = logger.error
@@ -157,7 +157,7 @@ def msg(cls, count, env):
         return f">>> {count} sample codes ran timeout or error in env: {env}"
 
 
-class BadStatement(Result, metaclass=MetaResult):
+class RBadStatement(Result, metaclass=MetaResult):
     name = 'badstatement'
     is_fail = True
     logger = logger.error
@@ -199,8 +199,8 @@ def __init__(self, **kwargs) -> None:
 
         if self.__unique_state is None:
             logger.warning('Default result will be set to FAILED!')
-            setattr(self, Failed.name, True)
-            self.__unique_state = Failed
+            setattr(self, RFailed.name, True)
+            self.__unique_state = RFailed
 
     @property
     def state(self) -> Result:
diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py
index 4a4804ac2a4c8..34f55fb02ceb0 100644
--- a/tools/test_sampcd_processor.py
+++ b/tools/test_sampcd_processor.py
@@ -2320,6 +2320,16 @@ def test_bad_statements(self):
                     >>> # import paddle.fluid
                     >>> import os
             """,
+            'oneline_skip': """
+            this is docstring...
+
+            Examples:
+
+                .. code-block:: python
+
+                    >>> import os # doctest: +SKIP
+                    >>> import sys
+            """,
         }
 
         _clear_environ()
@@ -2329,7 +2339,7 @@ def test_bad_statements(self):
         doctester.prepare(test_capacity)
 
         test_results = get_test_results(doctester, docstrings_to_test)
-        self.assertEqual(len(test_results), 10)
+        self.assertEqual(len(test_results), 11)
 
         (
             tr_0,
@@ -2342,6 +2352,7 @@ def test_bad_statements(self):
             tr_7,
             tr_8,
             tr_9,
+            tr_10,
         ) = test_results
 
         self.assertIn('bad_fluid', tr_0.name)
@@ -2385,6 +2396,140 @@ def test_bad_statements(self):
         self.assertFalse(tr_9.badstatement)
         self.assertTrue(tr_9.passed)
 
+        self.assertIn('oneline_skip', tr_10.name)
+        self.assertTrue(tr_10.badstatement)
+        self.assertFalse(tr_10.passed)
+
+    def test_bad_statements_req(self):
+        docstrings_to_test = {
+            'bad_required': """
+            this is docstring...
+
+            Examples:
+
+                .. code-block:: python
+
+                    >>> import sys
+                    >>> # required: GPU
+                    >>> import os
+            """,
+            'bad_requires': """
+            this is docstring...
+
+            Examples:
+
+                .. code-block:: python
+
+                    >>> import sys
+                    >>> # requires: GPU
+                    >>> import os
+            """,
+            'bad_require': """
+            this is docstring...
+
+            Examples:
+
+                .. code-block:: python
+
+                    >>> import sys
+                    >>> # require   :   GPU
+                    >>> import os
+            """,
+            'bad_require_2': """
+            this is docstring...
+
+            Examples:
+
+                .. code-block:: python
+
+                    >>> import sys
+                    >>> # require: GPU, xpu
+                    >>> import os
+            """,
+            'bad_req': """
+            this is docstring...
+
+            Examples:
+
+                .. code-block:: python
+
+                    >>> import sys
+                    >>> #require:gpu
+                    >>> import os
+            """,
+            'ignore_req': """
+            this is docstring...
+
+            Examples:
+
+                .. code-block:: python
+
+                    >>> import sys
+                    >>> #require:
+                    >>> import os
+            """,
+            'ignore_req_bad_req': """
+            this is docstring...
+
+            Examples:
+
+                .. code-block:: python
+
+                    >>> import sys
+                    >>> #require: xpu
+                    >>> import os
+                    >>> #require:
+                    >>> import os
+            """,
+        }
+
+        _clear_environ()
+
+        test_capacity = {'cpu'}
+        doctester = Xdoctester()
+        doctester.prepare(test_capacity)
+
+        test_results = get_test_results(doctester, docstrings_to_test)
+        self.assertEqual(len(test_results), 7)
+
+        (
+            tr_0,
+            tr_1,
+            tr_2,
+            tr_3,
+            tr_4,
+            tr_5,
+            tr_6,
+        ) = test_results
+
+        self.assertIn('bad_required', tr_0.name)
+        self.assertTrue(tr_0.badstatement)
+        self.assertFalse(tr_0.passed)
+
+        self.assertIn('bad_requires', tr_1.name)
+        self.assertTrue(tr_1.badstatement)
+        self.assertFalse(tr_1.passed)
+
+        self.assertIn('bad_require', tr_2.name)
+        self.assertTrue(tr_1.badstatement)
+        self.assertFalse(tr_1.passed)
+
+        self.assertIn('bad_require_2', tr_3.name)
+        self.assertTrue(tr_3.badstatement)
+        self.assertFalse(tr_3.passed)
+
+        self.assertIn('bad_req', tr_4.name)
+        self.assertTrue(tr_4.badstatement)
+        self.assertFalse(tr_4.passed)
+
+        self.assertIn('ignore_req', tr_5.name)
+        self.assertFalse(tr_5.badstatement)
+        self.assertTrue(tr_5.passed)
+
+        self.assertIn('ignore_req_bad_req', tr_6.name)
+        self.assertTrue(tr_6.badstatement)
+        self.assertFalse(tr_6.passed)
+
 
 if __name__ == '__main__':
     unittest.main()

From 9985598affbb3f834fdd6d1d0afbaeefcd2b45d3 Mon Sep 17 00:00:00 2001
From: Nyakku Shigure <sigure.qaq@gmail.com>
Date: Sat, 23 Sep 2023 11:17:33 +0800
Subject: [PATCH 079/115] [SOT] rewrite eval frame in C-lang (#57653)

---
 .pre-commit-config.yaml                |   4 +-
 paddle/fluid/pybind/CMakeLists.txt     |   3 +-
 paddle/fluid/pybind/eval_frame.c       | 577 +++++++++++++++++++++++++
 paddle/fluid/pybind/eval_frame.h       | 118 +++++
 paddle/fluid/pybind/jit.cc             | 562 +-----------------------
 paddle/fluid/pybind/jit.h              |  92 ----
 test/dygraph_to_static/test_pylayer.py |   8 +-
 7 files changed, 709 insertions(+), 655 deletions(-)
 create mode 100644 paddle/fluid/pybind/eval_frame.c
 create mode 100644 paddle/fluid/pybind/eval_frame.h

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a99344e94c309..1d8ff330bc18b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -94,9 +94,9 @@ repos:
         description: Check C++ code style using cpplint.py.
         entry: bash ./tools/codestyle/cpplint_pre_commit.hook
         language: system
-        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
+        files: \.(cc|cxx|cpp|cu|h|hpp|hxx)$
         args:
-            - --extensions=c,cc,cxx,cpp,cu,cuh,h,hpp,hxx,kps
+            - --extensions=cc,cxx,cpp,cu,cuh,h,hpp,hxx,kps
             - --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens
             - --quiet
         # Exclude third-party libraries
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 22e48b0b6a075..1a919956a2c30 100755
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -144,7 +144,8 @@ set(PYBIND_SRCS
     custom_device_py.cc
     xpu_streams_py.cc
     jit.cc
-    auto_parallel_py.cc)
+    auto_parallel_py.cc
+    eval_frame.c)
 
 if(WITH_CUSTOM_DEVICE)
   set(PYBIND_DEPS ${PYBIND_DEPS} custom_device_common_op_registry)
diff --git a/paddle/fluid/pybind/eval_frame.c b/paddle/fluid/pybind/eval_frame.c
new file mode 100644
index 0000000000000..0254e1dce1f0a
--- /dev/null
+++ b/paddle/fluid/pybind/eval_frame.c
@@ -0,0 +1,577 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/pybind/eval_frame.h"
+
+#include <Python.h>
+#include <frameobject.h>
+
+#if PY_VERSION_HEX < 0x030b0000
+#include <code.h>
+#endif
+#if PY_VERSION_HEX >= 0x030b0000
+#include <internal/pycore_code.h>
+#include <internal/pycore_frame.h>
+#define Py_BUILD_CORE       // internal/pycore_opcode.h need this macro
+#define NEED_OPCODE_TABLES  // To get _PyOpcode_Caches and _PyOpcode_Deopt
+#include <internal/pycore_opcode.h>
+#undef NEED_OPCODE_TABLES
+#undef Py_BUILD_CORE
+#include <opcode.h>
+#endif
+
+#include <object.h>
+#include <pystate.h>
+
+#if PY_VERSION_HEX >= 0x030b0000
+// To avoid the error: undefined symbol: _PyFrame_GetFrameObject, all we need is
+// to redefine this function based source code in python3.11. The advantage is
+// that we don't need any modification in eval_frame functions.
+typedef _PyInterpreterFrame FrameObject;
+#define CALL_STAT_INC(name) ((void)0)
+
+int Internal_PyInterpreterFrame_GetLine(_PyInterpreterFrame *frame);
+static int Internal_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame,
+                                         int opcode,
+                                         int oparg);
+int Internal_PyFrame_FastToLocalsWithError(_PyInterpreterFrame *frame);
+
+// clang-format off
+// Define a proxy PyObject to access _PyInterpreterFrame's properties.
+// It will be passed as an argument to the eval frame's callback.
+typedef struct PyInterpreterFrameProxy {
+  PyObject_HEAD
+  _PyInterpreterFrame *frame;
+} PyInterpreterFrameProxy;
+// clang-format on
+
+#define DECLARE_PROXY_PROPERTY(name)                        \
+  static PyObject *PyInterpreterFrameProxy_property_##name( \
+      PyInterpreterFrameProxy *self, void *closure) {       \
+    Py_XINCREF(self->frame->name);                          \
+    return (PyObject *)self->frame->name;                   \
+  }
+
+// clang-format off
+#define REGISTER_PROXY_PROPERTY(name) \
+  { #name, (getter)PyInterpreterFrameProxy_property_##name, NULL, NULL, NULL }
+// clang-format on
+
+DECLARE_PROXY_PROPERTY(f_code)
+DECLARE_PROXY_PROPERTY(f_locals)
+DECLARE_PROXY_PROPERTY(f_globals)
+DECLARE_PROXY_PROPERTY(f_builtins)
+
+// Refer to
+// https://github.com/python/cpython/blob/9414ddf91898892f3f6a672ae946931ee4b3ceb7/Objects/frameobject.c#L953-L961
+static PyObject *PyInterpreterFrameProxy_method_repr(
+    PyInterpreterFrameProxy *self) {
+  int lineno = Internal_PyInterpreterFrame_GetLine(self->frame);
+  PyCodeObject *code = self->frame->f_code;
+  return PyUnicode_FromFormat(
+      "<PyInterpreterFrameProxy at %p, file %R, line %d, code %S>",
+      self,
+      code->co_filename,
+      lineno,
+      code->co_name);
+}
+
+static PyGetSetDef PyInterpreterFrameProxy_properties[] = {
+    REGISTER_PROXY_PROPERTY(f_code),
+    REGISTER_PROXY_PROPERTY(f_locals),
+    REGISTER_PROXY_PROPERTY(f_globals),
+    REGISTER_PROXY_PROPERTY(f_builtins),
+    {NULL} /* Sentinel */
+};
+
+// clang-format off
+static PyTypeObject PyInterpreterFrameProxyType = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "paddle.framework.core.PyInterpreterFrameProxy",
+    .tp_doc = PyDoc_STR("A proxy object for _PyInterpreterFrame, "
+                        "it's only define all properties we need."),
+    .tp_repr = (reprfunc)PyInterpreterFrameProxy_method_repr,
+    .tp_basicsize = sizeof(PyInterpreterFrameProxy),
+    .tp_itemsize = 0,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_getset = PyInterpreterFrameProxy_properties,
+};
+// clang-format on
+
+PyInterpreterFrameProxy *PyInterpreterFrameProxy_New(
+    _PyInterpreterFrame *frame) {
+  PyTypeObject *type = &PyInterpreterFrameProxyType;
+  PyInterpreterFrameProxy *self =
+      (PyInterpreterFrameProxy *)type->tp_alloc(type, 0);
+  if (!self) {
+    // VLOG(7) << "Failed to allocate PyInterpreterFrameProxy";
+    return NULL;
+  }
+  self->frame = frame;
+  return self;
+}
+
+// We copy some cpython internal API from cpython project.
+// To avoid name conflict, we use "Internal_" prefix to mark them.
+int Internal_PyInterpreterFrame_GetLine(_PyInterpreterFrame *frame) {
+  int addr = _PyInterpreterFrame_LASTI(frame) * sizeof(_Py_CODEUNIT);
+  return PyCode_Addr2Line(frame->f_code, addr);
+}
+
+static int Internal_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame,
+                                         int opcode,
+                                         int oparg) {
+  // This only works when opcode is a non-quickened form:
+  assert(_PyOpcode_Deopt[opcode] == opcode);
+  int check_oparg = 0;
+  for (_Py_CODEUNIT *instruction = _PyCode_CODE(frame->f_code);
+       instruction < frame->prev_instr;
+       instruction++) {
+    int check_opcode = _PyOpcode_Deopt[_Py_OPCODE(*instruction)];
+    check_oparg |= _Py_OPARG(*instruction);
+    if (check_opcode == opcode && check_oparg == oparg) {
+      return 1;
+    }
+    if (check_opcode == EXTENDED_ARG) {
+      check_oparg <<= 8;
+    } else {
+      check_oparg = 0;
+    }
+    instruction += _PyOpcode_Caches[check_opcode];
+  }
+  return 0;
+}
+
+int Internal_PyFrame_FastToLocalsWithError(_PyInterpreterFrame *frame) {
+  /* Merge fast locals into f->f_locals */
+  PyObject *locals;
+  PyObject **fast;
+  PyCodeObject *co;
+  locals = frame->f_locals;
+  if (locals == NULL) {
+    locals = frame->f_locals = PyDict_New();
+    if (locals == NULL) return -1;
+  }
+  co = frame->f_code;
+  fast = _PyFrame_GetLocalsArray(frame);
+  // COPY_FREE_VARS has no quickened forms, so no need to use _PyOpcode_Deopt
+  // here:
+  int lasti = _PyInterpreterFrame_LASTI(frame);
+  if (lasti < 0 && _Py_OPCODE(_PyCode_CODE(co)[0]) == COPY_FREE_VARS) {
+    /* Free vars have not been initialized -- Do that */
+    PyCodeObject *co = frame->f_code;
+    PyObject *closure = frame->f_func->func_closure;
+    int offset = co->co_nlocals + co->co_nplaincellvars;
+    for (int i = 0; i < co->co_nfreevars; ++i) {
+      PyObject *o = PyTuple_GET_ITEM(closure, i);
+      Py_INCREF(o);
+      frame->localsplus[offset + i] = o;
+    }
+    // COPY_FREE_VARS doesn't have inline CACHEs, either:
+    frame->prev_instr = _PyCode_CODE(frame->f_code);
+  }
+  for (int i = 0; i < co->co_nlocalsplus; i++) {
+    _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i);
+
+    /* If the namespace is unoptimized, then one of the
+       following cases applies:
+       1. It does not contain free variables, because it
+          uses import * or is a top-level namespace.
+       2. It is a class namespace.
+       We don't want to accidentally copy free variables
+       into the locals dict used by the class.
+    */
+    if (kind & CO_FAST_FREE && !(co->co_flags & CO_OPTIMIZED)) {
+      continue;
+    }
+
+    PyObject *name = PyTuple_GET_ITEM(co->co_localsplusnames, i);
+    PyObject *value = fast[i];
+    if (frame->stacktop) {
+      if (kind & CO_FAST_FREE) {
+        // The cell was set by COPY_FREE_VARS.
+        assert(value != NULL && PyCell_Check(value));
+        value = PyCell_GET(value);
+      } else if (kind & CO_FAST_CELL) {
+        // Note that no *_DEREF ops can happen before MAKE_CELL
+        // executes.  So there's no need to duplicate the work
+        // that MAKE_CELL would otherwise do later, if it hasn't
+        // run yet.
+        if (value != NULL) {
+          if (PyCell_Check(value) &&
+              Internal_PyFrame_OpAlreadyRan(frame, MAKE_CELL, i)) {
+            // (likely) MAKE_CELL must have executed already.
+            value = PyCell_GET(value);
+          }
+          // (likely) Otherwise it it is an arg (kind & CO_FAST_LOCAL),
+          // with the initial value set when the frame was created...
+          // (unlikely) ...or it was set to some initial value by
+          // an earlier call to PyFrame_LocalsToFast().
+        }
+      }
+    } else {
+      assert(value == NULL);
+    }
+    if (value == NULL) {
+      if (PyObject_DelItem(locals, name) != 0) {
+        if (PyErr_ExceptionMatches(PyExc_KeyError)) {
+          PyErr_Clear();
+        } else {
+          return -1;
+        }
+      }
+    } else {
+      if (PyObject_SetItem(locals, name, value) != 0) {
+        return -1;
+      }
+    }
+  }
+  return 0;
+}
+
+#else
+typedef PyFrameObject FrameObject;
+#endif
+
+#ifdef _WIN32
+#define unlikely(x) (x)
+#else
+#define unlikely(x) __builtin_expect((x), 0)
+#endif
+
+// Use static variable to save customed eval hook.
+static Py_tss_t eval_frame_callback_key = {0, 0};
+
+inline static PyObject *eval_frame_callback_get() {
+  void *result = PyThread_tss_get(&eval_frame_callback_key);
+  if (unlikely(result == NULL)) {
+    Py_RETURN_NONE;
+  } else {
+    return (PyObject *)result;
+  }
+}
+
+inline static void eval_frame_callback_set(PyObject *obj) {
+  PyThread_tss_set(&eval_frame_callback_key, obj);
+}
+
+// call python default eval frame to interpret current frame.
+inline static PyObject *eval_frame_default(PyThreadState *tstate,
+                                           FrameObject *frame,
+                                           int throw_flag) {
+#if PY_VERSION_HEX >= 0x03090000
+  if (tstate == NULL) {
+    tstate = PyThreadState_GET();
+  }
+  return _PyEval_EvalFrameDefault(tstate, frame, throw_flag);
+#else
+  return _PyEval_EvalFrameDefault(frame, throw_flag);
+#endif
+}
+
+#if PY_VERSION_HEX >= 0x030b0000
+
+inline static PyObject *eval_custom_code_py311_plus(PyThreadState *tstate,
+                                                    FrameObject *frame,
+                                                    PyCodeObject *code,
+                                                    int throw_flag) {
+  Py_ssize_t nlocalsplus_new = code->co_nlocalsplus;
+  Py_ssize_t nlocalsplus_old = frame->f_code->co_nlocalsplus;
+  // Create a new PyInterpreterFrame. Refer to CALL.
+  // PyInterpreterFrame has a head section calls "specials". It follows
+  // a contiguous section containing localplus and interpreter stack space.
+  size_t size = nlocalsplus_new + code->co_stacksize + FRAME_SPECIALS_SIZE;
+  CALL_STAT_INC(frames_pushed);
+  _PyInterpreterFrame *shadow =
+      (_PyInterpreterFrame *)malloc(sizeof(PyObject *) * size);
+  if (shadow == NULL) {
+    // VLOG(7) << "Failed to allocate memory for shadow frame.";
+    return NULL;
+  }
+  // Create a new function object from code object. Refer to MAKE_FUNCTION.
+  PyFunctionObject *func =
+      (PyFunctionObject *)PyFunction_New((PyObject *)code, frame->f_globals);
+  Py_XINCREF(frame->f_func->func_closure);
+  func->func_closure = frame->f_func->func_closure;
+  _PyFrame_InitializeSpecials(shadow, func, NULL, code->co_nlocalsplus);
+
+  PyObject **fastlocals_old = frame->localsplus;
+  PyObject **fastlocals_new = shadow->localsplus;
+
+  for (Py_ssize_t i = 0; i < nlocalsplus_new; ++i) {
+    fastlocals_new[i] = NULL;
+  }
+
+  // The namemap to map the name to index in new frame localsplus.
+  PyObject *namemap = PyDict_New();
+  if (namemap == NULL) {
+    // VLOG(7) << "Failed to create namemap.";
+    free(shadow);
+    return NULL;
+  }
+  for (Py_ssize_t i = 0; i < nlocalsplus_new; ++i) {
+    PyObject *name = PyTuple_GET_ITEM(code->co_localsplusnames, i);
+    PyObject *index = PyLong_FromSize_t(i);
+    PyDict_SetItem(namemap, name, index);
+  }
+  for (Py_ssize_t i = 0; i < nlocalsplus_old; ++i) {
+    PyObject *name = PyTuple_GET_ITEM(frame->f_code->co_localsplusnames, i);
+    PyObject *index = PyDict_GetItem(namemap, name);
+    if (index == NULL) {
+      continue;
+    }
+    Py_XINCREF(fastlocals_old[i]);
+    fastlocals_new[PyLong_AsSize_t(index)] = fastlocals_old[i];
+  }
+
+  PyObject *result = eval_frame_default(tstate, shadow, throw_flag);
+  free(shadow);
+  Py_DECREF(namemap);
+  return result;
+}
+
+#else
+
+inline static PyObject *eval_custom_code_py310_minus(PyThreadState *tstate,
+                                                     FrameObject *frame,
+                                                     PyCodeObject *code,
+                                                     int throw_flag) {
+  Py_ssize_t ncells = 0;
+  Py_ssize_t nfrees = 0;
+  Py_ssize_t nlocals_new = code->co_nlocals;
+  Py_ssize_t nlocals_old = frame->f_code->co_nlocals;
+
+  ncells = PyTuple_GET_SIZE(code->co_cellvars);
+  nfrees = PyTuple_GET_SIZE(code->co_freevars);
+
+  PyFrameObject *shadow = PyFrame_New(tstate, code, frame->f_globals, NULL);
+  if (shadow == NULL) {
+    return NULL;
+  }
+
+  PyObject **fastlocals_old = frame->f_localsplus;
+  PyObject **fastlocals_new = shadow->f_localsplus;
+
+  for (Py_ssize_t i = 0; i < nlocals_old; i++) {
+    Py_XINCREF(fastlocals_old[i]);
+    fastlocals_new[i] = fastlocals_old[i];
+  }
+
+  for (Py_ssize_t i = 0; i < ncells + nfrees; i++) {
+    Py_XINCREF(fastlocals_old[nlocals_old + i]);
+    fastlocals_new[nlocals_new + i] = fastlocals_old[nlocals_old + i];
+  }
+
+  PyObject *result = eval_frame_default(tstate, shadow, throw_flag);
+  Py_DECREF(shadow);
+  return result;
+}
+
+#endif
+
+// Start a new frame and run code in this frame.
+// Execute a piece of code by default frame-hook.
+inline static PyObject *eval_custom_code(PyThreadState *tstate,
+                                         FrameObject *frame,
+                                         PyCodeObject *code,
+                                         int throw_flag) {
+#if PY_VERSION_HEX >= 0x030b0000
+  return eval_custom_code_py311_plus(tstate, frame, code, throw_flag);
+#else
+  return eval_custom_code_py310_minus(tstate, frame, code, throw_flag);
+#endif
+}
+
+static PyObject *_custom_eval_frame(PyThreadState *tstate,
+                                    FrameObject *frame,
+                                    int throw_flag,
+                                    PyObject *callback) {
+// https://peps.python.org/pep-0558/#fast-locals-proxy-implementation-details
+// https://devguide.python.org/internals/interpreter/#all-sorts-of-variables
+#if PY_VERSION_HEX >= 0x030b0000
+  if (frame->owner == FRAME_OWNED_BY_GENERATOR) {
+    return eval_frame_default(tstate, frame, throw_flag);
+  }
+  // PyFrame_FastToLocalsWithError receives a PyFrameObject, but if we created a
+  // PyFrameObject from a PyInterpreterFrame, it will changes the original
+  // PyInterpreterFrame and causes a Segmentation Fault when Fallback to run
+  // original frame. So we pass a PyInterpreterFrame to
+  // _PyFrame_FastToLocalsWithError directly. But this is an internal API, so we
+  // copy many code from CPython project into our project.
+  if (Internal_PyFrame_FastToLocalsWithError(frame) < 0) {
+#else
+  if (PyFrame_FastToLocalsWithError(frame) < 0) {
+#endif
+    return NULL;
+  }
+
+  // NOTE:(xiongkun): Handle GeneratorExit exception: (Spend a day)
+  // In Python, gen close is also a Python function call that will enter this
+  // function with GeneratorExit set, which will cause the PyObject_CallObject
+  // raise SystemError. So we disable the custom behavior for GeneratorExit. def
+  // func():
+  //     iter = iter([1, 2, 3])
+  //     for i in iter:
+  //         return i # <--- Early return, cause a GeneratorExit thrown,
+  //                  # <--- which Cause the PyObject_CallObject raise
+  //                  SystemError.
+  if (PyErr_ExceptionMatches(PyExc_GeneratorExit)) {
+    return eval_frame_default(tstate, frame, throw_flag);
+  }
+
+  // We don't run the current custom_eval_frame behavior for guards.
+  // So we temporarily set the callback to Py_None to drive the correct behavior
+  // in the shim.
+  eval_frame_callback_set(Py_None);
+
+#if PY_VERSION_HEX >= 0x030b0000
+  PyObject *args = Py_BuildValue("(O)", PyInterpreterFrameProxy_New(frame));
+#else
+  PyObject *args = Py_BuildValue("(O)", frame);
+#endif
+  PyObject *result = PyObject_CallObject(callback, args);
+  Py_DECREF(args);
+  // VLOG(7) << "After call eval_frame_function and decrease frame.";
+  // class CustomCode(Protocal):
+  //     code: CodeType | None
+  //     disable_eval_frame: bool
+  // result: CustomCode
+  if (result == NULL) {
+    // internal exception
+    // VLOG(7) << "Error happened.";
+    return NULL;
+  } else {
+    //  NOTE: Cache is not supported now
+    PyCodeObject *code = (PyCodeObject *)PyObject_GetAttrString(result, "code");
+    PyObject *disable_eval_frame =
+        PyObject_GetAttrString(result, "disable_eval_frame");
+    PyObject *out;
+    // VLOG(7) << "Start eval new frame and code.";
+    if (disable_eval_frame != Py_True) {
+      // Re-enable custom behavior
+      eval_frame_callback_set(callback);
+      if ((PyObject *)code != Py_None) {
+        out = eval_custom_code(tstate, frame, code, throw_flag);
+      } else {
+        out = eval_frame_default(tstate, frame, throw_flag);
+      }
+    } else {
+      if ((PyObject *)code != Py_None) {
+        out = eval_custom_code(tstate, frame, code, throw_flag);
+      } else {
+        out = eval_frame_default(tstate, frame, throw_flag);
+      }
+      // Re-enable custom behavior
+      eval_frame_callback_set(callback);
+    }
+    Py_DECREF(result);
+    Py_DECREF(code);
+    return out;
+  }
+}
+
+static PyObject *_custom_eval_frame_shim(PyThreadState *tstate,
+                                         FrameObject *frame,
+                                         int throw_flag) {
+  PyObject *callback = eval_frame_callback_get();
+
+  if (callback == Py_None) {
+    return eval_frame_default(tstate, frame, throw_flag);
+  }
+
+  return _custom_eval_frame(tstate, frame, throw_flag, callback);
+}
+
+#if PY_VERSION_HEX >= 0x03090000
+static PyObject *custom_eval_frame_shim(PyThreadState *tstate,
+                                        FrameObject *frame,
+                                        int throw_flag) {
+  return _custom_eval_frame_shim(tstate, frame, throw_flag);
+}
+#else
+static PyObject *custom_eval_frame_shim(FrameObject *frame, int throw_flag) {
+  PyThreadState *tstate = PyThreadState_GET();
+  return _custom_eval_frame_shim(tstate, frame, throw_flag);
+}
+#endif
+
+static PyObject *set_eval_frame(PyObject *new_callback, PyThreadState *tstate) {
+  // Change the eval frame callback and return the old one
+  //  - None: disables: disable custom callback.
+  //  - Python callable(): enables custom callback.
+  //  NOTE: Cache is not supported now
+  PyObject *old_callback = eval_frame_callback_get();
+
+#if PY_VERSION_HEX >= 0x03090000
+  _PyFrameEvalFunction old_eval_frame =
+      _PyInterpreterState_GetEvalFrameFunc(tstate->interp);
+#else
+  // Function pointer.
+  _PyFrameEvalFunction old_eval_frame = tstate->interp->eval_frame;
+#endif
+
+  // NOTE: multi-threading is not supported now
+  if (old_callback != Py_None && new_callback == Py_None) {
+    if (old_eval_frame != &_PyEval_EvalFrameDefault) {
+      // VLOG(7) << "set _PyEval_EvalFrameDefault";
+#if PY_VERSION_HEX >= 0x03090000
+      _PyInterpreterState_SetEvalFrameFunc(tstate->interp,
+                                           &_PyEval_EvalFrameDefault);
+#else
+      tstate->interp->eval_frame = &_PyEval_EvalFrameDefault;
+#endif
+    }
+  } else if (old_callback == Py_None && new_callback != Py_None) {
+    if (old_eval_frame != &custom_eval_frame_shim) {
+      // VLOG(7) << "set custom_eval_frame_shim";
+#if PY_VERSION_HEX >= 0x03090000
+      _PyInterpreterState_SetEvalFrameFunc(tstate->interp,
+                                           &custom_eval_frame_shim);
+#else
+      tstate->interp->eval_frame = &custom_eval_frame_shim;
+#endif
+    }
+  }
+
+  Py_INCREF(new_callback);
+  eval_frame_callback_set(new_callback);
+
+  return old_callback;
+}
+
+PyObject *set_eval_frame_py(PyObject *callback) {
+  if (callback != Py_None && !PyCallable_Check(callback)) {
+    // VLOG(7) << "callback is not a callable or none, invalid arguments.";
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+  return set_eval_frame(callback, PyThreadState_GET());
+}
+
+PyMODINIT_FUNC PyInit__eval_frame() {
+  PyThread_tss_create(&eval_frame_callback_key);
+  // VLOG(7) << "Set PyThread_tss_create return: " << result;
+
+  Py_INCREF(Py_None);
+  eval_frame_callback_set(Py_None);
+
+#if PY_VERSION_HEX >= 0x030b0000
+  if (PyType_Ready(&PyInterpreterFrameProxyType) < 0) {
+    // VLOG(7) << "PyInterpreterFrameProxyType has not been ready!";
+  }
+  Py_INCREF(&PyInterpreterFrameProxyType);
+#endif
+
+  return NULL;
+}
diff --git a/paddle/fluid/pybind/eval_frame.h b/paddle/fluid/pybind/eval_frame.h
new file mode 100644
index 0000000000000..383d2d7a5bc00
--- /dev/null
+++ b/paddle/fluid/pybind/eval_frame.h
@@ -0,0 +1,118 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <Python.h>
+
+// see https://bugs.python.org/issue35886
+// If py_version==3.8.*, we need to redefine _PyEvalFrameFunc and the
+// related functions and structs.
+
+#if PY_VERSION_HEX >= 0x03080000 && PY_VERSION_HEX < 0x3090000
+
+typedef PyObject *(*_PyFrameEvalFunction)(struct _frame *, int);
+
+struct _warnings_runtime_state {
+  /* Both 'filters' and 'onceregistry' can be set in warnings.py;
+     get_warnings_attr() will reset these variables accordingly. */
+  PyObject *filters;        /* List */
+  PyObject *once_registry;  /* Dict */
+  PyObject *default_action; /* String */
+  long filters_version;     // NOLINT
+};
+
+struct _is {
+  struct _is *next;
+  struct _ts *tstate_head;
+
+  int64_t id;
+  int64_t id_refcount;
+  int requires_idref;
+  PyThread_type_lock id_mutex;
+
+  int finalizing;
+
+  PyObject *modules;
+  PyObject *modules_by_index;
+  PyObject *sysdict;
+  PyObject *builtins;
+  PyObject *importlib;
+
+  /* Used in Python/sysmodule.c. */
+  int check_interval;
+
+  /* Used in Modules/_threadmodule.c. */
+  long num_threads;  // NOLINT
+  /* Support for runtime thread stack size tuning.
+     A value of 0 means using the platform's default stack size
+     or the size specified by the THREAD_STACK_SIZE macro. */
+  /* Used in Python/thread.c. */
+  size_t pythread_stacksize;
+
+  PyObject *codec_search_path;
+  PyObject *codec_search_cache;
+  PyObject *codec_error_registry;
+  int codecs_initialized;
+
+  /* fs_codec.encoding is initialized to NULL.
+     Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */
+  struct {
+    char *encoding; /* Filesystem encoding (encoded to UTF-8) */
+    char *errors;   /* Filesystem errors (encoded to UTF-8) */
+    _Py_error_handler error_handler;
+  } fs_codec;
+
+  PyConfig config;
+#ifdef HAVE_DLOPEN
+  int dlopenflags;
+#endif
+
+  PyObject *dict; /* Stores per-interpreter state */
+
+  PyObject *builtins_copy;
+  PyObject *import_func;
+  /* Initialized to PyEval_EvalFrameDefault(). */
+  _PyFrameEvalFunction eval_frame;
+
+  Py_ssize_t co_extra_user_count;
+  freefunc co_extra_freefuncs[MAX_CO_EXTRA_USERS];
+
+#ifdef HAVE_FORK
+  PyObject *before_forkers;
+  PyObject *after_forkers_parent;
+  PyObject *after_forkers_child;
+#endif
+  /* AtExit module */
+  void (*pyexitfunc)(PyObject *);
+  PyObject *pyexitmodule;
+
+  uint64_t tstate_next_unique_id;
+
+  struct _warnings_runtime_state warnings;
+
+  PyObject *audit_hooks;
+};
+
+#endif
+
+PyObject *set_eval_frame_py(PyObject *callback);
+PyMODINIT_FUNC PyInit__eval_frame();
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/paddle/fluid/pybind/jit.cc b/paddle/fluid/pybind/jit.cc
index 8d4d62f738211..09e194bf0b7c8 100644
--- a/paddle/fluid/pybind/jit.cc
+++ b/paddle/fluid/pybind/jit.cc
@@ -13,36 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/pybind/jit.h"
-
-#include <Python.h>
-#include <frameobject.h>
-
-#if PY_VERSION_HEX < 0x030b0000
-#include <code.h>
-#endif
-#if PY_VERSION_HEX >= 0x030b0000
-#include <internal/pycore_code.h>
-#include <internal/pycore_frame.h>
-#define Py_BUILD_CORE       // internal/pycore_opcode.h need this macro
-#define NEED_OPCODE_TABLES  // To get _PyOpcode_Caches and _PyOpcode_Deopt
-#include <internal/pycore_opcode.h>
-#undef NEED_OPCODE_TABLES
-#undef Py_BUILD_CORE
-#include <opcode.h>
-#endif
-
-#include <object.h>
-#include <pystate.h>
-
+#include "glog/logging.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/platform/place.h"
-
-#include "glog/logging.h"
 #include "paddle/fluid/jit/function.h"
 #include "paddle/fluid/jit/function_schema.h"
 #include "paddle/fluid/jit/layer.h"
 #include "paddle/fluid/jit/serializer.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/pybind/eval_frame.h"
 #include "paddle/utils/pybind.h"
 
 namespace py = pybind11;
@@ -50,535 +29,6 @@ namespace py = pybind11;
 namespace paddle {
 namespace pybind {
 
-#if PY_VERSION_HEX >= 0x030b0000
-// To avoid the error: undefined symbol: _PyFrame_GetFrameObject, all we need is
-// to redefine this function based source code in python3.11. The advantage is
-// that we don't need any modification in eval_frame functions.
-typedef _PyInterpreterFrame FrameObject;
-#define CALL_STAT_INC(name) ((void)0)
-
-int Internal_PyInterpreterFrame_GetLine(_PyInterpreterFrame *frame);
-static int Internal_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame,
-                                         int opcode,
-                                         int oparg);
-int Internal_PyFrame_FastToLocalsWithError(_PyInterpreterFrame *frame);
-
-// clang-format off
-// Define a proxy PyObject to access _PyInterpreterFrame's properties.
-// It will be passed as an argument to the eval frame's callback.
-typedef struct PyInterpreterFrameProxy {
-  PyObject_HEAD
-  _PyInterpreterFrame *frame;
-} PyInterpreterFrameProxy;
-// clang-format on
-
-#define DECLARE_PROXY_PROPERTY(name)                        \
-  static PyObject *PyInterpreterFrameProxy_property_##name( \
-      PyInterpreterFrameProxy *self, void *closure) {       \
-    Py_XINCREF(self->frame->name);                          \
-    return reinterpret_cast<PyObject *>(self->frame->name); \
-  }
-
-// clang-format off
-#define REGISTER_PROXY_PROPERTY(name)                                         \
-  {                                                                           \
-    #name, (getter)PyInterpreterFrameProxy_property_##name, nullptr, nullptr, \
-        nullptr                                                               \
-  }
-// clang-format on
-
-DECLARE_PROXY_PROPERTY(f_code)
-DECLARE_PROXY_PROPERTY(f_locals)
-DECLARE_PROXY_PROPERTY(f_globals)
-DECLARE_PROXY_PROPERTY(f_builtins)
-
-// Refer to
-// https://github.com/python/cpython/blob/9414ddf91898892f3f6a672ae946931ee4b3ceb7/Objects/frameobject.c#L953-L961
-static PyObject *PyInterpreterFrameProxy_method_repr(
-    PyInterpreterFrameProxy *self) {
-  int lineno = Internal_PyInterpreterFrame_GetLine(self->frame);
-  PyCodeObject *code = self->frame->f_code;
-  return PyUnicode_FromFormat(
-      "<PyInterpreterFrameProxy at %p, file %R, line %d, code %S>",
-      self,
-      code->co_filename,
-      lineno,
-      code->co_name);
-}
-
-static PyGetSetDef PyInterpreterFrameProxy_properties[] = {
-    REGISTER_PROXY_PROPERTY(f_code),
-    REGISTER_PROXY_PROPERTY(f_locals),
-    REGISTER_PROXY_PROPERTY(f_globals),
-    REGISTER_PROXY_PROPERTY(f_builtins),
-    {nullptr} /* Sentinel */
-};
-
-// clang-format off
-static PyTypeObject PyInterpreterFrameProxyType = {
-    PyVarObject_HEAD_INIT(NULL, 0)
-    .tp_name = "paddle.framework.core.PyInterpreterFrameProxy",
-    .tp_doc = PyDoc_STR("A proxy object for _PyInterpreterFrame, "
-                        "it's only define all properties we need."),
-    .tp_repr = reinterpret_cast<reprfunc>(PyInterpreterFrameProxy_method_repr),
-    .tp_basicsize = sizeof(PyInterpreterFrameProxy),
-    .tp_itemsize = 0,
-    .tp_flags = Py_TPFLAGS_DEFAULT,
-    .tp_getset = PyInterpreterFrameProxy_properties,
-};
-// clang-format on
-
-PyInterpreterFrameProxy *PyInterpreterFrameProxy_New(
-    _PyInterpreterFrame *frame) {
-  PyTypeObject *type = &PyInterpreterFrameProxyType;
-  PyInterpreterFrameProxy *self =
-      reinterpret_cast<PyInterpreterFrameProxy *>(type->tp_alloc(type, 0));
-  if (!self) {
-    VLOG(7) << "Failed to allocate PyInterpreterFrameProxy";
-    return nullptr;
-  }
-  self->frame = frame;
-  return self;
-}
-
-// We copy some cpython internal API from cpython project.
-// To avoid name conflict, we use "Internal_" prefix to mark them.
-int Internal_PyInterpreterFrame_GetLine(_PyInterpreterFrame *frame) {
-  int addr = _PyInterpreterFrame_LASTI(frame) * sizeof(_Py_CODEUNIT);
-  return PyCode_Addr2Line(frame->f_code, addr);
-}
-
-static int Internal_PyFrame_OpAlreadyRan(_PyInterpreterFrame *frame,
-                                         int opcode,
-                                         int oparg) {
-  // This only works when opcode is a non-quickened form:
-  assert(_PyOpcode_Deopt[opcode] == opcode);
-  int check_oparg = 0;
-  for (_Py_CODEUNIT *instruction = _PyCode_CODE(frame->f_code);
-       instruction < frame->prev_instr;
-       instruction++) {
-    int check_opcode = _PyOpcode_Deopt[_Py_OPCODE(*instruction)];
-    check_oparg |= _Py_OPARG(*instruction);
-    if (check_opcode == opcode && check_oparg == oparg) {
-      return 1;
-    }
-    if (check_opcode == EXTENDED_ARG) {
-      check_oparg <<= 8;
-    } else {
-      check_oparg = 0;
-    }
-    instruction += _PyOpcode_Caches[check_opcode];
-  }
-  return 0;
-}
-
-int Internal_PyFrame_FastToLocalsWithError(_PyInterpreterFrame *frame) {
-  /* Merge fast locals into f->f_locals */
-  PyObject *locals;
-  PyObject **fast;
-  PyCodeObject *co;
-  locals = frame->f_locals;
-  if (locals == NULL) {
-    locals = frame->f_locals = PyDict_New();
-    if (locals == NULL) return -1;
-  }
-  co = frame->f_code;
-  fast = _PyFrame_GetLocalsArray(frame);
-  // COPY_FREE_VARS has no quickened forms, so no need to use _PyOpcode_Deopt
-  // here:
-  int lasti = _PyInterpreterFrame_LASTI(frame);
-  if (lasti < 0 && _Py_OPCODE(_PyCode_CODE(co)[0]) == COPY_FREE_VARS) {
-    /* Free vars have not been initialized -- Do that */
-    PyCodeObject *co = frame->f_code;
-    PyObject *closure = frame->f_func->func_closure;
-    int offset = co->co_nlocals + co->co_nplaincellvars;
-    for (int i = 0; i < co->co_nfreevars; ++i) {
-      PyObject *o = PyTuple_GET_ITEM(closure, i);
-      Py_INCREF(o);
-      frame->localsplus[offset + i] = o;
-    }
-    // COPY_FREE_VARS doesn't have inline CACHEs, either:
-    frame->prev_instr = _PyCode_CODE(frame->f_code);
-  }
-  for (int i = 0; i < co->co_nlocalsplus; i++) {
-    _PyLocals_Kind kind = _PyLocals_GetKind(co->co_localspluskinds, i);
-
-    /* If the namespace is unoptimized, then one of the
-       following cases applies:
-       1. It does not contain free variables, because it
-          uses import * or is a top-level namespace.
-       2. It is a class namespace.
-       We don't want to accidentally copy free variables
-       into the locals dict used by the class.
-    */
-    if (kind & CO_FAST_FREE && !(co->co_flags & CO_OPTIMIZED)) {
-      continue;
-    }
-
-    PyObject *name = PyTuple_GET_ITEM(co->co_localsplusnames, i);
-    PyObject *value = fast[i];
-    if (frame->stacktop) {
-      if (kind & CO_FAST_FREE) {
-        // The cell was set by COPY_FREE_VARS.
-        assert(value != NULL && PyCell_Check(value));
-        value = PyCell_GET(value);
-      } else if (kind & CO_FAST_CELL) {
-        // Note that no *_DEREF ops can happen before MAKE_CELL
-        // executes.  So there's no need to duplicate the work
-        // that MAKE_CELL would otherwise do later, if it hasn't
-        // run yet.
-        if (value != NULL) {
-          if (PyCell_Check(value) &&
-              Internal_PyFrame_OpAlreadyRan(frame, MAKE_CELL, i)) {
-            // (likely) MAKE_CELL must have executed already.
-            value = PyCell_GET(value);
-          }
-          // (likely) Otherwise it it is an arg (kind & CO_FAST_LOCAL),
-          // with the initial value set when the frame was created...
-          // (unlikely) ...or it was set to some initial value by
-          // an earlier call to PyFrame_LocalsToFast().
-        }
-      }
-    } else {
-      assert(value == NULL);
-    }
-    if (value == NULL) {
-      if (PyObject_DelItem(locals, name) != 0) {
-        if (PyErr_ExceptionMatches(PyExc_KeyError)) {
-          PyErr_Clear();
-        } else {
-          return -1;
-        }
-      }
-    } else {
-      if (PyObject_SetItem(locals, name, value) != 0) {
-        return -1;
-      }
-    }
-  }
-  return 0;
-}
-
-#else
-typedef PyFrameObject FrameObject;
-#endif
-
-#define unlikely(x) __builtin_expect((x), 0)
-
-// Use static variable to save customed eval hook.
-static Py_tss_t eval_frame_callback_key = {0, 0};
-
-inline static PyObject *eval_frame_callback_get() {
-  void *result = PyThread_tss_get(&eval_frame_callback_key);
-  if (unlikely(result == nullptr)) {
-    Py_RETURN_NONE;
-  } else {
-    return reinterpret_cast<PyObject *>(result);
-  }
-}
-
-inline static void eval_frame_callback_set(PyObject *obj) {
-  PyThread_tss_set(&eval_frame_callback_key, obj);
-}
-
-// call python default eval frame to interpret current frame.
-inline static PyObject *eval_frame_default(PyThreadState *tstate,
-                                           FrameObject *frame,
-                                           int throw_flag) {
-#if PY_VERSION_HEX >= 0x03090000
-  if (tstate == nullptr) {
-    tstate = PyThreadState_GET();
-  }
-  return _PyEval_EvalFrameDefault(tstate, frame, throw_flag);
-#else
-  return _PyEval_EvalFrameDefault(frame, throw_flag);
-#endif
-}
-
-#if PY_VERSION_HEX >= 0x030b0000
-
-inline static PyObject *eval_custom_code_py311_plus(PyThreadState *tstate,
-                                                    FrameObject *frame,
-                                                    PyCodeObject *code,
-                                                    int throw_flag) {
-  // Create a new PyInterpreterFrame. Refer to CALL.
-  // PyInterpreterFrame has a head section calls "specials". It follows
-  // a contiguous section containing localplus and interpreter stack space.
-  size_t size = code->co_nlocalsplus + code->co_stacksize + FRAME_SPECIALS_SIZE;
-  CALL_STAT_INC(frames_pushed);
-  _PyInterpreterFrame *shadow = reinterpret_cast<_PyInterpreterFrame *>(
-      malloc(sizeof(PyObject *) * size));
-  if (shadow == nullptr) {
-    VLOG(7) << "Failed to allocate memory for shadow frame.";
-    return nullptr;
-  }
-  // Create a new function object from code object. Refer to MAKE_FUNCTION.
-  PyFunctionObject *func = reinterpret_cast<PyFunctionObject *>(
-      PyFunction_New(reinterpret_cast<PyObject *>(code), frame->f_globals));
-  _PyFrame_InitializeSpecials(shadow, func, nullptr, code->co_nlocalsplus);
-
-  PyObject **fastlocals_old = frame->localsplus;
-  PyObject **fastlocals_new = shadow->localsplus;
-
-  for (size_t i = 0; i < code->co_nlocalsplus; ++i) {
-    fastlocals_new[i] = nullptr;
-  }
-
-  // The namemap to map the name to index in new frame localsplus.
-  PyObject *namemap = PyDict_New();
-  if (namemap == nullptr) {
-    VLOG(7) << "Failed to create namemap.";
-    free(shadow);
-    return nullptr;
-  }
-  for (size_t i = 0; i < code->co_nlocalsplus; ++i) {
-    PyObject *name = PyTuple_GET_ITEM(code->co_localsplusnames, i);
-    PyObject *index = PyLong_FromSize_t(i);
-    PyDict_SetItem(namemap, name, index);
-  }
-  for (size_t i = 0; i < frame->f_code->co_nlocalsplus; ++i) {
-    PyObject *name = PyTuple_GET_ITEM(frame->f_code->co_localsplusnames, i);
-    PyObject *index = PyDict_GetItem(namemap, name);
-    if (index == nullptr) {
-      continue;
-    }
-    Py_XINCREF(fastlocals_old[i]);
-    fastlocals_new[PyLong_AsSize_t(index)] = fastlocals_old[i];
-  }
-
-  PyObject *result = eval_frame_default(tstate, shadow, throw_flag);
-  free(shadow);
-  Py_DECREF(namemap);
-  return result;
-}
-
-#else
-
-inline static PyObject *eval_custom_code_py310_minus(PyThreadState *tstate,
-                                                     FrameObject *frame,
-                                                     PyCodeObject *code,
-                                                     int throw_flag) {
-  Py_ssize_t ncells = 0;
-  Py_ssize_t nfrees = 0;
-  Py_ssize_t nlocals_new = code->co_nlocals;
-  Py_ssize_t nlocals_old = frame->f_code->co_nlocals;
-
-  ncells = PyTuple_GET_SIZE(code->co_cellvars);
-  nfrees = PyTuple_GET_SIZE(code->co_freevars);
-
-  PyFrameObject *shadow = PyFrame_New(tstate, code, frame->f_globals, nullptr);
-  if (shadow == nullptr) {
-    return nullptr;
-  }
-
-  PyObject **fastlocals_old = frame->f_localsplus;
-  PyObject **fastlocals_new = shadow->f_localsplus;
-
-  for (Py_ssize_t i = 0; i < nlocals_old; i++) {
-    Py_XINCREF(fastlocals_old[i]);
-    fastlocals_new[i] = fastlocals_old[i];
-  }
-
-  for (Py_ssize_t i = 0; i < ncells + nfrees; i++) {
-    Py_XINCREF(fastlocals_old[nlocals_old + i]);
-    fastlocals_new[nlocals_new + i] = fastlocals_old[nlocals_old + i];
-  }
-
-  PyObject *result = eval_frame_default(tstate, shadow, throw_flag);
-  Py_DECREF(shadow);
-  return result;
-}
-
-#endif
-
-// Start a new frame and run code in this frame.
-// Execute a piece of code by default frame-hook.
-inline static PyObject *eval_custom_code(PyThreadState *tstate,
-                                         FrameObject *frame,
-                                         PyCodeObject *code,
-                                         int throw_flag) {
-#if PY_VERSION_HEX >= 0x030b0000
-  return eval_custom_code_py311_plus(tstate, frame, code, throw_flag);
-#else
-  return eval_custom_code_py310_minus(tstate, frame, code, throw_flag);
-#endif
-}
-
-static PyObject *_custom_eval_frame(PyThreadState *tstate,
-                                    FrameObject *frame,
-                                    int throw_flag,
-                                    PyObject *callback) {
-// https://peps.python.org/pep-0558/#fast-locals-proxy-implementation-details
-// https://devguide.python.org/internals/interpreter/#all-sorts-of-variables
-#if PY_VERSION_HEX >= 0x030b0000
-  if (frame->owner == FRAME_OWNED_BY_GENERATOR) {
-    return eval_frame_default(tstate, frame, throw_flag);
-  }
-  // PyFrame_FastToLocalsWithError receives a PyFrameObject, but if we created a
-  // PyFrameObject from a PyInterpreterFrame, it will changes the original
-  // PyInterpreterFrame and causes a Segmentation Fault when Fallback to run
-  // original frame. So we pass a PyInterpreterFrame to
-  // _PyFrame_FastToLocalsWithError directly. But this is an internal API, so we
-  // copy many code from CPython project into our project.
-  if (Internal_PyFrame_FastToLocalsWithError(frame) < 0) {
-#else
-  if (PyFrame_FastToLocalsWithError(frame) < 0) {
-#endif
-    return nullptr;
-  }
-
-  // NOTE:(xiongkun): Handle GeneratorExit exception: (Spend a day)
-  // In Python, gen close is also a Python function call that will enter this
-  // function with GeneratorExit set, which will cause the PyObject_CallObject
-  // raise SystemError. So we disable the custom behavior for GeneratorExit. def
-  // func():
-  //     iter = iter([1, 2, 3])
-  //     for i in iter:
-  //         return i # <--- Early return, cause a GeneratorExit thrown,
-  //                  # <--- which Cause the PyObject_CallObject raise
-  //                  SystemError.
-  if (PyErr_ExceptionMatches(PyExc_GeneratorExit)) {
-    return eval_frame_default(tstate, frame, throw_flag);
-  }
-
-  // We don't run the current custom_eval_frame behavior for guards.
-  // So we temporarily set the callback to Py_None to drive the correct behavior
-  // in the shim.
-  eval_frame_callback_set(Py_None);
-
-#if PY_VERSION_HEX >= 0x030b0000
-  PyObject *args = Py_BuildValue("(O)", PyInterpreterFrameProxy_New(frame));
-#else
-  PyObject *args = Py_BuildValue("(O)", frame);
-#endif
-  PyObject *result = PyObject_CallObject(callback, args);
-  Py_DECREF(args);
-  VLOG(7) << "After call eval_frame_function and decrease frame.";
-  // class CustomCode(Protocal):
-  //     code: CodeType | None
-  //     disable_eval_frame: bool
-  // result: CustomCode
-  if (result == nullptr) {
-    // internal exception
-    VLOG(7) << "Error happened.";
-    return nullptr;
-  } else {
-    //  NOTE: Cache is not supported now
-    PyCodeObject *code = reinterpret_cast<PyCodeObject *>(
-        PyObject_GetAttrString(result, "code"));
-    PyObject *disable_eval_frame =
-        PyObject_GetAttrString(result, "disable_eval_frame");
-    PyObject *out;
-    VLOG(7) << "Start eval new frame and code.";
-    if (disable_eval_frame != Py_True) {
-      // Re-enable custom behavior
-      eval_frame_callback_set(callback);
-      if (reinterpret_cast<PyObject *>(code) != Py_None) {
-        out = eval_custom_code(tstate, frame, code, throw_flag);
-      } else {
-        out = eval_frame_default(tstate, frame, throw_flag);
-      }
-    } else {
-      if (reinterpret_cast<PyObject *>(code) != Py_None) {
-        out = eval_custom_code(tstate, frame, code, throw_flag);
-      } else {
-        out = eval_frame_default(tstate, frame, throw_flag);
-      }
-      // Re-enable custom behavior
-      eval_frame_callback_set(callback);
-    }
-    Py_DECREF(result);
-    Py_DECREF(code);
-    return out;
-  }
-}
-
-static PyObject *_custom_eval_frame_shim(PyThreadState *tstate,
-                                         FrameObject *frame,
-                                         int throw_flag) {
-  PyObject *callback = eval_frame_callback_get();
-
-  if (callback == Py_None) {
-    return eval_frame_default(tstate, frame, throw_flag);
-  }
-
-  return _custom_eval_frame(tstate, frame, throw_flag, callback);
-}
-
-#if PY_VERSION_HEX >= 0x03090000
-static PyObject *custom_eval_frame_shim(PyThreadState *tstate,
-                                        FrameObject *frame,
-                                        int throw_flag) {
-  return _custom_eval_frame_shim(tstate, frame, throw_flag);
-}
-#else
-static PyObject *custom_eval_frame_shim(FrameObject *frame, int throw_flag) {
-  PyThreadState *tstate = PyThreadState_GET();
-  return _custom_eval_frame_shim(tstate, frame, throw_flag);
-}
-#endif
-
-static PyObject *set_eval_frame(PyObject *new_callback, PyThreadState *tstate) {
-  // Change the eval frame callback and return the old one
-  //  - None: disables: disable custom callback.
-  //  - Python callable(): enables custom callback.
-  //  NOTE: Cache is not supported now
-  PyObject *old_callback = eval_frame_callback_get();
-
-#if PY_VERSION_HEX >= 0x03090000
-  auto *old_eval_frame = _PyInterpreterState_GetEvalFrameFunc(tstate->interp);
-#else
-  // Function pointer.
-  _PyFrameEvalFunction old_eval_frame = tstate->interp->eval_frame;
-#endif
-
-  // NOTE: multi-threading is not supported now
-  if (old_callback != Py_None && new_callback == Py_None) {
-    if (old_eval_frame != &_PyEval_EvalFrameDefault) {
-      VLOG(7) << "set _PyEval_EvalFrameDefault";
-#if PY_VERSION_HEX >= 0x03090000
-      _PyInterpreterState_SetEvalFrameFunc(tstate->interp,
-                                           &_PyEval_EvalFrameDefault);
-#else
-      tstate->interp->eval_frame = &_PyEval_EvalFrameDefault;
-#endif
-    }
-  } else if (old_callback == Py_None && new_callback != Py_None) {
-    if (old_eval_frame != &custom_eval_frame_shim) {
-      VLOG(7) << "set custom_eval_frame_shim";
-#if PY_VERSION_HEX >= 0x03090000
-      _PyInterpreterState_SetEvalFrameFunc(tstate->interp,
-                                           &custom_eval_frame_shim);
-#else
-      tstate->interp->eval_frame = &custom_eval_frame_shim;
-#endif
-    }
-  }
-
-  Py_INCREF(new_callback);
-  eval_frame_callback_set(new_callback);
-
-  return old_callback;
-}
-
-static PyObject *set_eval_frame_py(PyObject *callback) {
-  if (callback != Py_None && !PyCallable_Check(callback)) {
-    VLOG(7) << "callback is not a callable or none, invalid arguments.";
-    RETURN_PY_NONE
-  }
-  return set_eval_frame(callback, PyThreadState_GET());
-}
-
-PyMODINIT_FUNC PyInit__eval_frame() {
-  int result = PyThread_tss_create(&eval_frame_callback_key);
-  VLOG(7) << "Set PyThread_tss_create return: " << result;
-
-  Py_INCREF(Py_None);
-  eval_frame_callback_set(Py_None);
-
-  return nullptr;
-}
-
 PyTypeObject *g_jit_function_pytype = nullptr;
 using Variable = paddle::framework::Variable;
 
@@ -620,12 +70,6 @@ void BindEvalFrame(pybind11::module *m) {
         return obj;
       },
       py::arg("callback"));
-#if PY_VERSION_HEX >= 0x030b0000
-  if (PyType_Ready(&PyInterpreterFrameProxyType) < 0) {
-    VLOG(7) << "PyInterpreterFrameProxyType has not been ready!";
-  }
-  Py_INCREF(&PyInterpreterFrameProxyType);
-#endif
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/jit.h b/paddle/fluid/pybind/jit.h
index 2d1a2e08d1e89..0472967ef1907 100644
--- a/paddle/fluid/pybind/jit.h
+++ b/paddle/fluid/pybind/jit.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
-#include <Python.h>
 // Avoid a problem with copysign defined in pyconfig.h on Windows.
 #ifdef copysign
 #undef copysign
@@ -22,97 +21,6 @@ limitations under the License. */
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
 
-// see https://bugs.python.org/issue35886
-// If py_version==3.8.*, we need to redefine _PyEvalFrameFunc and the
-// related functions and structs.
-
-#if PY_VERSION_HEX >= 0x03080000 && PY_VERSION_HEX < 0x3090000
-
-typedef PyObject *(*_PyFrameEvalFunction)(struct _frame *, int);
-
-struct _warnings_runtime_state {
-  /* Both 'filters' and 'onceregistry' can be set in warnings.py;
-     get_warnings_attr() will reset these variables accordingly. */
-  PyObject *filters;        /* List */
-  PyObject *once_registry;  /* Dict */
-  PyObject *default_action; /* String */
-  long filters_version;     // NOLINT
-};
-
-struct _is {
-  struct _is *next;
-  struct _ts *tstate_head;
-
-  int64_t id;
-  int64_t id_refcount;
-  int requires_idref;
-  PyThread_type_lock id_mutex;
-
-  int finalizing;
-
-  PyObject *modules;
-  PyObject *modules_by_index;
-  PyObject *sysdict;
-  PyObject *builtins;
-  PyObject *importlib;
-
-  /* Used in Python/sysmodule.c. */
-  int check_interval;
-
-  /* Used in Modules/_threadmodule.c. */
-  long num_threads;  // NOLINT
-  /* Support for runtime thread stack size tuning.
-     A value of 0 means using the platform's default stack size
-     or the size specified by the THREAD_STACK_SIZE macro. */
-  /* Used in Python/thread.c. */
-  size_t pythread_stacksize;
-
-  PyObject *codec_search_path;
-  PyObject *codec_search_cache;
-  PyObject *codec_error_registry;
-  int codecs_initialized;
-
-  /* fs_codec.encoding is initialized to NULL.
-     Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */
-  struct {
-    char *encoding; /* Filesystem encoding (encoded to UTF-8) */
-    char *errors;   /* Filesystem errors (encoded to UTF-8) */
-    _Py_error_handler error_handler;
-  } fs_codec;
-
-  PyConfig config;
-#ifdef HAVE_DLOPEN
-  int dlopenflags;
-#endif
-
-  PyObject *dict; /* Stores per-interpreter state */
-
-  PyObject *builtins_copy;
-  PyObject *import_func;
-  /* Initialized to PyEval_EvalFrameDefault(). */
-  _PyFrameEvalFunction eval_frame;
-
-  Py_ssize_t co_extra_user_count;
-  freefunc co_extra_freefuncs[MAX_CO_EXTRA_USERS];
-
-#ifdef HAVE_FORK
-  PyObject *before_forkers;
-  PyObject *after_forkers_parent;
-  PyObject *after_forkers_child;
-#endif
-  /* AtExit module */
-  void (*pyexitfunc)(PyObject *);
-  PyObject *pyexitmodule;
-
-  uint64_t tstate_next_unique_id;
-
-  struct _warnings_runtime_state warnings;
-
-  PyObject *audit_hooks;
-};
-
-#endif
-
 namespace paddle {
 namespace pybind {
 
diff --git a/test/dygraph_to_static/test_pylayer.py b/test/dygraph_to_static/test_pylayer.py
index ee2d1248e5f63..fca9534c2cd30 100644
--- a/test/dygraph_to_static/test_pylayer.py
+++ b/test/dygraph_to_static/test_pylayer.py
@@ -14,13 +14,19 @@
 
 """Tests for PyLayer of Dynamic-to-Static.
 Only test simple cases here."""
+import sys
+from pathlib import Path
+
+sys.path.append(
+    str(Path(__file__).absolute().parent.parent.joinpath("legacy_test"))
+)
 
 import os
 import tempfile
 import unittest
 
 import numpy as np
-from legacy_test.test_jit_save_load import train
+from test_jit_save_load import train
 
 import paddle
 from paddle.autograd.py_layer import PyLayer

From 80214610d5aa8d9d2bfda2d5653d76cb84d3a27c Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Sat, 23 Sep 2023 13:33:12 +0800
Subject: [PATCH 080/115] [PIR]Add initialized interface of OpResult for Python
 (#57595)

* perfect pybind

* resolve conflict
---
 paddle/fluid/pybind/pir.cc       | 12 ++++++++++++
 test/ir/new_ir/test_ir_pybind.py |  5 +++++
 2 files changed, 17 insertions(+)

diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc
index 56accc98a3fec..ed680cfb58803 100644
--- a/paddle/fluid/pybind/pir.cc
+++ b/paddle/fluid/pybind/pir.cc
@@ -516,6 +516,10 @@ void BindOpResult(py::module *m) {
         when build network.
   )DOC");
   g_ir_opresult_pytype = reinterpret_cast<PyTypeObject *>(op_result.ptr());
+  op_result.def(
+      "__init__",
+      [](OpResult &self) { new (&self) OpResult(); },
+      pybind11::return_value_policy::reference);
 
   // For basaic operators
   OVERRIDE_OPERATOR_FOR_EACH(__add__, add, 1.0, other, true);
@@ -578,6 +582,14 @@ void BindOpResult(py::module *m) {
                   "persistable"));
             }
           })
+      .def("initialized",
+           [](OpResult &self) {
+             if (self.impl() == nullptr || self.type().storage() == nullptr) {
+               return false;
+             } else {
+               return true;
+             }
+           })
       .def("first_use", &OpResult::first_use, return_value_policy::reference)
       .def("has_one_use", &Value::HasOneUse)
       .def("use_empty", &OpResult::use_empty)
diff --git a/test/ir/new_ir/test_ir_pybind.py b/test/ir/new_ir/test_ir_pybind.py
index 2aaad209d36f7..6434b0eb65268 100644
--- a/test/ir/new_ir/test_ir_pybind.py
+++ b/test/ir/new_ir/test_ir_pybind.py
@@ -130,6 +130,11 @@ def test_value(self):
 
         self.assertEqual(add_op.result(0).use_empty(), True)
 
+        self.assertEqual(add_op.result(0).initialized(), True)
+
+        uninit_op_result = paddle.pir.OpResult()
+        self.assertEqual(uninit_op_result.initialized(), False)
+
     def test_type(self):
         newir_program = get_ir_program()
         matmul_op = newir_program.global_block().ops[1]

From 7fbc45de30ae0aa80fb9ce3019f6ef0d0fb2c8a1 Mon Sep 17 00:00:00 2001
From: Sonder <55493212+AndSonder@users.noreply.github.com>
Date: Sat, 23 Sep 2023 15:40:13 +0800
Subject: [PATCH 081/115] Support control flow for static build [Step 3:
 support while] (#57616)

* add conditional_block to OperatorBasesHandledInStaticBuild

* run op in FakeInitializeOutputsForOperatorBase

* add init_success judge

* fix build error

* fix

* add SetSubBlockCore func

* add PreStaticRun func

* add PreStaticRun to interpreter_base and new_ir_inter

* recover codes

* add PreStaticBuild and BlockCanBeStaticBuilt

* fix logic about RunPreStaticBuild

* change CreateOpFromOpDesc type

* fix build error

* fix build error

* remove IsOperatorBasesHandledInStaticBuild

* recover BlockCanBeStaticBuilt

* add logic about conditional_block run static build

* recover codes

* recover BlockCanBeStaticBuilt

* support static build condational block op when condational block is the last op in the block

* fix error

* fix logic about last op

* fit for sub block can't open static build

* add IsStaticBuild

* fix build error

* fit logic when sub block can't open static build

* close static build when sub_block don't support static_build

* recover third party

* add is_skil_fake_init logic

* set the backend of the lamb

* change start index

* add if conditional for cal is_skip_fake_init

* change name

* close static_build for test_conditional_block

* add static buiild support for conditional block in case of the output's dtype/place is changed but the following op is not use this output

* fix logic error

* fix timeout error

* fix

* remove useless codes

* fix

* fix

* fix build error

* move GetVarsInfo and RunPreStaticBuild from opeartor to static_build

* fix lamb backend registe

* fix build error

* fix build error

* remove lamp op test from new_ir_op_test_white_list

* fix

* move generating following_input_vars logic to static_build.cc

* remove HasInfo

* fix build error

* recover codes and turn off the flag

* add support for while

* fix
---
 .../new_executor/interpreter/static_build.cc  | 216 +++++++++++++++++-
 .../fluid/operators/controlflow/while_op.cc   |  34 ---
 .../operators/controlflow/while_op_helper.cc  |  34 +++
 .../operators/controlflow/while_op_helper.h   |   6 +
 4 files changed, 247 insertions(+), 43 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc b/paddle/fluid/framework/new_executor/interpreter/static_build.cc
index 67b75bb523711..3751ee0a03db4 100644
--- a/paddle/fluid/framework/new_executor/interpreter/static_build.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc
@@ -18,15 +18,20 @@
 #include "paddle/fluid/framework/new_executor/new_executor_defs.h"
 #include "paddle/fluid/framework/new_executor/standalone_executor.h"
 #include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/operators/controlflow/control_flow_op_helper.h"
+#include "paddle/fluid/operators/controlflow/while_op_helper.h"
 #include "paddle/fluid/operators/reader/buffered_reader.h"
+#include "paddle/fluid/platform/flags.h"
 
 #ifdef PADDLE_WITH_DNNL
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
 
+PHI_DECLARE_bool(cache_inference_while_scope);
+
 // These Ops is OperatorBase, but we have been handle them in static build
-std::set<std::string> OperatorBasesHandledInStaticBuild = {"read",
-                                                           "conditional_block"};
+std::set<std::string> OperatorBasesHandledInStaticBuild = {
+    "read", "conditional_block", "while"};
 
 std::set<std::string> OperatorBasesMustRunInStaticBuild = {
     "create_double_buffer_reader", "create_py_reader"};
@@ -386,9 +391,9 @@ void FakeInitializeTensorBase(const platform::DeviceContext& dev_ctx,
   }
 }
 
-void RunPreStaticBuild(const framework::Scope& scope,
-                       const platform::Place& dev_place,
-                       const OperatorBase& op) {
+void RunConditionalBlockPreStaticBuild(const framework::Scope& scope,
+                                       const platform::Place& dev_place,
+                                       const OperatorBase& op) {
   auto* scope_var = scope.FindVar(op.Output("Scope"));
   PADDLE_ENFORCE_NOT_NULL(
       scope_var,
@@ -434,6 +439,193 @@ void RunPreStaticBuild(const framework::Scope& scope,
   core->Build({}, &op_func_nodes);
 }
 
+void RunWhileBlockPreStaticBuild(const framework::Scope& scope,
+                                 const platform::Place& dev_place,
+                                 const OperatorBase& op) {
+  PADDLE_ENFORCE_NOT_NULL(
+      scope.FindVar(op.Input("Condition")),
+      platform::errors::NotFound("Input(Condition) of WhileOp is not found."));
+
+#ifdef PADDLE_WITH_DNNL
+  // Executor on being destroyed clears oneDNN cache and resets
+  // registered model data layout. This is unwanted for nested
+  // Executors (executors declared inside control ops)
+  platform::DontClearMKLDNNCache(dev_place);
+#endif
+  auto* block = op.Attr<framework::BlockDesc*>("sub_block");
+
+  // get device context from pool
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(dev_place);
+
+  bool is_test = op.Attr<bool>("is_test");
+
+  std::set<std::string> no_copy_var_names;
+  if (!is_test) {
+    // set all persistable parameters into no_copy_var_names.
+    auto* global_block = block;
+
+    while (global_block->ID() != 0) global_block = global_block->ParentBlock();
+    auto all_vars = global_block->AllVars();
+    std::for_each(all_vars.begin(),
+                  all_vars.end(),
+                  [&no_copy_var_names](framework::VarDesc* var) {
+                    if (var->IsParameter())
+                      no_copy_var_names.insert(var->Name());
+                  });
+
+    const std::vector<framework::OpDesc*>& all_ops = block->AllOps();
+    for (const framework::OpDesc* item : all_ops) {
+      const framework::VariableNameMap& input_var_names = item->Inputs();
+      const framework::VariableNameMap& output_var_names = item->Outputs();
+      for (auto& ipt : input_var_names) {
+        for (const std::string& var_name : ipt.second) {
+          if (operators::StrInVaraiableNameMap(var_name, output_var_names)) {
+            no_copy_var_names.insert(var_name);
+          }
+        }
+      }
+    }
+  }
+
+  auto step_scopes = scope.FindVar(op.Output("StepScopes"))
+                         ->GetMutable<std::vector<framework::Scope*>>();
+
+  if (!step_scopes->empty()) {
+    platform::DeviceContextPool::Instance().Get(dev_place)->Wait();
+    for (auto& s : *step_scopes) {
+      if (scope.HasKid(s)) {
+        scope.DeleteScope(s);
+      }
+    }
+    step_scopes->clear();
+  }
+
+  PADDLE_ENFORCE_EQ(step_scopes->size(),
+                    0,
+                    platform::errors::PreconditionNotMet(
+                        "The Output(StepScope) of WhileOp should be empty."));
+
+  auto& skip_vars =
+      op.Attr<std::vector<std::string>>("skip_eager_deletion_vars");
+
+  // note(lvyongkang): The assign op in while loop may change the place of
+  // variable. However, InterpreterCore fix the kernel of every ops during its
+  // first run. A cpu tensor may become gpu tensor after first run. This will
+  // lead to segmetation fault when it's used in a cpu kernel. Here we record
+  // the place of every inputs and restore their place after
+  // InterpreterCore.run().
+  std::map<std::string, phi::Place> input_var_original_places;
+  for (const auto& in_name : op.Inputs("X")) {
+    framework::Variable* var = scope.FindVar(in_name);
+    if (var == nullptr) {
+      VLOG(4) << "[while op]"
+              << "input not found:" << in_name;
+    }
+
+    if (var->Type() == framework::proto::VarType::LOD_TENSOR) {
+      input_var_original_places[in_name] =
+          (var->Get<phi::DenseTensor>()).place();
+    } else {
+      VLOG(10) << "[while op]"
+               << "skip backup input " << in_name << " type:"
+               << framework::TransToPhiDataType(
+                      framework::ToVarType(var->Type()));
+    }
+  }
+
+  LOG_FIRST_N(INFO, 1) << "[ControlFlow][WhileOp] New Executor is Running.";
+  std::unique_ptr<InterpreterCore> core;
+
+  framework::Scope placeholder;  // Don't care if it's valid, just for
+                                 // initialize InterpreterCore
+  framework::interpreter::ExecutionConfig execution_config;
+  execution_config.create_local_scope = false;
+  execution_config.used_for_control_flow_op = true;
+  execution_config.skip_gc_vars =
+      std::set<std::string>(skip_vars.begin(), skip_vars.end());
+
+  core.reset(new framework::InterpreterCore(
+      dev_place, *block, &placeholder, execution_config));
+
+  if (!is_test) {
+    auto& current_scope = scope.NewScope();
+    step_scopes->push_back(&current_scope);
+
+    std::vector<std::string> rename_vars;
+    for (const std::string& input_var_name : op.Inputs("X")) {
+      if (no_copy_var_names.find(input_var_name) == no_copy_var_names.end()) {
+        std::string input_var_rename = input_var_name + "@TMP_COPY";
+        framework::Variable* input_var = scope.FindVar(input_var_name);
+        if (input_var->IsType<phi::DenseTensor>()) {
+          rename_vars.push_back(input_var_rename);
+          auto input_var_tensor = input_var->Get<phi::DenseTensor>();
+          auto* rename_input_var_tensor = current_scope.Var(input_var_rename)
+                                              ->GetMutable<phi::DenseTensor>();
+          framework::TensorCopy(
+              input_var_tensor, dev_place, rename_input_var_tensor);
+          rename_input_var_tensor->set_lod(input_var_tensor.lod());
+        }
+      }
+    }
+
+    operators::BuildScopeForControlFlowOp(*core, *block, &current_scope);
+    core->reset_scope(&current_scope);
+
+    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
+    core->Build({}, &op_func_nodes);
+
+    // restore inputs place
+    for (const auto& n : input_var_original_places) {
+      const std::string& in_name = n.first;
+      const phi::Place& original_place = n.second;
+      // input vars exist in `scope` not `current_scope`
+      operators::TransferVariablePlace(
+          &scope, in_name, original_place, dev_ctx);
+    }
+
+    for (auto& var_rename : rename_vars) {
+      std::string input_var_name =
+          var_rename.substr(0, var_rename.size() - strlen("@TMP_COPY"));
+      current_scope.Rename(var_rename, input_var_name);
+    }
+  } else {
+    framework::Scope* current_scope = nullptr;
+    if (!FLAGS_cache_inference_while_scope) {
+      current_scope = &(scope.NewScope());
+      operators::BuildScopeForControlFlowOp(*core, *block, current_scope);
+      core->reset_scope(current_scope);
+    } else {
+      auto cached_inference_scope = &(scope.NewScope());
+      operators::BuildScopeForControlFlowOp(
+          *core, *block, cached_inference_scope);
+      core->reset_scope(cached_inference_scope);
+      current_scope = cached_inference_scope;
+    }
+
+    for (auto& name : current_scope->LocalVarNames()) {
+      auto* var = current_scope->Var(name);
+      if (var->IsType<phi::DenseTensor>()) {
+        // Clear all lod information for all lod_tensors.
+        auto* t = var->GetMutable<phi::DenseTensor>();
+        framework::LoD empty_lod;
+        t->set_lod(empty_lod);
+      } else if (var->IsType<framework::LoDTensorArray>()) {
+        // Clear elements of all tensor arrays.
+        auto* t = var->GetMutable<framework::LoDTensorArray>();
+        t->clear();
+      }
+    }
+
+    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
+    core->Build({}, &op_func_nodes);
+
+    if (!FLAGS_cache_inference_while_scope) {
+      scope.DeleteScope(current_scope);
+    }
+  }
+}
+
 void FakeInitializeOutputsForOperatorBase(
     const OperatorBase& op,
     const phi::Place& place,
@@ -447,7 +639,7 @@ void FakeInitializeOutputsForOperatorBase(
   phi::DeviceContext* dev_ctx =
       platform::DeviceContextPool::Instance().Get(place);
 
-  if (op_type == "conditional_block") {
+  if (op_type == "conditional_block" || op_type == "while") {
     // Note(sonder): skip fake init for conditional_block when there is no
     // op with kernel after it.
     bool skip_fake_init = true;
@@ -456,7 +648,7 @@ void FakeInitializeOutputsForOperatorBase(
     for (size_t i = 0; i < following_ops.size(); ++i) {
       if (dynamic_cast<framework::OperatorWithKernel*>(
               following_ops[i].get()) != nullptr) {
-        VLOG(4) << "Find op with kernel after conditional_block : "
+        VLOG(4) << "Find op with kernel after " << op_type << ": "
                 << following_ops[i]->Type();
         skip_fake_init = false;
         auto input_vars_info = GetVarsInfo(
@@ -474,7 +666,12 @@ void FakeInitializeOutputsForOperatorBase(
     const std::vector<VarMetaInfo> out_var_info_before_build =
         GetVarsInfo(scope, op.Outputs(), op);
 
-    RunPreStaticBuild(*scope, place, op);
+    if (op_type == "conditional_block") {
+      RunConditionalBlockPreStaticBuild(*scope, place, op);
+    } else {
+      RunWhileBlockPreStaticBuild(*scope, place, op);
+    }
+
     const std::vector<VarMetaInfo> out_var_info_after_build =
         GetVarsInfo(scope, op.Outputs(), op);
 
@@ -487,10 +684,11 @@ void FakeInitializeOutputsForOperatorBase(
         auto var_name = out_var_info_before_build[i].name_;
         if (following_input_vars.count(var_name)) {
           PADDLE_THROW(phi::errors::PreconditionNotMet(
-              "The output %s s' dtype/place of conditional_block is "
+              "The output %s s' dtype/place of %s is "
               "changed after static build. Befer static build, the "
               "dtype is %s, place is %s. After static "
               "build, the dtype is %s, place is %s.",
+              op_type,
               var_name,
               out_var_info_before_build[i].dtype_,
               out_var_info_before_build[i].place_,
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index 8cf7dc24c1d3f..45f24931c6f99 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -51,40 +51,6 @@ static std::string GetSkipEagerDeletionVarsDebugString(
   return str;
 }
 
-static void TransferVariablePlace(const framework::Scope *scope,
-                                  const std::string &var_name,
-                                  const phi::Place &dst_place,
-                                  const platform::DeviceContext &dev_ctx) {
-  framework::Variable *var = scope->FindVar(var_name);
-  if (var == nullptr) {
-    VLOG(4) << "[TransferVariablePlace]"
-            << "lost in_var: " << var_name;
-    return;
-  }
-  if (var->Type() != framework::proto::VarType::LOD_TENSOR) {
-    VLOG(10) << "[TransferVariablePlace]" << var_name << " type changed:"
-             << framework::TransToPhiDataType(
-                    framework::ToVarType(var->Type()));
-    return;
-  }
-  phi::DenseTensor *t = var->GetMutable<phi::DenseTensor>();
-  if (t->place() == dst_place) {
-    VLOG(10) << "[TransferVariablePlace]"
-             << "no need transfer: " << var_name;
-    return;
-  }
-
-  phi::DenseTensor *new_t = new phi::DenseTensor;
-  framework::TensorCopy(*t, dst_place, new_t);
-  dev_ctx.Wait();
-
-  t->set_meta(new_t->meta());
-  t->ResetHolder(new_t->Holder());
-
-  VLOG(4) << "[TransferVariablePlace]" << var_name
-          << " place: " << new_t->place();
-}
-
 }  // namespace
 
 class WhileOp : public framework::OperatorBase {
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
index 6627ba0482b26..8ddce0da7faac 100644
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -250,5 +250,39 @@ bool StrInVaraiableNameMap(const std::string &name,
   return false;
 }
 
+void TransferVariablePlace(const framework::Scope *scope,
+                           const std::string &var_name,
+                           const phi::Place &dst_place,
+                           const platform::DeviceContext &dev_ctx) {
+  framework::Variable *var = scope->FindVar(var_name);
+  if (var == nullptr) {
+    VLOG(4) << "[TransferVariablePlace]"
+            << "lost in_var: " << var_name;
+    return;
+  }
+  if (var->Type() != framework::proto::VarType::LOD_TENSOR) {
+    VLOG(10) << "[TransferVariablePlace]" << var_name << " type changed:"
+             << framework::TransToPhiDataType(
+                    framework::ToVarType(var->Type()));
+    return;
+  }
+  phi::DenseTensor *t = var->GetMutable<phi::DenseTensor>();
+  if (t->place() == dst_place) {
+    VLOG(10) << "[TransferVariablePlace]"
+             << "no need transfer: " << var_name;
+    return;
+  }
+
+  phi::DenseTensor *new_t = new phi::DenseTensor;
+  framework::TensorCopy(*t, dst_place, new_t);
+  dev_ctx.Wait();
+
+  t->set_meta(new_t->meta());
+  t->ResetHolder(new_t->Holder());
+
+  VLOG(4) << "[TransferVariablePlace]" << var_name
+          << " place: " << new_t->place();
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h
index 926b207832e07..7aa4b6418b6bc 100644
--- a/paddle/fluid/operators/controlflow/while_op_helper.h
+++ b/paddle/fluid/operators/controlflow/while_op_helper.h
@@ -19,6 +19,7 @@
 #include <vector>
 
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/controlflow/op_variant.h"
 
 namespace phi {
@@ -58,5 +59,10 @@ bool GetCondData(const phi::DenseTensor &cond);
 bool StrInVaraiableNameMap(const std::string &,
                            const framework::VariableNameMap &);
 
+void TransferVariablePlace(const framework::Scope *scope,
+                           const std::string &var_name,
+                           const phi::Place &dst_place,
+                           const platform::DeviceContext &dev_ctx);
+
 }  // namespace operators
 }  // namespace paddle

From 9269321940b80f2dafc42c4e2c3f8bd40dd890b8 Mon Sep 17 00:00:00 2001
From: winter-wang <78149749+winter-wang@users.noreply.github.com>
Date: Sat, 23 Sep 2023 16:29:58 +0800
Subject: [PATCH 082/115] [PIR] rectify yield input from opresult to value.
 (#57635)

---
 paddle/pir/dialect/control_flow/ir/cf_ops.cc               | 4 ++--
 paddle/pir/dialect/control_flow/ir/cf_ops.h                | 2 +-
 test/cpp/new_executor/standalone_executor_new_ir_test.cc   | 4 ++--
 test/cpp/pir/cinn/group_op_test.cc                         | 4 ++--
 test/cpp/pir/control_flow_dialect/if_op_test.cc            | 4 ++--
 test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc | 4 ++--
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/paddle/pir/dialect/control_flow/ir/cf_ops.cc b/paddle/pir/dialect/control_flow/ir/cf_ops.cc
index 6147ab8ee40e4..7981a6ab96396 100644
--- a/paddle/pir/dialect/control_flow/ir/cf_ops.cc
+++ b/paddle/pir/dialect/control_flow/ir/cf_ops.cc
@@ -18,8 +18,8 @@ namespace pir {
 
 void YieldOp::Build(Builder &builder,
                     OperationArgument &argument,
-                    std::vector<OpResult> &&inputs) {
-  argument.AddInputs(inputs.begin(), inputs.end());
+                    const std::vector<Value> &inputs) {
+  argument.AddInputs(inputs);
 }
 }  // namespace pir
 
diff --git a/paddle/pir/dialect/control_flow/ir/cf_ops.h b/paddle/pir/dialect/control_flow/ir/cf_ops.h
index 2f69aa9147224..3689920e1bce6 100644
--- a/paddle/pir/dialect/control_flow/ir/cf_ops.h
+++ b/paddle/pir/dialect/control_flow/ir/cf_ops.h
@@ -27,7 +27,7 @@ class IR_API YieldOp : public Op<YieldOp> {
 
   static void Build(Builder &builder,             // NOLINT
                     OperationArgument &argument,  // NOLINT
-                    std::vector<OpResult> &&inputs);
+                    const std::vector<Value> &Value);
   void Verify() {}
 };
 }  // namespace pir
diff --git a/test/cpp/new_executor/standalone_executor_new_ir_test.cc b/test/cpp/new_executor/standalone_executor_new_ir_test.cc
index eac996ffebe0f..02ca49d180baa 100644
--- a/test/cpp/new_executor/standalone_executor_new_ir_test.cc
+++ b/test/cpp/new_executor/standalone_executor_new_ir_test.cc
@@ -242,7 +242,7 @@ TEST(StandaloneExecutor, if_op) {
 
   auto full_op_1 = builder.Build<paddle::dialect::FullOp>(
       std::vector<int64_t>{2}, true, phi::DataType::BOOL);
-  builder.Build<pir::YieldOp>(std::vector<pir::OpResult>{full_op_1.out()});
+  builder.Build<pir::YieldOp>(std::vector<pir::Value>{full_op_1.out()});
 
   pir::Block* false_block = if_op.false_block();
 
@@ -250,7 +250,7 @@ TEST(StandaloneExecutor, if_op) {
 
   auto full_op_2 = builder.Build<paddle::dialect::FullOp>(
       std::vector<int64_t>{3}, true, phi::DataType::BOOL);
-  builder.Build<pir::YieldOp>(std::vector<pir::OpResult>{full_op_2.out()});
+  builder.Build<pir::YieldOp>(std::vector<pir::Value>{full_op_2.out()});
 
   auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(&program);
 
diff --git a/test/cpp/pir/cinn/group_op_test.cc b/test/cpp/pir/cinn/group_op_test.cc
index bc1d65b51ac01..a5bd90a54f0f0 100644
--- a/test/cpp/pir/cinn/group_op_test.cc
+++ b/test/cpp/pir/cinn/group_op_test.cc
@@ -55,7 +55,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgram() {
   builder.SetInsertionPointToEnd(block1);
   auto full_op_x = builder.Build<paddle::dialect::FullOp>(
       shape, value_one, phi::DataType::FLOAT32, phi::GPUPlace());
-  builder.Build<::pir::YieldOp>(std::vector<::pir::OpResult>{full_op_x.out()});
+  builder.Build<::pir::YieldOp>(std::vector<::pir::Value>{full_op_x.out()});
 
   builder.SetInsertionPointToEnd(program->block());
   auto group_op2 = builder.Build<cinn::dialect::GroupOp>(
@@ -67,7 +67,7 @@ std::shared_ptr<::pir::Program> BuildGroupProgram() {
   auto relu_op_x = builder.Build<paddle::dialect::ReluOp>(tan_op_x->result(0));
   auto tan_op_y = builder.Build<paddle::dialect::TanOp>(relu_op_x->result(0));
   auto relu_op_y = builder.Build<paddle::dialect::ReluOp>(tan_op_y->result(0));
-  builder.Build<::pir::YieldOp>(std::vector<::pir::OpResult>{relu_op_y.out()});
+  builder.Build<::pir::YieldOp>(std::vector<::pir::Value>{relu_op_y.out()});
   return program;
 }
 
diff --git a/test/cpp/pir/control_flow_dialect/if_op_test.cc b/test/cpp/pir/control_flow_dialect/if_op_test.cc
index f4a7a7790866d..f2e49b150b7bc 100644
--- a/test/cpp/pir/control_flow_dialect/if_op_test.cc
+++ b/test/cpp/pir/control_flow_dialect/if_op_test.cc
@@ -44,7 +44,7 @@ TEST(if_op_test, base) {
 
   auto full_op_1 = builder.Build<paddle::dialect::FullOp>(
       std::vector<int64_t>{2}, true, phi::DataType::BOOL);
-  builder.Build<pir::YieldOp>(std::vector<pir::OpResult>{full_op_1.out()});
+  builder.Build<pir::YieldOp>(std::vector<pir::Value>{full_op_1.out()});
 
   pir::Block* false_block = if_op.false_block();
 
@@ -52,7 +52,7 @@ TEST(if_op_test, base) {
 
   auto full_op_2 = builder.Build<paddle::dialect::FullOp>(
       std::vector<int64_t>{3}, true, phi::DataType::BOOL);
-  builder.Build<pir::YieldOp>(std::vector<pir::OpResult>{full_op_2.out()});
+  builder.Build<pir::YieldOp>(std::vector<pir::Value>{full_op_2.out()});
 
   std::stringstream ss;
   program.Print(ss);
diff --git a/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc b/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc
index 52773cc96e928..1c0eaebe0a909 100644
--- a/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc
+++ b/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc
@@ -195,7 +195,7 @@ TEST(kernel_dialect, cond_op_test) {
 
   auto full_op_1 = builder.Build<paddle::dialect::FullOp>(
       std::vector<int64_t>{2}, true, phi::DataType::BOOL);
-  builder.Build<pir::YieldOp>(std::vector<pir::OpResult>{full_op_1.out()});
+  builder.Build<pir::YieldOp>(std::vector<pir::Value>{full_op_1.out()});
 
   pir::Block* false_block = if_op.false_block();
 
@@ -203,7 +203,7 @@ TEST(kernel_dialect, cond_op_test) {
 
   auto full_op_2 = builder.Build<paddle::dialect::FullOp>(
       std::vector<int64_t>{3}, true, phi::DataType::BOOL);
-  builder.Build<pir::YieldOp>(std::vector<pir::OpResult>{full_op_2.out()});
+  builder.Build<pir::YieldOp>(std::vector<pir::Value>{full_op_2.out()});
 
   program.Print(std::cout);
   auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(&program);

From 832309bd67904d4d8b47e9f5f91f56d9dd2b2472 Mon Sep 17 00:00:00 2001
From: Nyakku Shigure <sigure.qaq@gmail.com>
Date: Sat, 23 Sep 2023 16:40:49 +0800
Subject: [PATCH 083/115] [SOT] clean code in eval frame and fix dy2st uts
 (#57662)

---
 paddle/fluid/pybind/eval_frame.c              |  5 +
 paddle/fluid/pybind/eval_frame.h              | 91 -------------------
 test/cpp/pir/tools/CMakeLists.txt             |  2 +-
 .../test_new_ir_selectedrows.py               | 13 +--
 4 files changed, 13 insertions(+), 98 deletions(-)

diff --git a/paddle/fluid/pybind/eval_frame.c b/paddle/fluid/pybind/eval_frame.c
index 0254e1dce1f0a..3030caa761470 100644
--- a/paddle/fluid/pybind/eval_frame.c
+++ b/paddle/fluid/pybind/eval_frame.c
@@ -17,6 +17,11 @@ limitations under the License. */
 #include <Python.h>
 #include <frameobject.h>
 
+#if PY_VERSION_HEX >= 0x03080000 && PY_VERSION_HEX < 0x3090000
+#define Py_BUILD_CORE  // internal/pycore_pymem.h need this macro
+#include <internal/pycore_pystate.h>
+#undef Py_BUILD_CORE
+#endif
 #if PY_VERSION_HEX < 0x030b0000
 #include <code.h>
 #endif
diff --git a/paddle/fluid/pybind/eval_frame.h b/paddle/fluid/pybind/eval_frame.h
index 383d2d7a5bc00..803cdb7088f7b 100644
--- a/paddle/fluid/pybind/eval_frame.h
+++ b/paddle/fluid/pybind/eval_frame.h
@@ -19,97 +19,6 @@ extern "C" {
 
 #include <Python.h>
 
-// see https://bugs.python.org/issue35886
-// If py_version==3.8.*, we need to redefine _PyEvalFrameFunc and the
-// related functions and structs.
-
-#if PY_VERSION_HEX >= 0x03080000 && PY_VERSION_HEX < 0x3090000
-
-typedef PyObject *(*_PyFrameEvalFunction)(struct _frame *, int);
-
-struct _warnings_runtime_state {
-  /* Both 'filters' and 'onceregistry' can be set in warnings.py;
-     get_warnings_attr() will reset these variables accordingly. */
-  PyObject *filters;        /* List */
-  PyObject *once_registry;  /* Dict */
-  PyObject *default_action; /* String */
-  long filters_version;     // NOLINT
-};
-
-struct _is {
-  struct _is *next;
-  struct _ts *tstate_head;
-
-  int64_t id;
-  int64_t id_refcount;
-  int requires_idref;
-  PyThread_type_lock id_mutex;
-
-  int finalizing;
-
-  PyObject *modules;
-  PyObject *modules_by_index;
-  PyObject *sysdict;
-  PyObject *builtins;
-  PyObject *importlib;
-
-  /* Used in Python/sysmodule.c. */
-  int check_interval;
-
-  /* Used in Modules/_threadmodule.c. */
-  long num_threads;  // NOLINT
-  /* Support for runtime thread stack size tuning.
-     A value of 0 means using the platform's default stack size
-     or the size specified by the THREAD_STACK_SIZE macro. */
-  /* Used in Python/thread.c. */
-  size_t pythread_stacksize;
-
-  PyObject *codec_search_path;
-  PyObject *codec_search_cache;
-  PyObject *codec_error_registry;
-  int codecs_initialized;
-
-  /* fs_codec.encoding is initialized to NULL.
-     Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */
-  struct {
-    char *encoding; /* Filesystem encoding (encoded to UTF-8) */
-    char *errors;   /* Filesystem errors (encoded to UTF-8) */
-    _Py_error_handler error_handler;
-  } fs_codec;
-
-  PyConfig config;
-#ifdef HAVE_DLOPEN
-  int dlopenflags;
-#endif
-
-  PyObject *dict; /* Stores per-interpreter state */
-
-  PyObject *builtins_copy;
-  PyObject *import_func;
-  /* Initialized to PyEval_EvalFrameDefault(). */
-  _PyFrameEvalFunction eval_frame;
-
-  Py_ssize_t co_extra_user_count;
-  freefunc co_extra_freefuncs[MAX_CO_EXTRA_USERS];
-
-#ifdef HAVE_FORK
-  PyObject *before_forkers;
-  PyObject *after_forkers_parent;
-  PyObject *after_forkers_child;
-#endif
-  /* AtExit module */
-  void (*pyexitfunc)(PyObject *);
-  PyObject *pyexitmodule;
-
-  uint64_t tstate_next_unique_id;
-
-  struct _warnings_runtime_state warnings;
-
-  PyObject *audit_hooks;
-};
-
-#endif
-
 PyObject *set_eval_frame_py(PyObject *callback);
 PyMODINIT_FUNC PyInit__eval_frame();
 
diff --git a/test/cpp/pir/tools/CMakeLists.txt b/test/cpp/pir/tools/CMakeLists.txt
index 5a1f073698833..24273327e22fd 100644
--- a/test/cpp/pir/tools/CMakeLists.txt
+++ b/test/cpp/pir/tools/CMakeLists.txt
@@ -1,4 +1,4 @@
 cc_library(
   test_dialect
   SRCS test_dialect.cc test_op.cc test_trait.cc test_interface.cc
-  DEPS pir)
+  DEPS pir gtest)
diff --git a/test/dygraph_to_static/test_new_ir_selectedrows.py b/test/dygraph_to_static/test_new_ir_selectedrows.py
index 13563d73b1753..7d87a48fe7858 100644
--- a/test/dygraph_to_static/test_new_ir_selectedrows.py
+++ b/test/dygraph_to_static/test_new_ir_selectedrows.py
@@ -15,7 +15,10 @@
 import random
 import unittest
 
-from dygraph_to_static_util import test_and_compare_with_new_ir
+from dygraph_to_static_util import (
+    enable_fallback_guard,
+    test_and_compare_with_new_ir,
+)
 
 import paddle
 from paddle.jit.api import to_static
@@ -53,7 +56,6 @@ def forward(self, x):
         return x
 
 
-@to_static
 def train(net, adam, x):
     loss_data = []
     for i in range(10):
@@ -75,7 +77,6 @@ def train_dygraph():
         parameters=net.parameters(), learning_rate=0.01, grad_clip=clip
     )
 
-    paddle.jit.enable_to_static(False)
     return train(net, adam, x)
 
 
@@ -89,8 +90,7 @@ def train_static():
         parameters=net.parameters(), learning_rate=0.01, grad_clip=clip
     )
 
-    paddle.jit.enable_to_static(True)
-    return train(net, adam, x)
+    return to_static(train)(net, adam, x)
 
 
 class TestSimnet(unittest.TestCase):
@@ -104,4 +104,5 @@ def test_dygraph_static_same_loss(self):
 
 
 if __name__ == '__main__':
-    unittest.main()
+    with enable_fallback_guard("False"):
+        unittest.main()

From 19a8f0aa263a8d0595f7e328077cc2f48eca547f Mon Sep 17 00:00:00 2001
From: umiswing <umiswing@foxmail.com>
Date: Sun, 24 Sep 2023 11:03:32 +0800
Subject: [PATCH 084/115] Additional mask support on FA2 (#57276)

* Add addition mask support. Tested on FlashAttnKernel.

* Fix bug in fwd (temporarily).
Add masked support on bwd.
Unpadded kernel to be tested.

* Add unscale on padded kernel.

* Add varlen mask.

* Remove redundant compute_scale_q

* Remove redundant comment.
Fix ci: PADDLE_ENFORCE format.
Remove test case: return_softmax && dropout==0

* Add mask type check.

* Update submodules.
---
 .../phi/kernels/gpu/flash_attn_grad_kernel.cu | 441 +++++------------
 paddle/phi/kernels/gpu/flash_attn_kernel.cu   | 456 +++++-------------
 paddle/phi/kernels/gpu/flash_attn_utils.h     |  87 ++--
 test/legacy_test/test_flash_attention.py      |   4 +-
 third_party/flashattn                         |   2 +-
 5 files changed, 305 insertions(+), 685 deletions(-)

diff --git a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
index fae308008b460..0296c2afee76f 100644
--- a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
@@ -33,155 +33,6 @@ int get_num_split() {
   return FLAGS_cudnn_deterministic ? 1 : 0;
 }
 
-template <typename T, typename Context>
-void FlashAttnUnpaddedGradImpl(const Context& ctx,
-                               const DenseTensor& q,
-                               const DenseTensor& k,
-                               const DenseTensor& v,
-                               const DenseTensor& cu_seqlens_q,
-                               const DenseTensor& cu_seqlens_k,
-                               const DenseTensor& out,
-                               const DenseTensor& softmax_lse,
-                               const DenseTensor& seed_offset,
-                               const paddle::optional<DenseTensor>& attn_mask,
-                               const DenseTensor& dout,
-                               int64_t max_seqlen_q,
-                               int64_t max_seqlen_k,
-                               float scale,
-                               float dropout,
-                               bool causal,
-                               DenseTensor* dq,
-                               DenseTensor* dk,
-                               DenseTensor* dv) {
-#ifdef PADDLE_WITH_FLASHATTN
-  const cudaStream_t stream = ctx.stream();
-
-  auto dims = q.dims();
-  int64_t total_q = dims[0];
-  int64_t num_heads = dims[1];
-  int64_t head_size = dims[2];
-
-  int64_t total_k = k.dims()[0];
-  int64_t batch_size = cu_seqlens_q.numel() - 1;
-
-  PADDLE_ENFORCE_NE(causal,
-                    true,
-                    phi::errors::InvalidArgument(
-                        "attn_mask is not nullptr, causal can not be true"));
-
-  PADDLE_ENFORCE_EQ(
-      head_size == 32 || head_size == 64 || head_size == 128,
-      true,
-      phi::errors::InvalidArgument("The head_dim is expected to be either 32, "
-                                   "64, or 128, but recieved %d.",
-                                   head_size));
-  const int64_t* seed_offset_data = seed_offset.data<int64_t>();
-  uint64_t seed = static_cast<uint64_t>(seed_offset_data[0]);
-  uint64_t offset = static_cast<uint64_t>(seed_offset_data[1]);
-  VLOG(10) << "FlashAttn bwd seed: " << seed << ", offset: " << offset;
-
-  int64_t seqlen_q = ((max_seqlen_q + 16 - 1) / 16) * 16;
-  DenseTensor dsoftmax = Empty<float>(ctx, {batch_size, num_heads, seqlen_q});
-
-  const DenseTensor* attn_mask_tensor = attn_mask.get_ptr();
-  std::vector<int64_t> mask_dims = GetAttnMaskDims(attn_mask_tensor);
-
-  int fa_num_splits = 0;
-  bool fa_is_bf16 = q.dtype() == DataType::BFLOAT16;
-  float fa_with_mask_scale = 1.0f;
-  bool fa_zero_tensors = false;
-
-  uint64_t workspace_size;
-
-  int64_t q_size = total_q * num_heads * head_size;
-  DenseTensor scaled_q = Empty<T>(ctx, {total_q, num_heads, head_size});
-  ComputeScaleQ(ctx, q_size, scale, q.data<T>(), scaled_q.data<T>());
-
-  bool succ = phi::dynload::flash_attn_bwd_with_bias_and_mask(
-      static_cast<const void*>(scaled_q.data<T>()),
-      static_cast<const void*>(k.data()),
-      static_cast<const void*>(v.data()),
-      static_cast<void*>(dq->data()),
-      static_cast<void*>(dk->data()),
-      static_cast<void*>(dv->data()),
-      nullptr,  // set out to nullptr to calculate workspace size
-      dout.data(),
-      static_cast<const int32_t*>(cu_seqlens_q.data()),
-      static_cast<const int32_t*>(cu_seqlens_k.data()),
-      total_q,
-      total_k,
-      batch_size,
-      num_heads,
-      head_size,
-      max_seqlen_q,
-      max_seqlen_k,
-      dropout,
-      fa_with_mask_scale,
-      fa_zero_tensors,
-      fa_is_bf16,
-      fa_num_splits,
-      static_cast<const void*>(softmax_lse.data()),
-      static_cast<void*>(dsoftmax.data()),
-      nullptr,
-      nullptr,
-      &workspace_size,
-      stream,
-      seed,
-      offset,
-      attn_mask_tensor ? attn_mask_tensor->data() : nullptr,
-      nullptr,
-      mask_dims.data() ? mask_dims.data() : nullptr,
-      nullptr);
-  CheckFlashAttnStatus(succ);
-  DenseTensor workspace;
-  if (workspace_size > 0) {
-    workspace = Empty<float>(
-        ctx, {static_cast<int64_t>(workspace_size / sizeof(float))});
-  }
-
-  succ = phi::dynload::flash_attn_bwd_with_bias_and_mask(
-      static_cast<const void*>(scaled_q.data<T>()),
-      static_cast<const void*>(k.data()),
-      static_cast<const void*>(v.data()),
-      static_cast<void*>(dq->data()),
-      static_cast<void*>(dk->data()),
-      static_cast<void*>(dv->data()),
-      out.data(),  // set out to nullptr to calculate workspace size
-      dout.data(),
-      static_cast<const int32_t*>(cu_seqlens_q.data()),
-      static_cast<const int32_t*>(cu_seqlens_k.data()),
-      total_q,
-      total_k,
-      batch_size,
-      num_heads,
-      head_size,
-      max_seqlen_q,
-      max_seqlen_k,
-      dropout,
-      fa_with_mask_scale,
-      fa_zero_tensors,
-      fa_is_bf16,
-      fa_num_splits,
-      static_cast<const void*>(softmax_lse.data()),
-      static_cast<void*>(dsoftmax.data()),
-      nullptr,
-      workspace_size > 0 ? workspace.data() : nullptr,
-      &workspace_size,
-      stream,
-      seed,
-      offset,
-      attn_mask_tensor ? attn_mask_tensor->data() : nullptr,
-      nullptr,
-      mask_dims.data() ? mask_dims.data() : nullptr,
-      nullptr);
-  CheckFlashAttnStatus(succ);
-
-  ComputeScaleQ(ctx, q_size, scale, dq->data<T>(), dq->data<T>());
-#else
-  RaiseNotSupportedError();
-#endif
-}
-
 template <typename T, typename Context>
 void FlashAttnUnpaddedGradKernel(const Context& ctx,
                                  const DenseTensor& q,
@@ -212,95 +63,77 @@ void FlashAttnUnpaddedGradKernel(const Context& ctx,
   // q,k,v [total_*, num_heads, head_dim]
   auto dims = q.dims();
 
-  if (attn_mask.get_ptr()) {
-    FlashAttnUnpaddedGradImpl<T, Context>(ctx,
-                                          q,
-                                          k,
-                                          v,
-                                          cu_seqlens_q,
-                                          cu_seqlens_k,
-                                          out,
-                                          softmax_lse,
-                                          seed_offset,
-                                          attn_mask,
-                                          dout,
-                                          max_seqlen_q,
-                                          max_seqlen_k,
-                                          scale,
-                                          dropout,
-                                          causal,
-                                          dq,
-                                          dk,
-                                          dv);
-  } else {
-    const int64_t total_q = dims[0];
-    const int64_t batch_size = cu_seqlens_q.numel() - 1;
-    const int64_t num_heads = dims[1];
-    const int64_t head_size_og = dout.dims()[2];
-    const int64_t head_size = dims[2];
-    const int64_t total_k = k.dims()[0];
-    const int64_t num_heads_k = k.dims()[1];
+  const int64_t total_q = dims[0];
+  const int64_t batch_size = cu_seqlens_q.numel() - 1;
+  const int64_t num_heads = dims[1];
+  const int64_t head_size_og = dout.dims()[2];
+  const int64_t head_size = dims[2];
+  const int64_t total_k = k.dims()[0];
+  const int64_t num_heads_k = k.dims()[1];
 
-    int num_splits = get_num_split();
+  int num_splits = get_num_split();
 
-    // TODO(umiswing): add shape check
-    PADDLE_ENFORCE_EQ(
-        head_size_og,
-        head_size,
-        phi::errors::InvalidArgument(
-            "flash_attn_bwd receive input with head_size_og == head_size"));
-
-    FlashAttnBwdParamsV2 params =
-        FlashAttnBwdParamsV2(ctx,
-                             batch_size,
-                             max_seqlen_q,
-                             max_seqlen_k,
-                             num_heads,
-                             num_heads_k,
-                             head_size,
-                             dropout,
-                             scale,
-                             causal,
-                             q.dtype(),
-                             seed_offset.data<int64_t>());
-
-    VLOG(10) << "FlashAttn bwd seed: " << params.seed
-             << ", offset: " << params.offset;
+  // TODO(umiswing): add shape check
+  PADDLE_ENFORCE_EQ(
+      head_size_og,
+      head_size,
+      phi::errors::InvalidArgument(
+          "flash_attn_bwd receive input with head_size_og == head_size"));
 
-    bool succ =
-        phi::dynload::flash_attn_varlen_bwd(dout.data(),
-                                            q.data(),
-                                            k.data(),
-                                            v.data(),
-                                            out.data(),
-                                            params.softmax_d.data(),
-                                            softmax_lse.data(),
-                                            cu_seqlens_q.data<int32_t>(),
-                                            cu_seqlens_k.data<int32_t>(),
-                                            params.rng_state.data(),
-                                            dq->data(),
-                                            dk->data(),
-                                            dv->data(),
-                                            params.dq_accum.data(),
-                                            params.batch_size,
-                                            params.max_seqlen_q,
-                                            params.max_seqlen_k,
-                                            params.seqlen_q_rounded,
-                                            params.seqlen_k_rounded,
-                                            params.num_heads,
-                                            params.num_heads_k,
-                                            params.head_size,
-                                            params.head_size_rounded,
-                                            params.dropout,
-                                            params.scale,
-                                            params.causal,
-                                            params.is_bf16,
-                                            num_splits,
-                                            stream,
-                                            params.seed,
-                                            params.offset);
-    CheckFlashAttnStatus(succ);
-  }
+  FlashAttnBwdParamsV2 params =
+      FlashAttnBwdParamsV2(ctx,
+                           batch_size,
+                           max_seqlen_q,
+                           max_seqlen_k,
+                           num_heads,
+                           num_heads_k,
+                           head_size,
+                           dropout,
+                           scale,
+                           causal,
+                           q.dtype(),
+                           attn_mask,
+                           seed_offset.data<int64_t>());
+
+  VLOG(10) << "FlashAttn bwd seed: " << params.seed
+           << ", offset: " << params.offset;
+
+  bool succ = phi::dynload::flash_attn_varlen_bwd(
+      dout.data(),
+      q.data(),
+      k.data(),
+      v.data(),
+      out.data(),
+      params.softmax_d.data(),
+      softmax_lse.data(),
+      cu_seqlens_q.data<int32_t>(),
+      cu_seqlens_k.data<int32_t>(),
+      params.rng_state.data(),
+      dq->data(),
+      dk->data(),
+      dv->data(),
+      params.dq_accum.data(),
+      params.batch_size,
+      params.max_seqlen_q,
+      params.max_seqlen_k,
+      params.seqlen_q_rounded,
+      params.seqlen_k_rounded,
+      params.num_heads,
+      params.num_heads_k,
+      params.head_size,
+      params.head_size_rounded,
+      params.dropout,
+      params.scale,
+      1.0f / params.scale,
+      params.causal,
+      params.is_bf16,
+      num_splits,
+      stream,
+      params.seed,
+      params.offset,
+      params.attn_mask_tensor ? params.attn_mask_tensor->data() : nullptr,
+      params.mask_dims.data());
+  CheckFlashAttnStatus(succ);
 #else
   RaiseNotSupportedError();
 #endif
@@ -347,95 +180,67 @@ void FlashAttnGradKernel(const Context& ctx,
            << "], v[" << v.dims() << "]";
 
   const float scale = 1.0f / std::sqrt(head_size);
-  if (attn_mask.get_ptr()) {
-    DenseTensor q_t_s, k_t_s, v_t_s;
-    q_t_s.ShareDataWith(q).Resize({total_q, num_heads, head_size});
-    k_t_s.ShareDataWith(k).Resize({total_k, num_heads, head_size});
-    v_t_s.ShareDataWith(v).Resize({total_k, num_heads, head_size});
-
-    DenseTensor cu_seqlens_q;
-    DenseTensor cu_seqlens_k;
-    ArangeNullaryKernel<int32_t, Context>(
-        ctx, 0, (batch_size + 1) * seqlen_q, seqlen_q, &cu_seqlens_q);
-    ArangeNullaryKernel<int32_t, Context>(
-        ctx, 0, (batch_size + 1) * seqlen_k, seqlen_k, &cu_seqlens_k);
 
-    FlashAttnUnpaddedGradKernel<T, Context>(ctx,
-                                            q_t_s,
-                                            k_t_s,
-                                            v_t_s,
-                                            cu_seqlens_q,
-                                            cu_seqlens_k,
-                                            out,
-                                            softmax_lse,
-                                            seed_offset,
-                                            attn_mask,
-                                            dout,
-                                            seqlen_q,
-                                            seqlen_k,
-                                            scale,
-                                            dropout,
-                                            causal,
-                                            dq,
-                                            dk,
-                                            dv);
-  } else {
-    FlashAttnBwdParamsV2 params =
-        FlashAttnBwdParamsV2(ctx,
-                             batch_size,
-                             seqlen_q,
-                             seqlen_k,
-                             num_heads,
-                             num_heads_k,
-                             head_size,
-                             dropout,
-                             scale,
-                             causal,
-                             q.dtype(),
-                             seed_offset.data<int64_t>());
+  FlashAttnBwdParamsV2 params =
+      FlashAttnBwdParamsV2(ctx,
+                           batch_size,
+                           seqlen_q,
+                           seqlen_k,
+                           num_heads,
+                           num_heads_k,
+                           head_size,
+                           dropout,
+                           scale,
+                           causal,
+                           q.dtype(),
+                           attn_mask,
+                           seed_offset.data<int64_t>());
 
-    ctx.template Alloc<T>(dq);
-    ctx.template Alloc<T>(dk);
-    ctx.template Alloc<T>(dv);
+  ctx.template Alloc<T>(dq);
+  ctx.template Alloc<T>(dk);
+  ctx.template Alloc<T>(dv);
 
-    cudaStream_t stream = ctx.stream();
+  cudaStream_t stream = ctx.stream();
 
-    VLOG(10) << "FlashAttn bwd seed: " << params.seed
-             << ", offset: " << params.offset;
+  VLOG(10) << "FlashAttn bwd seed: " << params.seed
+           << ", offset: " << params.offset;
 
-    int num_splits = get_num_split();
+  int num_splits = get_num_split();
 
-    bool succ = phi::dynload::flash_attn_bwd(dout.data(),
-                                             q.data(),
-                                             k.data(),
-                                             v.data(),
-                                             out.data(),
-                                             params.softmax_d.data(),
-                                             softmax_lse.data(),
-                                             params.rng_state.data(),
-                                             dq->data(),
-                                             dk->data(),
-                                             dv->data(),
-                                             params.dq_accum.data(),
-                                             params.batch_size,
-                                             params.max_seqlen_q,
-                                             params.max_seqlen_k,
-                                             params.seqlen_q_rounded,
-                                             params.seqlen_k_rounded,
-                                             params.num_heads,
-                                             params.num_heads_k,
-                                             params.head_size,
-                                             params.head_size_rounded,
-                                             params.dropout,
-                                             params.scale,
-                                             params.causal,
-                                             params.is_bf16,
-                                             num_splits,
-                                             stream,
-                                             params.seed,
-                                             params.offset);
-    CheckFlashAttnStatus(succ);
-  }
+  bool succ = phi::dynload::flash_attn_bwd(
+      dout.data(),
+      q.data(),
+      k.data(),
+      v.data(),
+      out.data(),
+      params.softmax_d.data(),
+      softmax_lse.data(),
+      params.rng_state.data(),
+      dq->data(),
+      dk->data(),
+      dv->data(),
+      params.dq_accum.data(),
+      params.batch_size,
+      params.max_seqlen_q,
+      params.max_seqlen_k,
+      params.seqlen_q_rounded,
+      params.seqlen_k_rounded,
+      params.num_heads,
+      params.num_heads_k,
+      params.head_size,
+      params.head_size_rounded,
+      params.dropout,
+      params.scale,
+      std::sqrt(head_size),  // for unscale
+      params.causal,
+      params.is_bf16,
+      num_splits,
+      stream,
+      params.seed,
+      params.offset,
+      params.attn_mask_tensor ? params.attn_mask_tensor->data() : nullptr,
+      params.mask_dims.data());
+  CheckFlashAttnStatus(succ);
 #else
   RaiseNotSupportedError();
 #endif
diff --git a/paddle/phi/kernels/gpu/flash_attn_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_kernel.cu
index 4bd5e28c09fed..aadae0f29c342 100644
--- a/paddle/phi/kernels/gpu/flash_attn_kernel.cu
+++ b/paddle/phi/kernels/gpu/flash_attn_kernel.cu
@@ -28,169 +28,6 @@ PD_DECLARE_bool(cudnn_deterministic);
 
 namespace phi {
 
-template <typename T, typename Context>
-void FlashAttnWithMaskUnpaddedImpl(
-    const Context& ctx,
-    const DenseTensor& q,
-    const DenseTensor& k,
-    const DenseTensor& v,
-    const DenseTensor& cu_seqlens_q,
-    const DenseTensor& cu_seqlens_k,
-    const paddle::optional<DenseTensor>& fixed_seed_offset,
-    const paddle::optional<DenseTensor>& attn_mask,
-    int64_t max_seqlen_q,
-    int64_t max_seqlen_k,
-    float scale,
-    float dropout,
-    bool causal,
-    bool return_softmax,
-    bool is_test,
-    const std::string& rng_name,
-    DenseTensor* out,
-    DenseTensor* softmax,
-    DenseTensor* softmax_lse,
-    DenseTensor* seed_offset) {
-#ifdef PADDLE_WITH_FLASHATTN
-  cudaStream_t stream = ctx.stream();
-
-  auto dims = q.dims();
-  int64_t total_q = dims[0];
-  int64_t num_heads = dims[1];
-  int64_t head_size = dims[2];
-
-  int64_t total_k = k.dims()[0];
-  int64_t batch_size = cu_seqlens_q.numel() - 1;
-
-  PADDLE_ENFORCE_NE(causal,
-                    true,
-                    phi::errors::InvalidArgument(
-                        "attn_mask is not nullptr, causal can not be true"));
-
-  PADDLE_ENFORCE_EQ(
-      head_size == 32 || head_size == 64 || head_size == 128,
-      true,
-      phi::errors::InvalidArgument("The head_dim is expected to be either 32, "
-                                   "64, or 128, but recieved %d.",
-                                   head_size));
-
-  // Generate random state for dropout and save for recompute in grad.
-  auto seed_offset_pair =
-      GenerateRNGState(ctx, fixed_seed_offset, rng_name, batch_size, num_heads);
-  uint64_t seed = seed_offset_pair.first;
-  uint64_t offset = seed_offset_pair.second;
-
-  VLOG(10) << "FlashAttn fwd seed: " << seed << ", offset: " << offset;
-
-  seed_offset->Resize({2});
-  int64_t* seed_offset_data = ctx.template HostAlloc<int64_t>(seed_offset);
-  seed_offset_data[0] = static_cast<int64_t>(seed);
-  seed_offset_data[1] = static_cast<int64_t>(offset);
-
-  // Allocate memory for softmax_lse and softmax.
-  int64_t seqlen_q = ((max_seqlen_q + 16 - 1) / 16) * 16;
-
-  softmax_lse->Resize({batch_size, num_heads, seqlen_q});
-  ctx.template Alloc<float>(softmax_lse);
-
-  if (return_softmax) {
-    // may allocate more space than *max_seqlen_k*
-    int64_t blocksize_c = head_size > 64 ? 128 : 256;
-    int64_t seqlen_k =
-        ((max_seqlen_k + blocksize_c - 1) / blocksize_c) * blocksize_c;
-    if (max_seqlen_k <= 128) {
-      seqlen_k = 128;
-    } else if (max_seqlen_k <= 256) {
-      seqlen_k = 256;
-    }
-    softmax->Resize({batch_size, num_heads, seqlen_q, seqlen_k});
-    ctx.template Alloc<T>(softmax);
-  }
-
-  // Compute scale Q
-  int64_t q_size = total_q * num_heads * head_size;
-  DenseTensor scaled_q = Empty<T>(ctx, {total_q, num_heads, head_size});
-  ComputeScaleQ(ctx, q_size, scale, q.data<T>(), scaled_q.data<T>());
-
-  const DenseTensor* attn_mask_tensor = attn_mask.get_ptr();
-  std::vector<int64_t> mask_dims = GetAttnMaskDims(attn_mask_tensor);
-
-  int fa_num_splits = 0;
-  bool fa_is_bf16 = q.dtype() == DataType::BFLOAT16;
-  float fa_with_mask_scale = 1.0f;
-  bool fa_zero_tensors = false;
-
-  uint64_t workspace_size = 0;
-  bool succ = phi::dynload::flash_attn_fwd_with_bias_and_mask(
-      static_cast<const void*>(scaled_q.data()),
-      static_cast<const void*>(k.data()),
-      static_cast<const void*>(v.data()),
-      nullptr,  // for calculation workspace size
-      static_cast<const int32_t*>(cu_seqlens_q.data()),
-      static_cast<const int32_t*>(cu_seqlens_k.data()),
-      total_q,
-      total_k,
-      batch_size,
-      num_heads,
-      head_size,
-      max_seqlen_q,
-      max_seqlen_k,
-      dropout,
-      fa_with_mask_scale,
-      fa_zero_tensors,
-      fa_is_bf16,
-      fa_num_splits,
-      softmax_lse->data(),
-      nullptr,
-      &workspace_size,
-      stream,
-      seed,
-      offset,
-      attn_mask_tensor ? attn_mask_tensor->data() : nullptr,
-      nullptr,
-      mask_dims.data() ? mask_dims.data() : nullptr,
-      nullptr);
-  CheckFlashAttnStatus(succ);
-
-  DenseTensor workspace;
-  if (workspace_size > 0) {
-    workspace = Empty<float>(
-        ctx, {static_cast<int64_t>(workspace_size / sizeof(float))});
-  }
-  succ = phi::dynload::flash_attn_fwd_with_bias_and_mask(
-      static_cast<const void*>(scaled_q.data()),
-      k.data(),
-      v.data(),
-      out->data(),  // set out to nullptr to calculate workspace size
-      static_cast<const int32_t*>(cu_seqlens_q.data()),
-      static_cast<const int32_t*>(cu_seqlens_k.data()),
-      total_q,
-      total_k,
-      batch_size,
-      num_heads,
-      head_size,
-      max_seqlen_q,
-      max_seqlen_k,
-      dropout,
-      fa_with_mask_scale,
-      fa_zero_tensors,
-      fa_is_bf16,
-      fa_num_splits,
-      softmax_lse->data(),
-      workspace_size > 0 ? workspace.data() : nullptr,
-      &workspace_size,
-      stream,
-      seed,
-      offset,
-      attn_mask_tensor ? attn_mask_tensor->data() : nullptr,
-      nullptr,
-      mask_dims.data() ? mask_dims.data() : nullptr,
-      nullptr);
-  CheckFlashAttnStatus(succ);
-#else
-  RaiseNotSupportedError();
-#endif
-}
-
 template <typename T, typename Context>
 void FlashAttnUnpaddedKernel(
     const Context& ctx,
@@ -226,95 +63,70 @@ void FlashAttnUnpaddedKernel(
       phi::errors::InvalidArgument("flash_attn_raw receive input with dim "
                                    "[total_seq_len, num_heads, head_dim]"));
 
-  if (attn_mask.get_ptr()) {
-    FlashAttnWithMaskUnpaddedImpl<T, Context>(ctx,
-                                              q,
-                                              k,
-                                              v,
-                                              cu_seqlens_q,
-                                              cu_seqlens_k,
-                                              fixed_seed_offset,
-                                              attn_mask,
-                                              max_seqlen_q,
-                                              max_seqlen_k,
-                                              scale,
-                                              dropout,
-                                              causal,
-                                              return_softmax,
-                                              is_test,
-                                              rng_name,
-                                              out,
-                                              softmax,
-                                              softmax_lse,
-                                              seed_offset);
-  } else {
-    const int64_t total_q = dims[0];
-    const int64_t num_heads = dims[1];
-    const int64_t head_size = dims[2];
-
-    const int64_t total_k = k.dims()[0];
-    const int64_t num_heads_k = k.dims()[1];
-    const int64_t batch_size = cu_seqlens_q.numel() - 1;
-
-    // TODO(umiswing): add deterministic in fa2.
-    // int num_splits = 0;  // 0 for an internal heuristic, which is optimal
-    // if (FLAGS_cudnn_deterministic) {
-    //   num_splits = 1;
-    // }
-
-    // TODO(umiswing): add shape check
-
-    FlashAttnFwdParamsV2<T> params = FlashAttnFwdParamsV2<T>(ctx,
-                                                             batch_size,
-                                                             max_seqlen_q,
-                                                             max_seqlen_k,
-                                                             num_heads,
-                                                             num_heads_k,
-                                                             head_size,
-                                                             dropout,
-                                                             scale,
-                                                             causal,
-                                                             return_softmax,
-                                                             q.dtype(),
-                                                             is_test,
-                                                             rng_name,
-                                                             fixed_seed_offset,
-                                                             softmax,
-                                                             softmax_lse,
-                                                             seed_offset);
-
-    VLOG(10) << "FlashAttn fwd seed: " << params.seed
-             << ", offset: " << params.offset;
-
-    bool succ = phi::dynload::flash_attn_varlen_fwd(
-        q.data(),
-        k.data(),
-        v.data(),
-        cu_seqlens_q.data<int32_t>(),
-        cu_seqlens_k.data<int32_t>(),
-        params.rng_state.data(),
-        out->data(),
-        params.return_softmax ? softmax->data() : nullptr,
-        softmax_lse->data(),
-        params.batch_size,
-        params.max_seqlen_q,
-        params.max_seqlen_k,
-        params.seqlen_q_rounded,
-        params.seqlen_k_rounded,
-        params.num_heads,
-        params.num_heads_k,
-        params.head_size,
-        params.head_size_rounded,
-        params.dropout,
-        params.scale,
-        params.causal,
-        params.return_softmax,
-        params.is_bf16,
-        stream,
-        params.seed,
-        params.offset);
-    CheckFlashAttnStatus(succ);
-  }
+  const int64_t total_q = dims[0];
+  const int64_t num_heads = dims[1];
+  const int64_t head_size = dims[2];
+
+  const int64_t total_k = k.dims()[0];
+  const int64_t num_heads_k = k.dims()[1];
+  const int64_t batch_size = cu_seqlens_q.numel() - 1;
+
+  // TODO(umiswing): add shape check
+
+  FlashAttnFwdParamsV2<T> params = FlashAttnFwdParamsV2<T>(ctx,
+                                                           batch_size,
+                                                           max_seqlen_q,
+                                                           max_seqlen_k,
+                                                           num_heads,
+                                                           num_heads_k,
+                                                           head_size,
+                                                           dropout,
+                                                           scale,
+                                                           causal,
+                                                           return_softmax,
+                                                           q.dtype(),
+                                                           is_test,
+                                                           rng_name,
+                                                           fixed_seed_offset,
+                                                           attn_mask,
+                                                           softmax,
+                                                           softmax_lse,
+                                                           seed_offset);
+
+  VLOG(10) << "FlashAttn fwd seed: " << params.seed
+           << ", offset: " << params.offset;
+
+  bool succ = phi::dynload::flash_attn_varlen_fwd(
+      q.data(),
+      k.data(),
+      v.data(),
+      cu_seqlens_q.data<int32_t>(),
+      cu_seqlens_k.data<int32_t>(),
+      params.rng_state.data(),
+      out->data(),
+      params.return_softmax ? softmax->data() : nullptr,
+      softmax_lse->data(),
+      params.batch_size,
+      params.max_seqlen_q,
+      params.max_seqlen_k,
+      params.seqlen_q_rounded,
+      params.seqlen_k_rounded,
+      params.num_heads,
+      params.num_heads_k,
+      params.head_size,
+      params.head_size_rounded,
+      params.dropout,
+      params.scale,
+      1.0f / params.scale,
+      params.causal,
+      params.return_softmax,
+      params.is_bf16,
+      stream,
+      params.seed,
+      params.offset,
+      params.attn_mask_tensor ? params.attn_mask_tensor->data() : nullptr,
+      params.mask_dims.data());
+  CheckFlashAttnStatus(succ);
 #else
   RaiseNotSupportedError();
 #endif
@@ -359,94 +171,64 @@ void FlashAttnKernel(const Context& ctx,
 
   const float scale = 1.0f / std::sqrt(head_size);
 
-  if (attn_mask.get_ptr()) {
-    DenseTensor q_t_s, k_t_s, v_t_s;
-    q_t_s.ShareDataWith(q).Resize({total_q, num_heads, head_size});
-    k_t_s.ShareDataWith(k).Resize({total_k, num_heads, head_size});
-    v_t_s.ShareDataWith(v).Resize({total_k, num_heads, head_size});
-
-    DenseTensor cu_seqlens_q;
-    DenseTensor cu_seqlens_k;
-    ArangeNullaryKernel<int32_t, Context>(
-        ctx, 0, (batch_size + 1) * seqlen_q, seqlen_q, &cu_seqlens_q);
-    ArangeNullaryKernel<int32_t, Context>(
-        ctx, 0, (batch_size + 1) * seqlen_k, seqlen_k, &cu_seqlens_k);
-
-    FlashAttnUnpaddedKernel<T, Context>(ctx,
-                                        q_t_s,
-                                        k_t_s,
-                                        v_t_s,
-                                        cu_seqlens_q,
-                                        cu_seqlens_k,
-                                        fixed_seed_offset,
-                                        attn_mask,
-                                        seqlen_q,
-                                        seqlen_k,
-                                        scale,
-                                        dropout,
-                                        causal,
-                                        return_softmax,
-                                        is_test,
-                                        rng_name,
-                                        out,
-                                        softmax,
-                                        softmax_lse,
-                                        seed_offset);
-  } else {
-    FlashAttnFwdParamsV2<T> params = FlashAttnFwdParamsV2<T>(ctx,
-                                                             batch_size,
-                                                             seqlen_q,
-                                                             seqlen_k,
-                                                             num_heads,
-                                                             num_heads_k,
-                                                             head_size,
-                                                             dropout,
-                                                             scale,
-                                                             causal,
-                                                             return_softmax,
-                                                             q.dtype(),
-                                                             is_test,
-                                                             rng_name,
-                                                             fixed_seed_offset,
-                                                             softmax,
-                                                             softmax_lse,
-                                                             seed_offset);
+  FlashAttnFwdParamsV2<T> params = FlashAttnFwdParamsV2<T>(ctx,
+                                                           batch_size,
+                                                           seqlen_q,
+                                                           seqlen_k,
+                                                           num_heads,
+                                                           num_heads_k,
+                                                           head_size,
+                                                           dropout,
+                                                           scale,
+                                                           causal,
+                                                           return_softmax,
+                                                           q.dtype(),
+                                                           is_test,
+                                                           rng_name,
+                                                           fixed_seed_offset,
+                                                           attn_mask,
+                                                           softmax,
+                                                           softmax_lse,
+                                                           seed_offset);
+
+  VLOG(10) << "FlashAttn fwd dims: q[" << q.dims() << "], k[" << k.dims()
+           << "], v[" << v.dims() << "]";
+  VLOG(10) << "FlashAttn fwd seed: " << params.seed
+           << ", offset: " << params.offset;
 
-    VLOG(10) << "FlashAttn fwd dims: q[" << q.dims() << "], k[" << k.dims()
-             << "], v[" << v.dims() << "]";
-    VLOG(10) << "FlashAttn fwd seed: " << params.seed
-             << ", offset: " << params.offset;
+  ctx.template Alloc<T>(out);
 
-    ctx.template Alloc<T>(out);
+  cudaStream_t stream = ctx.stream();
 
-    cudaStream_t stream = ctx.stream();
-    bool succ = phi::dynload::flash_attn_fwd(
-        q.data(),
-        k.data(),
-        v.data(),
-        params.rng_state.data(),
-        out->data(),
-        params.return_softmax ? params.softmax->data() : nullptr,
-        params.softmax_lse->data(),
-        params.batch_size,
-        params.max_seqlen_q,
-        params.max_seqlen_k,
-        params.seqlen_q_rounded,
-        params.seqlen_k_rounded,
-        params.num_heads,
-        params.num_heads_k,
-        params.head_size,
-        params.head_size_rounded,
-        params.dropout,
-        params.scale,
-        params.causal,
-        params.return_softmax,
-        params.is_bf16,
-        stream,
-        params.seed,
-        params.offset);
-    CheckFlashAttnStatus(succ);
-  }
+  bool succ = phi::dynload::flash_attn_fwd(
+      q.data(),
+      k.data(),
+      v.data(),
+      params.rng_state.data(),
+      out->data(),
+      params.return_softmax ? params.softmax->data() : nullptr,
+      params.softmax_lse->data(),
+      params.batch_size,
+      params.max_seqlen_q,
+      params.max_seqlen_k,
+      params.seqlen_q_rounded,
+      params.seqlen_k_rounded,
+      params.num_heads,
+      params.num_heads_k,
+      params.head_size,
+      params.head_size_rounded,
+      params.dropout,
+      params.scale,
+      std::sqrt(head_size),  // for unscale
+      params.causal,
+      params.return_softmax,
+      params.is_bf16,
+      stream,
+      params.seed,
+      params.offset,
+      params.attn_mask_tensor ? params.attn_mask_tensor->data() : nullptr,
+      params.mask_dims.data());
+  CheckFlashAttnStatus(succ);
 #else
   RaiseNotSupportedError();
 #endif
diff --git a/paddle/phi/kernels/gpu/flash_attn_utils.h b/paddle/phi/kernels/gpu/flash_attn_utils.h
index 03601dae16db1..ea438014f4312 100644
--- a/paddle/phi/kernels/gpu/flash_attn_utils.h
+++ b/paddle/phi/kernels/gpu/flash_attn_utils.h
@@ -51,6 +51,32 @@ static std::pair<uint64_t, uint64_t> GenerateRNGState(
   }
 }
 
+static std::vector<int64_t> GetAttnMaskDims(const DenseTensor* attn_mask) {
+  std::vector<int64_t> mask_dim_4d;
+  if (attn_mask) {
+    const auto& origin_dims = attn_mask->dims();
+    auto rank = origin_dims.size();
+    PADDLE_ENFORCE_GE(
+        rank,
+        4,
+        phi::errors::InvalidArgument(
+            "The number of dimenstions of attn_mask is expected to be greater "
+            "or equal to 4, but recieved %d. The shape of attn_mask is {%s}",
+            rank,
+            origin_dims));
+
+    int64_t first_dim = 1;
+    for (int i = 0; i < rank - 3; i++) {
+      first_dim *= origin_dims[i];
+    }
+    mask_dim_4d = {first_dim,
+                   origin_dims[rank - 3],
+                   origin_dims[rank - 2],
+                   origin_dims[rank - 1]};
+  }
+  return mask_dim_4d;
+}
+
 template <typename T>
 struct FlashAttnFwdParamsV2 {
   int batch_size;
@@ -71,7 +97,9 @@ struct FlashAttnFwdParamsV2 {
   bool is_bf16;
   uint64_t seed;
   uint64_t offset;
+  std::vector<int64_t> mask_dims;
   DenseTensor rng_state;
+  const DenseTensor* attn_mask_tensor;
   DenseTensor* softmax;
   DenseTensor* softmax_lse;
   DenseTensor* seed_offset;
@@ -91,6 +119,7 @@ struct FlashAttnFwdParamsV2 {
                        const bool is_test,
                        const std::string& rng_name,
                        const paddle::optional<DenseTensor>& fixed_seed_offset,
+                       const paddle::optional<DenseTensor>& attn_mask,
                        DenseTensor* _softmax,
                        DenseTensor* _softmax_lse,
                        DenseTensor* _seed_offset)
@@ -106,7 +135,8 @@ struct FlashAttnFwdParamsV2 {
         return_softmax(_return_softmax),
         softmax(_softmax),
         softmax_lse(_softmax_lse),
-        seed_offset(_seed_offset) {
+        seed_offset(_seed_offset),
+        attn_mask_tensor(attn_mask.get_ptr()) {
     dropout = is_test ? 0.0f : _dropout;
     is_bf16 = q_dtype == DataType::BFLOAT16;
 
@@ -133,10 +163,25 @@ struct FlashAttnFwdParamsV2 {
     ctx.template Alloc<float>(softmax_lse);
 
     if (return_softmax) {
+      PADDLE_ENFORCE_EQ(
+          dropout > 0.0f,
+          true,
+          phi::errors::InvalidArgument(
+              "return_softmax is only supported when dropout > 0.0"));
+
       softmax->Resize(
           {batch_size, num_heads, seqlen_q_rounded, seqlen_k_rounded});
       ctx.template Alloc<T>(softmax);
     }
+
+    mask_dims = GetAttnMaskDims(attn_mask_tensor);
+    if (attn_mask) {
+      PADDLE_ENFORCE_EQ(
+          attn_mask->dtype(),
+          q_dtype,
+          phi::errors::InvalidArgument(
+              "attn_mask is expected to have the same data type with q."));
+    }
   }
 };
 
@@ -156,9 +201,11 @@ struct FlashAttnBwdParamsV2 {
   bool is_bf16;
   uint64_t seed;
   uint64_t offset;
+  std::vector<int64_t> mask_dims;
   DenseTensor softmax_d;
   DenseTensor dq_accum;
   DenseTensor rng_state;
+  const DenseTensor* attn_mask_tensor;
 
   FlashAttnBwdParamsV2(const GPUContext& ctx,
                        const int _batch_size,
@@ -171,6 +218,7 @@ struct FlashAttnBwdParamsV2 {
                        const float _scale,
                        const bool _causal,
                        const DataType q_dtype,
+                       const paddle::optional<DenseTensor>& attn_mask,
                        const int64_t* seed_offset_data)
       : batch_size(_batch_size),
         max_seqlen_q(_max_seqlen_q),
@@ -180,7 +228,8 @@ struct FlashAttnBwdParamsV2 {
         head_size(_head_size),
         dropout(_dropout),
         scale(_scale),
-        causal(_causal) {
+        causal(_causal),
+        attn_mask_tensor(attn_mask.get_ptr()) {
     is_bf16 = q_dtype == DataType::BFLOAT16;
     seed = static_cast<uint64_t>(seed_offset_data[0]);
     offset = static_cast<uint64_t>(seed_offset_data[1]);
@@ -198,6 +247,15 @@ struct FlashAttnBwdParamsV2 {
     softmax_d = Empty<float>(ctx, {batch_size, num_heads, seqlen_q_rounded});
     dq_accum = Empty<float>(
         ctx, {batch_size, num_heads, seqlen_q_rounded, head_size_rounded});
+
+    mask_dims = GetAttnMaskDims(attn_mask_tensor);
+    if (attn_mask) {
+      PADDLE_ENFORCE_EQ(
+          attn_mask->dtype(),
+          q_dtype,
+          phi::errors::InvalidArgument(
+              "attn_mask is expected to have the same data type with q."));
+    }
   }
 };
 
@@ -229,31 +287,6 @@ void ComputeScaleQ(
                      ctx.stream()>>>(input, numel, scale, output);
 }
 
-static std::vector<int64_t> GetAttnMaskDims(const DenseTensor* attn_mask) {
-  std::vector<int64_t> mask_dim_4d;
-  if (attn_mask) {
-    const auto& origin_dims = attn_mask->dims();
-    auto rank = origin_dims.size();
-    PADDLE_ENFORCE_GE(
-        rank,
-        4,
-        phi::errors::InvalidArgument(
-            "The number of dimenstions of attn_mask is expected to be greater "
-            "or equal to 4, but recieved %d. The shape of attn_mask is {%s}",
-            rank,
-            origin_dims));
-
-    int64_t first_dim = 1;
-    for (int i = 0; i < rank - 3; i++) {
-      first_dim *= origin_dims[i];
-    }
-    mask_dim_4d = {first_dim,
-                   origin_dims[rank - 3],
-                   origin_dims[rank - 2],
-                   origin_dims[rank - 1]};
-  }
-  return mask_dim_4d;
-}
 #endif
 
 static void RaiseNotSupportedError() {
diff --git a/test/legacy_test/test_flash_attention.py b/test/legacy_test/test_flash_attention.py
index 245e51a36a0d0..d78f4cf575a6d 100644
--- a/test/legacy_test/test_flash_attention.py
+++ b/test/legacy_test/test_flash_attention.py
@@ -384,7 +384,7 @@ def setUp(self):
         self.dtype = paddle.float16
         self.dropout = 0.0
         self.causal = False
-        self.return_softmax = True
+        self.return_softmax = False
         self.use_sdp_kernel = False
 
 
@@ -451,7 +451,7 @@ def setUp(self):
         self.enable_mem_efficient = False
 
 
-class TestFlashAttrnionWithMaskAPI(TestFlashAttentionWithMaskAPI):
+class TestFlashAttenionWithMaskAPITest(TestFlashAttentionWithMaskAPI):
     def setUp(self):
         self.place = paddle.CUDAPlace(0)
         self.shape = (8, 1024, 16, 128)
diff --git a/third_party/flashattn b/third_party/flashattn
index e6b9d0d48c29f..b74460b385b69 160000
--- a/third_party/flashattn
+++ b/third_party/flashattn
@@ -1 +1 @@
-Subproject commit e6b9d0d48c29f8205b440dede6a48ceb8394383f
+Subproject commit b74460b385b691d881ff2d3a1adbcefdcac574a3

From 980546f78ad31a4fdf1f583b9cd250b04884e532 Mon Sep 17 00:00:00 2001
From: risemeup1 <62429225+risemeup1@users.noreply.github.com>
Date: Sun, 24 Sep 2023 23:01:03 +0800
Subject: [PATCH 085/115] checked added ut in cpu (#57674)

---
 paddle/scripts/paddle_build.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 91c3ab0c93129..c72f20a24ff53 100644
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -687,7 +687,6 @@ EOF
             echo "Unittests with nightly labels  are only run at night"
             echo "========================================="
         fi
-        bash $PADDLE_ROOT/tools/check_added_ut.sh
         check_approvals_of_unittest 2
         # serial_list: Some single tests need to reduce concurrency
         single_list="^test_cdist$|^test_resnet$|^test_resnet_v2$|^test_concat_op$|^test_transformer$|^test_bert_with_stride$|^test_paddle_save_load$"
@@ -797,7 +796,6 @@ function run_linux_cpu_test() {
 EOF
 set -x
         export TEST_NUM_PERCENT_CASES=0.15
-        bash $PADDLE_ROOT/tools/check_added_ut.sh
         if [ -a "$PADDLE_ROOT/duplicate_ut" ];then
             duplicate_uts=$(cat $PADDLE_ROOT/duplicate_ut|sed -e 's/\r//g')
             if [[ "$duplicate_uts" != "" ]];then
@@ -2741,7 +2739,10 @@ function enable_unused_var_check() {
     # Currently, use it in coverage CI job.
     export FLAGS_enable_unused_var_check=1
 }
-
+function check_coverage_added_ut() {
+    # NOTE(risemeup1):The steps of checkinge added test can be placed on the cpu machine to save gpu resources
+    bash $PADDLE_ROOT/tools/check_added_ut.sh
+}
 function gen_doc_lib() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
@@ -3988,6 +3989,7 @@ function main() {
         check_diff_file_for_coverage
         run_setup ${PYTHON_ABI:-""} bdist_wheel ${parallel_number}
         enable_unused_var_check
+        check_coverage_added_ut
         check_coverage_build
         ;;
       gpu_cicheck_coverage)

From 2072e3026173269f057bfe0455c0a9a183d92060 Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Mon, 25 Sep 2023 09:29:10 +0800
Subject: [PATCH 086/115] add gather_nd (#57640)

* add gather_nd

* Update paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py

* Update test/legacy_test/test_gather_nd_op.py
---
 .../op_generator/vjp_interface_gen_op_list.py |  8 ++++--
 paddle/fluid/primitive/codegen/gen.py         |  8 ++++++
 test/legacy_test/test_gather_nd_op.py         | 28 +++++++++----------
 3 files changed, 28 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
index 62dd904dc7bf6..b00c67aa06b24 100644
--- a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
+++ b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
@@ -55,7 +55,9 @@
     'fused_softmax_mask_upper_triangle',
     'slice',
     'transpose',
-    'slice_double',
+    'slice_grad',
+    'gather_nd',
+    'stack',
     'poisson',
     'gumbel_softmax',
     'tril',
@@ -93,7 +95,9 @@
     'fused_softmax_mask_upper_triangle',
     'slice',
     'transpose',
-    'slice_double',
+    'slice_grad',
+    'gather_nd',
+    'stack',
     'poisson',
     'gumbel_softmax',
     'tril',
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index 522fcda823ebb..b67f3e83de952 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -72,6 +72,10 @@
     'layer_norm_grad',
     'embedding_grad',
     'scale_grad',
+    'gather_nd_grad',
+    'stack_grad',
+    'squeeze_grad',
+    'unsqueeze_grad',
     'poisson_grad',
     'gumbel_softmax_grad',
 ]
@@ -173,6 +177,10 @@
     'gumbel_softmax_grad',
     'split',
     'transpose',
+    'gather_nd_grad',
+    'stack_grad',
+    'squeeze_grad',
+    'unsqueeze_grad',
 ]
 
 
diff --git a/test/legacy_test/test_gather_nd_op.py b/test/legacy_test/test_gather_nd_op.py
index dd1d996715eef..a10faff2ac1f3 100644
--- a/test/legacy_test/test_gather_nd_op.py
+++ b/test/legacy_test/test_gather_nd_op.py
@@ -56,7 +56,7 @@ def test_check_output(self):
         self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
 
 
 class TestGatherNdOpWithEmptyIndexFP16(TestGatherNdOpWithEmptyIndex):
@@ -80,7 +80,7 @@ def test_check_output(self):
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['X'], 'Out', check_prim=True, check_new_ir=False
+            place, ['X'], 'Out', check_prim=True, check_new_ir=True
         )
 
 
@@ -117,7 +117,7 @@ def test_check_output(self):
         self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
 
 
 class TestGatherNdOpWithIndex1_ZeroDim(TestGatherNdOpWithIndex1):
@@ -168,7 +168,7 @@ def test_check_output(self):
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['X'], 'Out', check_prim=True, check_new_ir=False
+            place, ['X'], 'Out', check_prim=True, check_new_ir=True
         )
 
 
@@ -205,7 +205,7 @@ def test_check_output(self):
         self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
 
 
 class TestGatherNdOpWithLowIndexFP16(TestGatherNdOpWithLowIndex):
@@ -233,7 +233,7 @@ def test_check_grad(self):
             ['X'],
             'Out',
             check_prim=True,
-            check_new_ir=False,
+            check_new_ir=True,
             numeric_grad_delta=0.5,
         )
 
@@ -280,7 +280,7 @@ def test_check_grad(self):
             ['X'],
             'Out',
             check_prim=True,
-            check_new_ir=False,
+            check_new_ir=True,
             numeric_grad_delta=0.05,
         )
 
@@ -310,7 +310,7 @@ def test_check_grad(self):
             ['X'],
             'Out',
             check_prim=True,
-            check_new_ir=False,
+            check_new_ir=True,
             numeric_grad_delta=0.5,
         )
 
@@ -345,7 +345,7 @@ def test_check_output(self):
         self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
 
 
 class TestGatherNdOpWithSameIndexAsXFP16(TestGatherNdOpWithSameIndexAsX):
@@ -373,7 +373,7 @@ def test_check_grad(self):
             ['X'],
             'Out',
             check_prim=True,
-            check_new_ir=False,
+            check_new_ir=True,
             numeric_grad_delta=0.5,
         )
 
@@ -410,7 +410,7 @@ def test_check_output(self):
         self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
 
 
 class TestGatherNdOpWithHighRankSameFP16(TestGatherNdOpWithHighRankSame):
@@ -434,7 +434,7 @@ def test_check_output(self):
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['X'], 'Out', check_prim=True, check_new_ir=False
+            place, ['X'], 'Out', check_prim=True, check_new_ir=True
         )
 
 
@@ -471,7 +471,7 @@ def test_check_output(self):
         self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=False)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
 
 
 class TestGatherNdOpWithHighRankDiffFP16(TestGatherNdOpWithHighRankDiff):
@@ -495,7 +495,7 @@ def test_check_output(self):
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         self.check_grad_with_place(
-            place, ['X'], 'Out', check_prim=True, check_new_ir=False
+            place, ['X'], 'Out', check_prim=True, check_new_ir=True
         )
 
 

From a5d27ea4fcc5068de1ef4c007ef12188ddfd9eb6 Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Mon, 25 Sep 2023 09:39:03 +0800
Subject: [PATCH 087/115] =?UTF-8?q?=E3=80=90PIR=E3=80=91modify=20Split=20a?=
 =?UTF-8?q?pi=20pir=20pretreatment=20(#57618)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* modify ci bug

* modify split api

* modify

* modify
---
 python/paddle/tensor/manipulation.py | 42 +++++++++++++++++-----------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 7fc8dc49b8c3a..933ea0a7651a0 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -35,6 +35,7 @@
     dygraph_only,
     in_dynamic_mode,
     in_dynamic_or_pir_mode,
+    in_pir_mode,
 )
 from .creation import _complex_to_real_dtype, _real_to_complex_dtype, zeros
 
@@ -1956,30 +1957,37 @@ def split(x, num_or_sections, axis=0, name=None):
     """
     input = x
     dim = axis
-    if in_dynamic_or_pir_mode():
-        if in_dynamic_mode():
-            if isinstance(dim, Variable):
-                dim = dim.item(0)
+    if in_dynamic_mode():
+        if isinstance(dim, Variable):
+            dim = dim.item(0)
+        assert dim + len(input.shape) >= 0, "(rank(x) + axis) must >= 0"
+        dim = (dim + len(input.shape)) if dim < 0 else dim
+
+        if isinstance(num_or_sections, (list, tuple)):
+            if paddle.utils._contain_var(num_or_sections):
+                for index, item in enumerate(num_or_sections):
+                    if isinstance(item, Variable):
+                        num_or_sections[index] = num_or_sections[index].item()
+        elif not isinstance(num_or_sections, int):
+            raise TypeError(
+                "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but "
+                "received %s." % (type(num_or_sections))
+            )
+
+        if isinstance(num_or_sections, int):
+            return _C_ops.split_with_num(input, num_or_sections, dim)
+        else:
+            return _C_ops.split(input, num_or_sections, dim)
+    elif in_pir_mode():
+        if isinstance(dim, int):
             assert len(input.shape) + dim >= 0, "(rank(x) + axis) must >= 0"
             dim = (len(input.shape) + dim) if dim < 0 else dim
 
-            if isinstance(num_or_sections, (list, tuple)):
-                if paddle.utils._contain_var(num_or_sections):
-                    for index, item in enumerate(num_or_sections):
-                        if isinstance(item, Variable):
-                            num_or_sections[index] = num_or_sections[
-                                index
-                            ].item()
-            elif not isinstance(num_or_sections, int):
-                raise TypeError(
-                    "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but "
-                    "received %s." % (type(num_or_sections))
-                )
-
         if isinstance(num_or_sections, int):
             return _C_ops.split_with_num(input, num_or_sections, dim)
         else:
             return _C_ops.split(input, num_or_sections, dim)
+
     else:
         check_variable_and_dtype(
             input,

From 404d7ac3cd67a6205f9df7db0989479870f2d6b2 Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Mon, 25 Sep 2023 10:20:04 +0800
Subject: [PATCH 088/115] [PIR] Refine Ir op build (#57646)

* refine

* add code

* fix bug

* fix

* fix
---
 .../pir/dialect/op_generator/op_build_gen.py  |   4 +-
 .../pir/dialect/operator/ir/manual_op.cc      | 247 +++++++-----------
 .../pir/dialect/operator/ir/meta_tensor.h     |  12 +-
 paddle/phi/core/meta_tensor.cc                |  13 +-
 4 files changed, 119 insertions(+), 157 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py
index 33bb81e43bf64..ae2a60b8b866c 100644
--- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py
@@ -405,10 +405,10 @@ def GenBuildOutputs(
     {name}.SetFromTensor(true);
   }}\n"""
 
-    CREATE_OUTPUT_METATENSOR_TEMPLATE = """  phi::DenseTensor dense_{name};
+    CREATE_OUTPUT_METATENSOR_TEMPLATE = """  paddle::dialect::IrMetaTensor dense_{name};
   phi::MetaTensor meta_{name}(&dense_{name});
 """
-    CREATE_OUTPUT_VEC_METATENSOR_TEMPLATE = """  std::vector<phi::DenseTensor> vec_dense_{name}(({output_size}), phi::DenseTensor());
+    CREATE_OUTPUT_VEC_METATENSOR_TEMPLATE = """  std::vector<paddle::dialect::IrMetaTensor> vec_dense_{name}(({output_size}), paddle::dialect::IrMetaTensor());
   std::vector<phi::MetaTensor> vec_meta_{name};
   for (size_t i=0; i < static_cast<size_t>({output_size}); i++) {{
     vec_meta_{name}.push_back(phi::MetaTensor(&vec_dense_{name}[i]));
diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
index bb9c8474a373f..17c183ccd7bb2 100644
--- a/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h"
+#include "paddle/fluid/pir/dialect/operator/ir/meta_tensor.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
@@ -111,21 +112,16 @@ void AddNOp::Build(pir::Builder &builder,             // NOLINT
 
   VLOG(4) << "Builder construction outputs";
   pir::VectorType x = inputs.type().dyn_cast<pir::VectorType>();
-  (void)x;
 
-  std::vector<phi::DenseTensor> vec_dense_x;
+  std::vector<paddle::dialect::IrMetaTensor> vec_dense_x;
   for (size_t i = 0; i < x.size(); i++) {
-    vec_dense_x.push_back(phi::DenseTensor(
-        std::make_unique<paddle::experimental::DefaultAllocator>(
-            paddle::platform::CPUPlace())
-            .get(),
-        phi::DenseTensorMeta(
-            TransToPhiDataType(
-                x[i].dyn_cast<paddle::dialect::DenseTensorType>().dtype()),
-            x[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
-            x[i].dyn_cast<paddle::dialect::DenseTensorType>().data_layout(),
-            x[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
-            x[i].dyn_cast<paddle::dialect::DenseTensorType>().offset())));
+    vec_dense_x.push_back(paddle::dialect::IrMetaTensor(
+        TransToPhiDataType(
+            x[i].dyn_cast<paddle::dialect::DenseTensorType>().dtype()),
+        x[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
+        x[i].dyn_cast<paddle::dialect::DenseTensorType>().data_layout(),
+        x[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
+        x[i].dyn_cast<paddle::dialect::DenseTensorType>().offset()));
   }
   std::vector<phi::MetaTensor> vec_meta_x;
   for (size_t i = 0; i < vec_dense_x.size(); i++) {
@@ -136,7 +132,8 @@ void AddNOp::Build(pir::Builder &builder,             // NOLINT
   for (size_t i = 0; i < static_cast<size_t>(vec_meta_x.size()); i++) {
     meta_x.push_back(&vec_meta_x[i]);
   }
-  phi::DenseTensor dense_out;
+
+  paddle::dialect::IrMetaTensor dense_out;
   phi::MetaTensor meta_out(&dense_out);
 
   phi::AddNInferMeta(meta_x, &meta_out);
@@ -189,21 +186,15 @@ void AddN_Op::Build(pir::Builder &builder,
 
   VLOG(4) << "Builder construction outputs";
   pir::VectorType inputs = inputs_.type().dyn_cast<pir::VectorType>();
-  std::vector<phi::DenseTensor> vec_dense_inputs;
+  std::vector<paddle::dialect::IrMetaTensor> vec_dense_inputs;
   for (size_t i = 0; i < static_cast<size_t>(inputs.size()); i++) {
-    vec_dense_inputs.push_back(phi::DenseTensor(
-        std::make_unique<paddle::experimental::DefaultAllocator>(
-            paddle::platform::CPUPlace())
-            .get(),
-        phi::DenseTensorMeta(
-            paddle::dialect::TransToPhiDataType(
-                inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().dtype()),
-            inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
-            inputs[i]
-                .dyn_cast<paddle::dialect::DenseTensorType>()
-                .data_layout(),
-            inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
-            inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().offset())));
+    vec_dense_inputs.push_back(paddle::dialect::IrMetaTensor(
+        paddle::dialect::TransToPhiDataType(
+            inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().dtype()),
+        inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
+        inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().data_layout(),
+        inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
+        inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().offset()));
   }
   std::vector<phi::MetaTensor> vec_meta_inputs;
   for (size_t i = 0; i < vec_dense_inputs.size(); i++) {
@@ -214,7 +205,7 @@ void AddN_Op::Build(pir::Builder &builder,
   for (size_t i = 0; i < static_cast<size_t>(vec_meta_inputs.size()); i++) {
     meta_inputs.push_back(&vec_meta_inputs[i]);
   }
-  phi::DenseTensor dense_out;
+  paddle::dialect::IrMetaTensor dense_out;
   phi::MetaTensor meta_out(&dense_out);
 
   phi::AddNInferMeta(meta_inputs, &meta_out);
@@ -318,21 +309,15 @@ void AddNWithKernelOp::Build(pir::Builder &builder,
 
   VLOG(4) << "Builder construction outputs";
   pir::VectorType inputs = inputs_.type().dyn_cast<pir::VectorType>();
-  std::vector<phi::DenseTensor> vec_dense_inputs;
+  std::vector<paddle::dialect::IrMetaTensor> vec_dense_inputs;
   for (size_t i = 0; i < static_cast<size_t>(inputs.size()); i++) {
-    vec_dense_inputs.push_back(phi::DenseTensor(
-        std::make_unique<paddle::experimental::DefaultAllocator>(
-            paddle::platform::CPUPlace())
-            .get(),
-        phi::DenseTensorMeta(
-            paddle::dialect::TransToPhiDataType(
-                inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().dtype()),
-            inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
-            inputs[i]
-                .dyn_cast<paddle::dialect::DenseTensorType>()
-                .data_layout(),
-            inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
-            inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().offset())));
+    vec_dense_inputs.push_back(paddle::dialect::IrMetaTensor(
+        paddle::dialect::TransToPhiDataType(
+            inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().dtype()),
+        inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
+        inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().data_layout(),
+        inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
+        inputs[i].dyn_cast<paddle::dialect::DenseTensorType>().offset()));
   }
   std::vector<phi::MetaTensor> vec_meta_inputs;
   for (size_t i = 0; i < vec_dense_inputs.size(); i++) {
@@ -343,7 +328,7 @@ void AddNWithKernelOp::Build(pir::Builder &builder,
   for (size_t i = 0; i < static_cast<size_t>(vec_meta_inputs.size()); i++) {
     meta_inputs.push_back(&vec_meta_inputs[i]);
   }
-  phi::DenseTensor dense_out;
+  paddle::dialect::IrMetaTensor dense_out;
   phi::MetaTensor meta_out(&dense_out);
 
   phi::AddNInferMeta(meta_inputs, &meta_out);
@@ -508,46 +493,37 @@ void FusedGemmEpilogueOp::Build(pir::Builder &builder,
   (void)bias;
 
   VLOG(4) << "Builder construction  dense_x";
-  phi::DenseTensor dense_x(
-      std::make_unique<paddle::experimental::DefaultAllocator>(
-          paddle::platform::CPUPlace())
-          .get(),
-      phi::DenseTensorMeta(paddle::dialect::TransToPhiDataType(x.dtype()),
-                           x.dims(),
-                           x.data_layout(),
-                           x.lod(),
-                           x.offset()));
+  paddle::dialect::IrMetaTensor dense_x(
+      paddle::dialect::TransToPhiDataType(x.dtype()),
+      x.dims(),
+      x.data_layout(),
+      x.lod(),
+      x.offset());
   VLOG(4) << "Builder construction  meta_x";
   phi::MetaTensor meta_x(&dense_x);
 
   VLOG(4) << "Builder construction  dense_y";
-  phi::DenseTensor dense_y(
-      std::make_unique<paddle::experimental::DefaultAllocator>(
-          paddle::platform::CPUPlace())
-          .get(),
-      phi::DenseTensorMeta(paddle::dialect::TransToPhiDataType(y.dtype()),
-                           y.dims(),
-                           y.data_layout(),
-                           y.lod(),
-                           y.offset()));
+  paddle::dialect::IrMetaTensor dense_y(
+      paddle::dialect::TransToPhiDataType(y.dtype()),
+      y.dims(),
+      y.data_layout(),
+      y.lod(),
+      y.offset());
   VLOG(4) << "Builder construction  meta_y";
   phi::MetaTensor meta_y(&dense_y);
 
   VLOG(4) << "Builder construction  dense_bias";
-  phi::DenseTensor dense_bias(
-      std::make_unique<paddle::experimental::DefaultAllocator>(
-          paddle::platform::CPUPlace())
-          .get(),
-      phi::DenseTensorMeta(paddle::dialect::TransToPhiDataType(bias.dtype()),
-                           bias.dims(),
-                           bias.data_layout(),
-                           bias.lod(),
-                           bias.offset()));
+  paddle::dialect::IrMetaTensor dense_bias(
+      paddle::dialect::TransToPhiDataType(bias.dtype()),
+      bias.dims(),
+      bias.data_layout(),
+      bias.lod(),
+      bias.offset());
   VLOG(4) << "Builder construction  meta_bias";
   phi::MetaTensor meta_bias(&dense_bias);
-  phi::DenseTensor dense_out;
+  paddle::dialect::IrMetaTensor dense_out;
   phi::MetaTensor meta_out(&dense_out);
-  phi::DenseTensor dense_reserve_space;
+  paddle::dialect::IrMetaTensor dense_reserve_space;
   phi::MetaTensor meta_reserve_space(&dense_reserve_space);
 
   phi::FusedGemmEpilogueInferMeta(
@@ -768,66 +744,52 @@ void FusedGemmEpilogueGradOp::Build(pir::Builder &builder,
   (void)out_grad;
 
   VLOG(4) << "Builder construction  dense_x";
-  phi::DenseTensor dense_x(
-      std::make_unique<paddle::experimental::DefaultAllocator>(
-          paddle::platform::CPUPlace())
-          .get(),
-      phi::DenseTensorMeta(paddle::dialect::TransToPhiDataType(x.dtype()),
-                           x.dims(),
-                           x.data_layout(),
-                           x.lod(),
-                           x.offset()));
+  paddle::dialect::IrMetaTensor dense_x(
+      paddle::dialect::TransToPhiDataType(x.dtype()),
+      x.dims(),
+      x.data_layout(),
+      x.lod(),
+      x.offset());
   VLOG(4) << "Builder construction  meta_x";
   phi::MetaTensor meta_x(&dense_x);
 
   VLOG(4) << "Builder construction  dense_y";
-  phi::DenseTensor dense_y(
-      std::make_unique<paddle::experimental::DefaultAllocator>(
-          paddle::platform::CPUPlace())
-          .get(),
-      phi::DenseTensorMeta(paddle::dialect::TransToPhiDataType(y.dtype()),
-                           y.dims(),
-                           y.data_layout(),
-                           y.lod(),
-                           y.offset()));
+  paddle::dialect::IrMetaTensor dense_y(
+      paddle::dialect::TransToPhiDataType(y.dtype()),
+      y.dims(),
+      y.data_layout(),
+      y.lod(),
+      y.offset());
   VLOG(4) << "Builder construction  meta_y";
   phi::MetaTensor meta_y(&dense_y);
 
   VLOG(4) << "Builder construction  dense_reserve_space";
-  std::unique_ptr<phi::DenseTensor> dense_reserve_space =
+  std::unique_ptr<paddle::dialect::IrMetaTensor> dense_reserve_space =
       reserve_space_
-          ? std::make_unique<phi::DenseTensor>(
-                std::make_unique<paddle::experimental::DefaultAllocator>(
-                    paddle::platform::CPUPlace())
-                    .get(),
-                phi::DenseTensorMeta(
-                    paddle::dialect::TransToPhiDataType(reserve_space.dtype()),
-                    reserve_space.dims(),
-                    reserve_space.data_layout(),
-                    reserve_space.lod(),
-                    reserve_space.offset()))
+          ? std::make_unique<paddle::dialect::IrMetaTensor>(
+                paddle::dialect::TransToPhiDataType(reserve_space.dtype()),
+                reserve_space.dims(),
+                reserve_space.data_layout(),
+                reserve_space.lod(),
+                reserve_space.offset())
           : nullptr;
   VLOG(4) << "Builder construction  meta_reserve_space";
   phi::MetaTensor meta_reserve_space(dense_reserve_space.get());
 
   VLOG(4) << "Builder construction  dense_out_grad";
-  phi::DenseTensor dense_out_grad(
-      std::make_unique<paddle::experimental::DefaultAllocator>(
-          paddle::platform::CPUPlace())
-          .get(),
-      phi::DenseTensorMeta(
-          paddle::dialect::TransToPhiDataType(out_grad.dtype()),
-          out_grad.dims(),
-          out_grad.data_layout(),
-          out_grad.lod(),
-          out_grad.offset()));
+  paddle::dialect::IrMetaTensor dense_out_grad(
+      paddle::dialect::TransToPhiDataType(out_grad.dtype()),
+      out_grad.dims(),
+      out_grad.data_layout(),
+      out_grad.lod(),
+      out_grad.offset());
   VLOG(4) << "Builder construction  meta_out_grad";
   phi::MetaTensor meta_out_grad(&dense_out_grad);
-  phi::DenseTensor dense_x_grad;
+  paddle::dialect::IrMetaTensor dense_x_grad;
   phi::MetaTensor meta_x_grad(&dense_x_grad);
-  phi::DenseTensor dense_y_grad;
+  paddle::dialect::IrMetaTensor dense_y_grad;
   phi::MetaTensor meta_y_grad(&dense_y_grad);
-  phi::DenseTensor dense_bias_grad;
+  paddle::dialect::IrMetaTensor dense_bias_grad;
   phi::MetaTensor meta_bias_grad(&dense_bias_grad);
 
   phi::FusedGemmEpilogueGradInferMeta(meta_x,
@@ -929,25 +891,15 @@ void SplitGradOp::Build(pir::Builder &builder,
 
   VLOG(4) << "Builder construction outputs";
   pir::VectorType out_grad = out_grad_.type().dyn_cast<pir::VectorType>();
-  std::vector<phi::DenseTensor> vec_dense_out_grad;
+  std::vector<paddle::dialect::IrMetaTensor> vec_dense_out_grad;
   for (size_t i = 0; i < static_cast<size_t>(out_grad.size()); i++) {
-    vec_dense_out_grad.push_back(phi::DenseTensor(
-        std::make_unique<paddle::experimental::DefaultAllocator>(
-            paddle::platform::CPUPlace())
-            .get(),
-        phi::DenseTensorMeta(
-            paddle::dialect::TransToPhiDataType(
-                out_grad[i]
-                    .dyn_cast<paddle::dialect::DenseTensorType>()
-                    .dtype()),
-            out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
-            out_grad[i]
-                .dyn_cast<paddle::dialect::DenseTensorType>()
-                .data_layout(),
-            out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
-            out_grad[i]
-                .dyn_cast<paddle::dialect::DenseTensorType>()
-                .offset())));
+    vec_dense_out_grad.push_back(paddle::dialect::IrMetaTensor(
+        paddle::dialect::TransToPhiDataType(
+            out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().dtype()),
+        out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
+        out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().data_layout(),
+        out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
+        out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().offset()));
   }
   std::vector<phi::MetaTensor> vec_meta_out_grad;
   for (size_t i = 0; i < vec_dense_out_grad.size(); i++) {
@@ -958,7 +910,7 @@ void SplitGradOp::Build(pir::Builder &builder,
   for (size_t i = 0; i < static_cast<size_t>(vec_meta_out_grad.size()); i++) {
     meta_out_grad.push_back(&vec_meta_out_grad[i]);
   }
-  phi::DenseTensor dense_x_grad;
+  paddle::dialect::IrMetaTensor dense_x_grad;
   phi::MetaTensor meta_x_grad(&dense_x_grad);
 
   phi::ConcatInferMeta(meta_out_grad, axis, &meta_x_grad);
@@ -995,24 +947,15 @@ void SplitGradOp::Build(pir::Builder &builder,
                  .data()
                  .to<int>();
 
-  std::vector<phi::DenseTensor> vec_dense_out_grad;
+  std::vector<paddle::dialect::IrMetaTensor> vec_dense_out_grad;
   for (size_t i = 0; i < static_cast<size_t>(out_grad.size()); i++) {
-    vec_dense_out_grad.push_back(phi::DenseTensor(
-        std::make_unique<paddle::experimental::DefaultAllocator>(
-            paddle::platform::CPUPlace())
-            .get(),
-        phi::DenseTensorMeta(
-            TransToPhiDataType(out_grad[i]
-                                   .dyn_cast<paddle::dialect::DenseTensorType>()
-                                   .dtype()),
-            out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
-            out_grad[i]
-                .dyn_cast<paddle::dialect::DenseTensorType>()
-                .data_layout(),
-            out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
-            out_grad[i]
-                .dyn_cast<paddle::dialect::DenseTensorType>()
-                .offset())));
+    vec_dense_out_grad.push_back(paddle::dialect::IrMetaTensor(
+        TransToPhiDataType(
+            out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().dtype()),
+        out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().dims(),
+        out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().data_layout(),
+        out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().lod(),
+        out_grad[i].dyn_cast<paddle::dialect::DenseTensorType>().offset()));
   }
   std::vector<phi::MetaTensor> vec_meta_out_grad;
   for (size_t i = 0; i < vec_dense_out_grad.size(); i++) {
@@ -1023,7 +966,7 @@ void SplitGradOp::Build(pir::Builder &builder,
   for (size_t i = 0; i < static_cast<size_t>(vec_meta_out_grad.size()); i++) {
     meta_out_grad.push_back(&vec_meta_out_grad[i]);
   }
-  phi::DenseTensor dense_x_grad;
+  paddle::dialect::IrMetaTensor dense_x_grad;
   phi::MetaTensor meta_x_grad(&dense_x_grad);
 
   phi::ConcatInferMeta(meta_out_grad, axis, &meta_x_grad);
diff --git a/paddle/fluid/pir/dialect/operator/ir/meta_tensor.h b/paddle/fluid/pir/dialect/operator/ir/meta_tensor.h
index dbc566af2e4fe..a82cf22d63cf1 100644
--- a/paddle/fluid/pir/dialect/operator/ir/meta_tensor.h
+++ b/paddle/fluid/pir/dialect/operator/ir/meta_tensor.h
@@ -51,14 +51,22 @@ class IrMetaTensor : public phi::TensorBase,
 
   const phi::DDim& dims() const noexcept override { return dims_; }
 
+  void SetDims(const phi::DDim& dims) { dims_ = dims; }
+
   const phi::Place& place() const override;
 
   phi::DataType dtype() const noexcept override { return dtype_; }
 
+  void SetDtype(phi::DataType dtype) { dtype_ = dtype; }
+
   phi::DataLayout layout() const noexcept override { return layout_; }
 
+  void SetLayout(phi::DataLayout layout) { layout_ = layout; }
+
   const LoD& lod() const noexcept { return lod_; }
 
+  void SetLod(LoD lod) { lod_ = lod; }
+
   size_t offset() const noexcept { return offset_; }
 
   bool valid() const noexcept override { return true; }
@@ -72,8 +80,8 @@ class IrMetaTensor : public phi::TensorBase,
 
  private:
   phi::DDim dims_;
-  phi::DataType dtype_{phi::DataType::UNDEFINED};
-  phi::DataLayout layout_{phi::DataLayout::NCHW};
+  phi::DataType dtype_{phi::DataType::FLOAT32};
+  phi::DataLayout layout_{phi::DataLayout::ANY};
   LoD lod_;
   size_t offset_{0};
 };
diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc
index 53cba02ab0765..7f156463ca17b 100644
--- a/paddle/phi/core/meta_tensor.cc
+++ b/paddle/phi/core/meta_tensor.cc
@@ -75,6 +75,8 @@ void MetaTensor::set_dims(const DDim& dims) {
     if (!strided_kernel_used_) {
       meta->strides = meta->calc_strides(dims);
     }
+  } else if (paddle::dialect::IrMetaTensor::classof(tensor_)) {
+    static_cast<paddle::dialect::IrMetaTensor*>(tensor_)->SetDims(dims);
   } else if (phi::StringTensor::classof(tensor_)) {
     StringTensorUtils::GetMutableMeta(static_cast<StringTensor*>(tensor_))
         ->dims = dims;
@@ -107,6 +109,8 @@ void MetaTensor::set_dtype(DataType dtype) {
   if (phi::DenseTensor::classof(tensor_)) {
     DenseTensorUtils::GetMutableMeta(static_cast<DenseTensor*>(tensor_))
         ->dtype = dtype;
+  } else if (paddle::dialect::IrMetaTensor::classof(tensor_)) {
+    static_cast<paddle::dialect::IrMetaTensor*>(tensor_)->SetDtype(dtype);
   } else if (phi::StringTensor::classof(tensor_)) {
     // No need to set dtype
   } else if (phi::SelectedRows::classof(tensor_)) {
@@ -136,6 +140,8 @@ void MetaTensor::set_layout(DataLayout layout) {
     if (!strided_kernel_used_) {
       meta->strides = meta->calc_strides(meta->dims);
     }
+  } else if (paddle::dialect::IrMetaTensor::classof(tensor_)) {
+    static_cast<paddle::dialect::IrMetaTensor*>(tensor_)->SetLayout(layout);
   } else if (phi::StringTensor::classof(tensor_)) {
     // No need to set layout
   } else if (phi::SelectedRows::classof(tensor_)) {
@@ -178,6 +184,9 @@ void MetaTensor::share_lod(const MetaTensor& meta_tensor) {
     DenseTensorUtils::GetMutableMeta(
         static_cast<SelectedRows*>(tensor_)->mutable_value())
         ->lod = meta_tensor.lod();
+  } else if (paddle::dialect::IrMetaTensor::classof(tensor_)) {
+    static_cast<paddle::dialect::IrMetaTensor*>(tensor_)->SetLod(
+        meta_tensor.lod());
   } else {
     PADDLE_THROW(
         phi::errors::Unimplemented("Unsupported sharing lod inplace for `%s`.",
@@ -188,6 +197,7 @@ void MetaTensor::share_lod(const MetaTensor& meta_tensor) {
 void MetaTensor::share_meta(const MetaTensor& meta_tensor) {
   ValidCheck(*this);
   if (phi::DenseTensor::classof(tensor_) ||
+      paddle::dialect::IrMetaTensor::classof(tensor_) ||
       phi::SelectedRows::classof(tensor_) ||
       phi::SparseCooTensor::classof(tensor_) ||
       phi::SparseCsrTensor::classof(tensor_) ||
@@ -224,8 +234,9 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) {
   bool is_sparse_coo = phi::SparseCooTensor::classof(tensor_);
   bool is_sparse_csr = phi::SparseCsrTensor::classof(tensor_);
   bool is_dist_tensor = phi::distributed::DistTensor::classof(tensor_);
+  bool is_ir_meta_tensor = paddle::dialect::IrMetaTensor::classof(tensor_);
   if (is_dense_tensor || is_selected_rows || is_sparse_coo || is_sparse_csr ||
-      is_dist_tensor) {
+      is_dist_tensor || is_ir_meta_tensor) {
     if (is_selected_rows) {
       const auto in_tensor_base = meta_tensor.tensor();
       PADDLE_ENFORCE_EQ(

From 87b5c9f4b0e5c6dc610651ca3d149424edf2e9d4 Mon Sep 17 00:00:00 2001
From: iLeGend <824040212@qq.com>
Date: Mon, 25 Sep 2023 10:44:35 +0800
Subject: [PATCH 089/115] [Doctest]fix No.219-223, test=docs_preview (#57601)

---
 .../distributed/fleet/dataset/dataset.py      | 835 +++++++++---------
 .../distributed/fleet/layers/mpu/mp_layers.py | 181 ++--
 .../distributed/fleet/layers/mpu/mp_ops.py    |  26 +-
 .../parallel_layers/pp_layers.py              | 123 +--
 .../distributed/fleet/metrics/metric.py       | 148 ++--
 5 files changed, 673 insertions(+), 640 deletions(-)

diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py
index bdaa8cbb5ea08..d0c7ca3b7b644 100755
--- a/python/paddle/distributed/fleet/dataset/dataset.py
+++ b/python/paddle/distributed/fleet/dataset/dataset.py
@@ -79,9 +79,9 @@ def _set_pipe_command(self, pipe_command):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.dataset.DatasetBase()
-              dataset._set_pipe_command("python my_script.py")
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.dataset.DatasetBase()
+                >>> dataset._set_pipe_command("python my_script.py")
 
         Args:
             pipe_command(str): pipe command
@@ -96,9 +96,9 @@ def _set_batch_size(self, batch_size):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.DatasetBase()
-              dataset._set_batch_size(128)
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.DatasetBase()
+                >>> dataset._set_batch_size(128)
 
         Args:
             batch_size(int): batch size
@@ -113,9 +113,9 @@ def _set_thread(self, thread_num):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.DatasetBase()
-              dataset._set_thread(12)
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.DatasetBase()
+                >>> dataset._set_thread(12)
 
         Args:
             thread_num(int): thread num
@@ -130,9 +130,9 @@ def set_filelist(self, filelist):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.DatasetBase()
-              dataset.set_filelist(['a.txt', 'b.txt'])
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.DatasetBase()
+                >>> dataset.set_filelist(['a.txt', 'b.txt'])
 
         Args:
             filelist(list[str]): list of file names of inputs.
@@ -150,9 +150,9 @@ def _set_uid_slot(self, uid_slot):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.DatasetBase()
-              dataset._set_uid_slot('6048')
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.DatasetBase()
+                >>> dataset._set_uid_slot('6048')
 
         Args:
             set_uid_slot(string): user slot name
@@ -167,9 +167,9 @@ def _set_use_var(self, var_list):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.DatasetBase()
-              dataset._set_use_var([data, label])
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.DatasetBase()
+                >>> dataset._set_use_var([data, label])
 
         Args:
             var_list(list): variable list
@@ -198,9 +198,9 @@ def _set_hdfs_config(self, fs_name, fs_ugi):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.DatasetBase()
-              dataset._set_hdfs_config("my_fs_name", "my_fs_ugi")
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.DatasetBase()
+                >>> dataset._set_hdfs_config("my_fs_name", "my_fs_ugi")
 
         Args:
             fs_name(str): fs name
@@ -215,9 +215,9 @@ def _set_download_cmd(self, download_cmd):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.DatasetBase()
-              dataset._set_download_cmd("./read_from_afs")
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.DatasetBase()
+                >>> dataset._set_download_cmd("./read_from_afs")
 
         Args:
             download_cmd(str): customized download command
@@ -259,9 +259,10 @@ def _desc(self):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.DatasetBase()
-              print(dataset._desc())
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.DatasetBase()
+                >>> print(dataset._desc())
+                pipe_command: "cat"
 
         Returns:
             A string message
@@ -283,12 +284,12 @@ def _check_use_var_with_data_generator(
         Examples:
             .. code-block:: python
 
-              # required: skiptest
-              import paddle
-              from dataset_generator import CTRDataset
-              dataset = paddle.distributed.fleet.DatasetBase()
-              generator_class = CTRDataset()
-              dataset._check_use_var_with_data_generator([data, label], generator_class, "data/part-00000")
+                >>> # doctest: +SKIP('need to work with real dataset')
+                >>> import paddle
+                >>> from dataset_generator import CTRDataset
+                >>> dataset = paddle.distributed.fleet.DatasetBase()
+                >>> generator_class = CTRDataset()
+                >>> dataset._check_use_var_with_data_generator([data, label], generator_class, "data/part-00000")
 
         Args:
             var_list(list): variable list
@@ -357,9 +358,9 @@ class InMemoryDataset(DatasetBase):
     Examples:
         .. code-block:: python
 
-            import paddle
-            paddle.enable_static()
-            dataset = paddle.distributed.InMemoryDataset()
+            >>> import paddle
+            >>> paddle.enable_static()
+            >>> dataset = paddle.distributed.InMemoryDataset()
 
     """
 
@@ -400,20 +401,21 @@ def _init_distributed_settings(self, **kwargs):
         Examples:
             .. code-block:: python
 
-              import paddle
-              paddle.enable_static()
-              dataset = paddle.distributed.InMemoryDataset()
-              dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=[])
-              dataset._init_distributed_settings(
-                    parse_ins_id=True,
-                    parse_content=True,
-                    fea_eval=True,
-                    candidate_size=10000)
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=[])
+
+                >>> dataset._init_distributed_settings(
+                ...     parse_ins_id=True,
+                ...     parse_content=True,
+                ...     fea_eval=True,
+                ...     candidate_size=10000)
 
         """
         merge_size = kwargs.get("merge_size", -1)
@@ -473,22 +475,22 @@ def update_settings(self, **kwargs):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
+                >>> import paddle
+                >>> paddle.enable_static()
 
-                dataset = paddle.distributed.InMemoryDataset()
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=[])
-                dataset._init_distributed_settings(
-                    parse_ins_id=True,
-                    parse_content=True,
-                    fea_eval=True,
-                    candidate_size=10000)
-                dataset.update_settings(batch_size=2)
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=[])
+                >>> dataset._init_distributed_settings(
+                ...     parse_ins_id=True,
+                ...     parse_content=True,
+                ...     fea_eval=True,
+                ...     candidate_size=10000)
+                >>> dataset.update_settings(batch_size=2)
 
         """
         for key in kwargs:
@@ -543,45 +545,44 @@ def init(self, **kwargs):
         Examples:
             .. code-block:: python
 
-                import paddle
-                import os
-                paddle.enable_static()
-
-                with open("test_queue_dataset_run_a.txt", "w") as f:
-                    data = "2 1 2 2 5 4 2 2 7 2 1 3"
-                    f.write(data)
-                with open("test_queue_dataset_run_b.txt", "w") as f:
-                    data = "2 1 2 2 5 4 2 2 7 2 1 3"
-                    f.write(data)
-
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-
-                dataset = paddle.distributed.InMemoryDataset()
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                dataset.set_filelist(
-                    ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
-                dataset.load_into_memory()
-
-                place = paddle.CPUPlace()
-                exe = paddle.static.Executor(place)
-                startup_program = paddle.static.Program()
-                main_program = paddle.static.Program()
-                exe.run(startup_program)
-
-                exe.train_from_dataset(main_program, dataset)
-
-                os.remove("./test_queue_dataset_run_a.txt")
-                os.remove("./test_queue_dataset_run_b.txt")
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> import os
+                >>> paddle.enable_static()
+
+                >>> with open("test_queue_dataset_run_a.txt", "w") as f:
+                ...     data = "2 1 2 2 5 4 2 2 7 2 1 3"
+                ...     f.write(data)
+                >>> with open("test_queue_dataset_run_b.txt", "w") as f:
+                ...     data = "2 1 2 2 5 4 2 2 7 2 1 3"
+                ...     f.write(data)
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+                >>> dataset.set_filelist(
+                ...     ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
+                >>> dataset.load_into_memory()
+
+                >>> place = paddle.CPUPlace()
+                >>> exe = paddle.static.Executor(place)
+                >>> startup_program = paddle.static.Program()
+                >>> main_program = paddle.static.Program()
+                >>> exe.run(startup_program)
+
+                >>> exe.train_from_dataset(main_program, dataset)
+
+                >>> os.remove("./test_queue_dataset_run_a.txt")
+                >>> os.remove("./test_queue_dataset_run_b.txt")
 
         """
         batch_size = kwargs.get("batch_size", 1)
@@ -668,10 +669,10 @@ def _set_queue_num(self, queue_num):
         Examples:
             .. code-block:: python
 
-              import paddle
-              paddle.enable_static()
-              dataset = paddle.distributed.InMemoryDataset()
-              dataset._set_queue_num(12)
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset._set_queue_num(12)
 
         """
         self.is_user_set_queue_num = True
@@ -687,10 +688,10 @@ def _set_parse_ins_id(self, parse_ins_id):
         Examples:
             .. code-block:: python
 
-              import paddle
-              paddle.enable_static()
-              dataset = paddle.distributed.InMemoryDataset()
-              dataset._set_parse_ins_id(True)
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset._set_parse_ins_id(True)
 
         """
         self.parse_ins_id = parse_ins_id
@@ -705,10 +706,10 @@ def _set_parse_content(self, parse_content):
         Examples:
             .. code-block:: python
 
-              import paddle
-              paddle.enable_static()
-              dataset = paddle.distributed.InMemoryDataset()
-              dataset._set_parse_content(True)
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset._set_parse_content(True)
 
         """
         self.parse_content = parse_content
@@ -723,10 +724,10 @@ def _set_fleet_send_batch_size(self, fleet_send_batch_size=1024):
         Examples:
             .. code-block:: python
 
-              import paddle
-              paddle.enable_static()
-              dataset = paddle.distributed.InMemoryDataset()
-              dataset._set_fleet_send_batch_size(800)
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset._set_fleet_send_batch_size(800)
 
         """
         self.fleet_send_batch_size = fleet_send_batch_size
@@ -741,10 +742,10 @@ def _set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0):
         Examples:
             .. code-block:: python
 
-              import paddle
-              paddle.enable_static()
-              dataset = paddle.distributed.InMemoryDataset()
-              dataset._set_fleet_send_sleep_seconds(2)
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset._set_fleet_send_sleep_seconds(2)
 
         """
         self.fleet_send_sleep_seconds = fleet_send_sleep_seconds
@@ -760,10 +761,10 @@ def _set_merge_by_lineid(self, merge_size=2):
         Examples:
             .. code-block:: python
 
-              import paddle
-              paddle.enable_static()
-              dataset = paddle.distributed.InMemoryDataset()
-              dataset._set_merge_by_lineid()
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset._set_merge_by_lineid()
 
         """
         self.dataset.set_merge_by_lineid(merge_size)
@@ -780,10 +781,11 @@ def _set_shuffle_by_uid(self, enable_shuffle_uid):
         Examples:
             .. code-block:: python
 
-              import paddle
-              paddle.enable_static()
-              dataset = paddle.distributed.InMemoryDataset()
-              dataset._set_shuffle_by_uid(True)
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset._set_shuffle_by_uid(True)
+
         """
         self.dataset.set_shuffle_by_uid(enable_shuffle_uid)
 
@@ -811,23 +813,24 @@ def set_date(self, date):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-
-                dataset = paddle.distributed.InMemoryDataset()
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                dataset.set_date("20211111")
+                >>> import paddle
+                >>> paddle.enable_static()
+
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+                >>> dataset.set_date("20211111")
+
         """
         year = int(date[:4])
         month = int(date[4:6])
@@ -867,25 +870,27 @@ def load_into_memory(self, is_shuffle=False):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-
-                dataset = paddle.distributed.InMemoryDataset()
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                filelist = ["a.txt", "b.txt"]
-                dataset.set_filelist(filelist)
-                dataset.load_into_memory()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> paddle.enable_static()
+
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+
         """
         self._prepare_to_run()
         if not self.use_ps_gpu:
@@ -906,26 +911,28 @@ def preload_into_memory(self, thread_num=None):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-
-                dataset = paddle.distributed.InMemoryDataset()
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                filelist = ["a.txt", "b.txt"]
-                dataset.set_filelist(filelist)
-                dataset.preload_into_memory()
-                dataset.wait_preload_done()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> paddle.enable_static()
+
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.preload_into_memory()
+                >>> dataset.wait_preload_done()
+
         """
         self._prepare_to_run()
         if thread_num is None:
@@ -943,26 +950,28 @@ def wait_preload_done(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-
-                dataset = paddle.distributed.InMemoryDataset()
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                filelist = ["a.txt", "b.txt"]
-                dataset.set_filelist(filelist)
-                dataset.preload_into_memory()
-                dataset.wait_preload_done()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> paddle.enable_static()
+
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.preload_into_memory()
+                >>> dataset.wait_preload_done()
+
         """
         self.dataset.wait_preload_done()
         self.dataset.destroy_preload_readers()
@@ -976,26 +985,28 @@ def local_shuffle(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-
-                dataset = paddle.distributed.InMemoryDataset()
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                filelist = ["a.txt", "b.txt"]
-                dataset.set_filelist(filelist)
-                dataset.load_into_memory()
-                dataset.local_shuffle()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> paddle.enable_static()
+
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+                >>> dataset.local_shuffle()
+
         """
         self.dataset.local_shuffle()
 
@@ -1011,26 +1022,27 @@ def global_shuffle(self, fleet=None, thread_num=12):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-
-                dataset = paddle.distributed.InMemoryDataset()
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                filelist = ["a.txt", "b.txt"]
-                dataset.set_filelist(filelist)
-                dataset.load_into_memory()
-                dataset.global_shuffle()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> paddle.enable_static()
+
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+                >>> dataset.global_shuffle()
 
         Args:
             fleet(Fleet): fleet singleton. Default None.
@@ -1068,32 +1080,33 @@ def release_memory(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-
-                dataset = paddle.distributed.InMemoryDataset()
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                filelist = ["a.txt", "b.txt"]
-                dataset.set_filelist(filelist)
-                dataset.load_into_memory()
-                dataset.global_shuffle()
-                exe = paddle.static.Executor(paddle.CPUPlace())
-                startup_program = paddle.static.Program()
-                main_program = paddle.static.Program()
-                exe.run(startup_program)
-                exe.train_from_dataset(main_program, dataset)
-                dataset.release_memory()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> paddle.enable_static()
+
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+                >>> dataset.global_shuffle()
+                >>> exe = paddle.static.Executor(paddle.CPUPlace())
+                >>> startup_program = paddle.static.Program()
+                >>> main_program = paddle.static.Program()
+                >>> exe.run(startup_program)
+                >>> exe.train_from_dataset(main_program, dataset)
+                >>> dataset.release_memory()
 
         """
         self.dataset.release_memory()
@@ -1117,26 +1130,29 @@ def get_memory_data_size(self, fleet=None):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-
-                dataset = paddle.distributed.InMemoryDataset()
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                filelist = ["a.txt", "b.txt"]
-                dataset.set_filelist(filelist)
-                dataset.load_into_memory()
-                print dataset.get_memory_data_size()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> paddle.enable_static()
+
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+                >>> print(dataset.get_memory_data_size())
 
         """
         import numpy as np
@@ -1171,28 +1187,31 @@ def get_shuffle_data_size(self, fleet=None):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-
-                dataset = paddle.distributed.InMemoryDataset()
-                dataset = paddle.distributed.InMemoryDataset()
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                filelist = ["a.txt", "b.txt"]
-                dataset.set_filelist(filelist)
-                dataset.load_into_memory()
-                dataset.global_shuffle()
-                print dataset.get_shuffle_data_size()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> paddle.enable_static()
+
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+                >>> dataset.global_shuffle()
+                >>> print(dataset.get_shuffle_data_size())
 
         """
         import numpy as np
@@ -1245,27 +1264,28 @@ def slots_shuffle(self, slots):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-
-                dataset = paddle.distributed.InMemoryDataset()
-                dataset._init_distributed_settings(fea_eval=True)
-                slots = ["slot1", "slot2", "slot3", "slot4"]
-                slots_vars = []
-                for slot in slots:
-                    var = paddle.static.data(
-                        name=slot, shape=[None, 1], dtype="int64", lod_level=1)
-                    slots_vars.append(var)
-                dataset.init(
-                    batch_size=1,
-                    thread_num=2,
-                    input_type=1,
-                    pipe_command="cat",
-                    use_var=slots_vars)
-                filelist = ["a.txt", "b.txt"]
-                dataset.set_filelist(filelist)
-                dataset.load_into_memory()
-                dataset.slots_shuffle(['slot1'])
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> paddle.enable_static()
+
+                >>> dataset = paddle.distributed.InMemoryDataset()
+                >>> dataset._init_distributed_settings(fea_eval=True)
+                >>> slots = ["slot1", "slot2", "slot3", "slot4"]
+                >>> slots_vars = []
+                >>> for slot in slots:
+                ...     var = paddle.static.data(
+                ...         name=slot, shape=[None, 1], dtype="int64", lod_level=1)
+                ...     slots_vars.append(var)
+                >>> dataset.init(
+                ...     batch_size=1,
+                ...     thread_num=2,
+                ...     input_type=1,
+                ...     pipe_command="cat",
+                ...     use_var=slots_vars)
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+                >>> dataset.slots_shuffle(['slot1'])
         """
         if self.fea_eval:
             slots_set = set(slots)
@@ -1325,8 +1345,8 @@ class FileInstantDataset(DatasetBase):
     Examples:
         .. code-block:: python
 
-          import paddle
-          dataset = paddle.distributed.fleet.FileInstantDataset()
+            >>> import paddle
+            >>> dataset = paddle.distributed.fleet.FileInstantDataset()
     """
 
     def __init__(self):
@@ -1350,8 +1370,8 @@ class BoxPSDataset(InMemoryDataset):
     Examples:
         .. code-block:: python
 
-          import paddle
-          dataset = paddle.distributed.fleet.BoxPSDataset()
+            >>> import paddle
+            >>> dataset = paddle.distributed.fleet.BoxPSDataset()
     """
 
     def __init__(self):
@@ -1386,9 +1406,9 @@ def _set_rank_offset(self, rank_offset):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              dataset._set_rank_offset("rank_offset")
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> dataset._set_rank_offset("rank_offset")
 
         Args:
             rank_offset(str): rank_offset's name
@@ -1403,9 +1423,9 @@ def _set_pv_batch_size(self, pv_batch_size):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              dataset._set_pv_batch_size(128)
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> dataset._set_pv_batch_size(128)
         Args:
             pv_batch_size(int): pv batch size
 
@@ -1422,9 +1442,9 @@ def _set_parse_logkey(self, parse_logkey):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              dataset._set_parse_logkey(True)
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> dataset._set_parse_logkey(True)
 
         """
         self.parse_logkey = parse_logkey
@@ -1439,9 +1459,9 @@ def _set_merge_by_sid(self, merge_by_sid):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              dataset._set_merge_by_sid(True)
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> dataset._set_merge_by_sid(True)
 
         """
         self.merge_by_sid = merge_by_sid
@@ -1456,9 +1476,9 @@ def _set_enable_pv_merge(self, enable_pv_merge):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              dataset._set_enable_pv_merge(True)
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> dataset._set_enable_pv_merge(True)
 
         """
         self.enable_pv_merge = enable_pv_merge
@@ -1480,9 +1500,9 @@ def begin_pass(self):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              dataset.begin_pass()
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> dataset.begin_pass()
         """
         self.boxps.begin_pass()
 
@@ -1493,9 +1513,9 @@ def end_pass(self, need_save_delta):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              dataset.end_pass(True)
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> dataset.end_pass(True)
         """
         self.boxps.end_pass(need_save_delta)
 
@@ -1506,12 +1526,13 @@ def wait_preload_done(self):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.preload_into_memory()
-              dataset.wait_preload_done()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.preload_into_memory()
+                >>> dataset.wait_preload_done()
         """
         self.boxps.wait_feed_pass_done()
 
@@ -1521,11 +1542,12 @@ def load_into_memory(self):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
         """
         self._prepare_to_run()
         self.boxps.load_into_memory()
@@ -1536,11 +1558,12 @@ def preload_into_memory(self):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.preload_into_memory()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.preload_into_memory()
         """
         self._prepare_to_run()
         self.boxps.preload_into_memory()
@@ -1582,12 +1605,13 @@ def set_current_phase(self, current_phase):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              dataset.set_current_phase(1)
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+                >>> dataset.set_current_phase(1)
 
         """
         self.dataset.set_current_phase(current_phase)
@@ -1606,12 +1630,13 @@ def get_pv_data_size(self):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              print dataset.get_pv_data_size()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+                >>> print(dataset.get_pv_data_size())
 
         """
         return self.dataset.get_pv_data_size()
@@ -1624,12 +1649,13 @@ def preprocess_instance(self):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              dataset.preprocess_instance()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+                >>> dataset.preprocess_instance()
 
         """
         self.dataset.preprocess_instance()
@@ -1641,14 +1667,15 @@ def postprocess_instance(self):
         Examples:
             .. code-block:: python
 
-              import paddle
-              dataset = paddle.distributed.fleet.BoxPSDataset()
-              filelist = ["a.txt", "b.txt"]
-              dataset.set_filelist(filelist)
-              dataset.load_into_memory()
-              dataset.preprocess_instance()
-              exe.train_from_dataset(dataset)
-              dataset.postprocess_instance()
+                >>> # doctest: +SKIP('No files to read')
+                >>> import paddle
+                >>> dataset = paddle.distributed.fleet.BoxPSDataset()
+                >>> filelist = ["a.txt", "b.txt"]
+                >>> dataset.set_filelist(filelist)
+                >>> dataset.load_into_memory()
+                >>> dataset.preprocess_instance()
+                >>> exe.train_from_dataset(dataset)
+                >>> dataset.postprocess_instance()
 
         """
         self.dataset.postprocess_instance()
diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py
index 67b88cb52ab45..c3ebb10c32140 100644
--- a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py
+++ b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py
@@ -61,36 +61,34 @@ class VocabParallelEmbedding(paddle.nn.Layer):
 
     Examples:
         .. code-block:: python
-        import paddle
-        from paddle.distributed import fleet
-
-        class SimpleMPNet(paddle.nn.Layer):
-           def __init__(self, vocab_size, hidden_size, inner_size, output_size):
-              super().__init__()
-              self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
-                    hidden_size,
-                    inner_size,
-                    gather_output=False,
-                    has_bias=True)
-
-              self.linear2 = fleet.meta_parallel.RowParallelLinear(
-                    inner_size,
-                    hidden_size,
-                    input_is_parallel=True,
-                    has_bias=True)
-
-              self.linear3 = paddle.nn.Linear(hidden_size, output_size)
-
-              self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
-                                vocab_size,
-                                hidden_size)
-
-           def forward(self, x):
-              x = self.embedding(x)
-              x = self.linear1(x)
-              x = self.linear2(x)
-              x = self.linear3(x)
-              return x
+
+            >>> import paddle
+            >>> from paddle.distributed import fleet
+
+            >>> class SimpleMPNet(paddle.nn.Layer):
+            ...     def __init__(self, vocab_size, hidden_size, inner_size, output_size):
+            ...         super().__init__()
+            ...         self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
+            ...             hidden_size,
+            ...             inner_size,
+            ...             gather_output=False,
+            ...             has_bias=True)
+            ...         self.linear2 = fleet.meta_parallel.RowParallelLinear(
+            ...             inner_size,
+            ...             hidden_size,
+            ...             input_is_parallel=True,
+            ...             has_bias=True)
+            ...         self.linear3 = paddle.nn.Linear(hidden_size, output_size)
+            ...         self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
+            ...                         vocab_size,
+            ...                         hidden_size)
+            ...     def forward(self, x):
+            ...         x = self.embedding(x)
+            ...         x = self.linear1(x)
+            ...         x = self.linear2(x)
+            ...         x = self.linear3(x)
+            ...         return x
+
     """
 
     def __init__(
@@ -327,36 +325,33 @@ class ColumnParallelLinear(paddle.nn.Layer):
 
     Examples:
         .. code-block:: python
-        import paddle
-        from paddle.distributed import fleet
-
-        class SimpleMPNet(paddle.nn.Layer):
-           def __init__(self, vocab_size, hidden_size, inner_size, output_size):
-              super().__init__()
-              self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
-                    hidden_size,
-                    inner_size,
-                    gather_output=False,
-                    has_bias=True)
-
-              self.linear2 = fleet.meta_parallel.RowParallelLinear(
-                    inner_size,
-                    hidden_size,
-                    input_is_parallel=True,
-                    has_bias=True)
-
-              self.linear3 = paddle.nn.Linear(hidden_size, output_size)
-
-              self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
-                                vocab_size,
-                                hidden_size)
-
-           def forward(self, x):
-              x = self.embedding(x)
-              x = self.linear1(x)
-              x = self.linear2(x)
-              x = self.linear3(x)
-              return x
+
+            >>> import paddle
+            >>> from paddle.distributed import fleet
+
+            >>> class SimpleMPNet(paddle.nn.Layer):
+            ...     def __init__(self, vocab_size, hidden_size, inner_size, output_size):
+            ...         super().__init__()
+            ...         self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
+            ...             hidden_size,
+            ...             inner_size,
+            ...             gather_output=False,
+            ...             has_bias=True)
+            ...         self.linear2 = fleet.meta_parallel.RowParallelLinear(
+            ...             inner_size,
+            ...             hidden_size,
+            ...             input_is_parallel=True,
+            ...             has_bias=True)
+            ...         self.linear3 = paddle.nn.Linear(hidden_size, output_size)
+            ...         self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
+            ...                         vocab_size,
+            ...                         hidden_size)
+            ...     def forward(self, x):
+            ...         x = self.embedding(x)
+            ...         x = self.linear1(x)
+            ...         x = self.linear2(x)
+            ...         x = self.linear3(x)
+            ...         return x
     """
 
     def __init__(
@@ -537,36 +532,34 @@ class RowParallelLinear(paddle.nn.Layer):
 
     Examples:
         .. code-block:: python
-        import paddle
-        from paddle.distributed import fleet
-
-        class SimpleMPNet(paddle.nn.Layer):
-           def __init__(self, vocab_size, hidden_size, inner_size, output_size):
-              super().__init__()
-              self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
-                    hidden_size,
-                    inner_size,
-                    gather_output=False,
-                    has_bias=True)
-
-              self.linear2 = fleet.meta_parallel.RowParallelLinear(
-                    inner_size,
-                    hidden_size,
-                    input_is_parallel=True,
-                    has_bias=True)
-
-              self.linear3 = paddle.nn.Linear(hidden_size, output_size)
-
-              self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
-                                vocab_size,
-                                hidden_size)
-
-           def forward(self, x):
-              x = self.embedding(x)
-              x = self.linear1(x)
-              x = self.linear2(x)
-              x = self.linear3(x)
-              return x
+
+            >>> import paddle
+            >>> from paddle.distributed import fleet
+
+            >>> class SimpleMPNet(paddle.nn.Layer):
+            ...     def __init__(self, vocab_size, hidden_size, inner_size, output_size):
+            ...         super().__init__()
+            ...         self.linear1 = fleet.meta_parallel.ColumnParallelLinear(
+            ...             hidden_size,
+            ...             inner_size,
+            ...             gather_output=False,
+            ...             has_bias=True)
+            ...         self.linear2 = fleet.meta_parallel.RowParallelLinear(
+            ...             inner_size,
+            ...             hidden_size,
+            ...             input_is_parallel=True,
+            ...             has_bias=True)
+            ...         self.linear3 = paddle.nn.Linear(hidden_size, output_size)
+            ...         self.embedding = fleet.meta_parallel.VocabParallelEmbedding(
+            ...                         vocab_size,
+            ...                         hidden_size)
+            ...     def forward(self, x):
+            ...         x = self.embedding(x)
+            ...         x = self.linear1(x)
+            ...         x = self.linear2(x)
+            ...         x = self.linear3(x)
+            ...         return x
+
     """
 
     def __init__(
@@ -736,8 +729,12 @@ class ParallelCrossEntropy(paddle.nn.Layer):
 
     Examples:
         .. code-block:: python
-        loss_func = ParallelCrossEntropy()
-        loss = loss_func(img, lable)
+
+            >>> # doctest: +SKIP('No img to demonstrate')
+            >>> from paddle.distributed.fleet.layers.mpu import ParallelCrossEntropy
+            >>> loss_func = ParallelCrossEntropy
+            >>> loss = loss_func(img, lable)
+
     """
 
     def __init__(self, mp_group=None, name=None, ignore_index=-100):
diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py
index 5a726dd5ab141..360c186103565 100644
--- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py
+++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py
@@ -809,19 +809,19 @@ def split(
     Examples:
         .. code-block:: python
 
-            # required: distributed
-            import paddle
-            import paddle.distributed.fleet as fleet
-
-            paddle.enable_static()
-            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
-            fleet.init(is_collective=True)
-            data = paddle.randint(0, 8, shape=[10,4])
-            emb_out = paddle.distributed.split(
-                data,
-                (8, 8),
-                operation="embedding",
-                num_partitions=2)
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> import paddle
+            >>> import paddle.distributed.fleet as fleet
+
+            >>> paddle.enable_static()
+            >>> paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
+            >>> fleet.init(is_collective=True)
+            >>> data = paddle.randint(0, 8, shape=[10,4])
+            >>> emb_out = paddle.distributed.split(
+            ...     data,
+            ...     (8, 8),
+            ...     operation="embedding",
+            ...     num_partitions=2)
 
     """
     assert isinstance(size, (list, tuple)), (
diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
index 4222d80a4e374..26b0c7a12ace7 100755
--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py
@@ -247,68 +247,69 @@ class PipelineLayer(nn.Layer):
         num_virtual_pipeline_stages(int, optional): the num of virtual pipeline stages for interleave pp.
     Examples:
         .. code-block:: python
-        import paddle.nn as nn
-        import paddle.nn.functional as F
-        from paddle.distributed import fleet
-        from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
-
-        pipeline_parallel_size = 2
-        strategy = fleet.DistributedStrategy()
-        strategy.hybrid_configs = {
-            "dp_degree": 1,
-            "mp_degree": 1,
-            "pp_degree": pipeline_parallel_size
-        }
-        strategy.pipeline_configs = {
-            "accumulate_steps": 4,
-            "micro_batch_size": 2
-        }
-
-        fleet.init(is_collective=True, strategy=strategy)
-
-        hcg = fleet.get_hybrid_communicate_group()
-
-        class ReshapeHelp(nn.Layer):
-            def __init__(self, shape):
-                super().__init__()
-                self.shape = shape
-
-            def forward(self, x):
-                return x.reshape(shape=self.shape)
-
-        class AlexNetPipeDesc(PipelineLayer):
-            def __init__(self, num_classes=10, **kwargs):
-                self.num_classes = num_classes
-                decs = [
-                    LayerDesc(
-                        nn.Conv2D, 1, 64, kernel_size=11, stride=4, padding=5),
-                    LayerDesc(nn.ReLU),
-                    LayerDesc(
-                        nn.MaxPool2D, kernel_size=2, stride=2),
-                    LayerDesc(
-                        nn.Conv2D, 64, 192, kernel_size=5, padding=2),
-                    F.relu,
-                    LayerDesc(
-                        nn.MaxPool2D, kernel_size=2, stride=2),
-                    LayerDesc(
-                        nn.Conv2D, 192, 384, kernel_size=3, padding=1),
-                    F.relu,
-                    LayerDesc(
-                        nn.Conv2D, 384, 256, kernel_size=3, padding=1),
-                    F.relu,
-                    LayerDesc(
-                        nn.Conv2D, 256, 256, kernel_size=3, padding=1),
-                    F.relu,
-                    LayerDesc(
-                        nn.MaxPool2D, kernel_size=2, stride=2),
-                    LayerDesc(
-                        ReshapeHelp, shape=[-1, 256]),
-                    LayerDesc(nn.Linear, 256, self.num_classes),  # classifier
-                ]
-                super().__init__(
-                    layers=decs, loss_fn=nn.CrossEntropyLoss(), **kwargs)
 
-        model = AlexNetPipeDesc(num_stages=pipeline_parallel_size, topology=hcg._topo)
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> import paddle.nn as nn
+            >>> import paddle.nn.functional as F
+            >>> from paddle.distributed import fleet
+            >>> from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer
+
+            >>> pipeline_parallel_size = 2
+            >>> strategy = fleet.DistributedStrategy()
+            >>> strategy.hybrid_configs = {
+            ...     "dp_degree": 1,
+            ...     "mp_degree": 1,
+            ...     "pp_degree": pipeline_parallel_size
+            >>> }
+            >>> strategy.pipeline_configs = {
+            ...     "accumulate_steps": 4,
+            ...     "micro_batch_size": 2
+            >>> }
+
+            >>> fleet.init(is_collective=True, strategy=strategy)
+
+            >>> hcg = fleet.get_hybrid_communicate_group()
+
+            >>> class ReshapeHelp(nn.Layer):
+            ...     def __init__(self, shape):
+            ...         super().__init__()
+            ...         self.shape = shape
+            ...     def forward(self, x):
+            ...         return x.reshape(shape=self.shape)
+
+            >>> class AlexNetPipeDesc(PipelineLayer):
+            ...     def __init__(self, num_classes=10, **kwargs):
+            ...         self.num_classes = num_classes
+            ...         decs = [
+            ...             LayerDesc(
+            ...                 nn.Conv2D, 1, 64, kernel_size=11, stride=4, padding=5),
+            ...             LayerDesc(nn.ReLU),
+            ...             LayerDesc(
+            ...                 nn.MaxPool2D, kernel_size=2, stride=2),
+            ...             LayerDesc(
+            ...                 nn.Conv2D, 64, 192, kernel_size=5, padding=2),
+            ...             F.relu,
+            ...             LayerDesc(
+            ...                 nn.MaxPool2D, kernel_size=2, stride=2),
+            ...             LayerDesc(
+            ...                 nn.Conv2D, 192, 384, kernel_size=3, padding=1),
+            ...             F.relu,
+            ...             LayerDesc(
+            ...                 nn.Conv2D, 384, 256, kernel_size=3, padding=1),
+            ...             F.relu,
+            ...             LayerDesc(
+            ...                 nn.Conv2D, 256, 256, kernel_size=3, padding=1),
+            ...             F.relu,
+            ...             LayerDesc(
+            ...                 nn.MaxPool2D, kernel_size=2, stride=2),
+            ...             LayerDesc(
+            ...                 ReshapeHelp, shape=[-1, 256]),
+            ...             LayerDesc(nn.Linear, 256, self.num_classes),  # classifier
+            ...         ]
+            ...         super().__init__(
+            ...             layers=decs, loss_fn=nn.CrossEntropyLoss(), **kwargs)
+
+            >>> model = AlexNetPipeDesc(num_stages=pipeline_parallel_size, topology=hcg._topo)
 
     """
 
diff --git a/python/paddle/distributed/fleet/metrics/metric.py b/python/paddle/distributed/fleet/metrics/metric.py
index 0d744d17cdd4a..746cb4d817aa1 100644
--- a/python/paddle/distributed/fleet/metrics/metric.py
+++ b/python/paddle/distributed/fleet/metrics/metric.py
@@ -37,16 +37,17 @@ def sum(input, scope=None, util=None):
     Example:
         .. code-block:: python
 
-          # in model.py
-          input = paddle.cast(some_input, dtype='float32')
-          cnt = paddle.sum(input)
-          global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[], value=0)
-          tmp = paddle.add(cnt, global_cnt)
-          paddle.assign(tmp, global_cnt)
-
-          # in train.py, after train or infer
-          res = np.array(scope.find_var(global_cnt.name).get_tensor())
-          print("sum array: ", paddle.distributed.fleet.sum(res))
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> # in model.py
+            >>> input = paddle.cast(some_input, dtype='float32')
+            >>> cnt = paddle.sum(input)
+            >>> global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[], value=0)
+            >>> tmp = paddle.add(cnt, global_cnt)
+            >>> paddle.assign(tmp, global_cnt)
+
+            >>> # in train.py, after train or infer
+            >>> res = np.array(scope.find_var(global_cnt.name).get_tensor())
+            >>> print("sum array: ", paddle.distributed.fleet.sum(res))
     """
     if scope is None:
         scope = paddle.static.global_scope()
@@ -77,16 +78,17 @@ def max(input, scope=None, util=None):
     Example:
         .. code-block:: python
 
-          # in model.py
-          input = paddle.cast(some_input, dtype='float32')
-          cnt = paddle.sum(input)
-          global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[], value=0)
-          tmp = paddle.maximum(cnt, global_cnt)
-          paddle.assign(tmp, global_cnt)
-
-          # in train.py, after train or infer
-          res = np.array(scope.find_var(global_cnt.name).get_tensor())
-          print("max array: ", paddle.distributed.fleet.max(res))
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> # in model.py
+            >>> input = paddle.cast(some_input, dtype='float32')
+            >>> cnt = paddle.sum(input)
+            >>> global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[], value=0)
+            >>> tmp = paddle.maximum(cnt, global_cnt)
+            >>> paddle.assign(tmp, global_cnt)
+
+            >>> # in train.py, after train or infer
+            >>> res = np.array(scope.find_var(global_cnt.name).get_tensor())
+            >>> print("max array: ", paddle.distributed.fleet.max(res))
     """
     if scope is None:
         scope = paddle.static.global_scope()
@@ -117,16 +119,17 @@ def min(input, scope=None, util=None):
     Example:
         .. code-block:: python
 
-          # in model.py
-          input = paddle.cast(some_input, dtype='float32')
-          cnt = paddle.sum(input)
-          global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[], value=0)
-          tmp = paddle.minimum(cnt, global_cnt)
-          paddle.assign(tmp, global_cnt)
-
-          # in train.py, after train or infer
-          res = np.array(scope.find_var(global_cnt.name).get_tensor())
-          print("min array: ", paddle.distributed.fleet.min(res))
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> # in model.py
+            >>> input = paddle.cast(some_input, dtype='float32')
+            >>> cnt = paddle.sum(input)
+            >>> global_cnt = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[], value=0)
+            >>> tmp = paddle.minimum(cnt, global_cnt)
+            >>> paddle.assign(tmp, global_cnt)
+
+            >>> # in train.py, after train or infer
+            >>> res = np.array(scope.find_var(global_cnt.name).get_tensor())
+            >>> print("min array: ", paddle.distributed.fleet.min(res))
     """
     if scope is None:
         scope = paddle.static.global_scope()
@@ -158,17 +161,18 @@ def auc(stat_pos, stat_neg, scope=None, util=None):
     Example:
         .. code-block:: python
 
-          # in model.py
-          similarity_norm = paddle.nn.functional.sigmoid(paddle.clip(output, min=-15.0, max=15.0))
-          binary_predict = paddle.concat(
-              input=[paddle.subtract(paddle.ceil(similarity_norm), similarity_norm), similarity_norm], axis=1)
-          self.auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] =
-              paddle.static.auc(input=binary_predict, label=label, curve='ROC', num_thresholds=4096)
-
-          # in train.py, after train or infer
-          pos = np.array(scope.find_var(stat_pos.name).get_tensor())
-          neg = np.array(scope.find_var(stat_neg.name).get_tensor())
-          print("auc: ", paddle.distributed.fleet.auc(pos, neg))
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> # in model.py
+            >>> similarity_norm = paddle.nn.functional.sigmoid(paddle.clip(output, min=-15.0, max=15.0))
+            >>> binary_predict = paddle.concat(
+            ...     input=[paddle.subtract(paddle.ceil(similarity_norm), similarity_norm), similarity_norm], axis=1)
+            >>> self.auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] =
+            ...     paddle.static.auc(input=binary_predict, label=label, curve='ROC', num_thresholds=4096)
+
+            >>> # in train.py, after train or infer
+            >>> pos = np.array(scope.find_var(stat_pos.name).get_tensor())
+            >>> neg = np.array(scope.find_var(stat_neg.name).get_tensor())
+            >>> print("auc: ", paddle.distributed.fleet.auc(pos, neg))
     """
     if scope is None:
         scope = paddle.static.global_scope()
@@ -241,12 +245,13 @@ def mae(abserr, total_ins_num, scope=None, util=None):
     Example:
         .. code-block:: python
 
-          # in model.py
-          sqrerr, abserr, prob, q, pos, total = paddle.static.ctr_metric_bundle(similarity_norm, paddle.cast(x=label, dtype='float32'))
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> # in model.py
+            >>> sqrerr, abserr, prob, q, pos, total = paddle.static.ctr_metric_bundle(similarity_norm, paddle.cast(x=label, dtype='float32'))
 
-          # in train.py, after train or infer
-          res = np.array(scope.find_var(abserr.name).get_tensor())
-          print("mae: ", paddle.distributed.fleet.mae(res, total_ins_num))
+            >>> # in train.py, after train or infer
+            >>> res = np.array(scope.find_var(abserr.name).get_tensor())
+            >>> print("mae: ", paddle.distributed.fleet.mae(res, total_ins_num))
     """
     if scope is None:
         scope = paddle.static.global_scope()
@@ -291,12 +296,13 @@ def rmse(sqrerr, total_ins_num, scope=None, util=None):
     Example:
         .. code-block:: python
 
-          # in model.py
-          sqrerr, abserr, prob, q, pos, total = paddle.static.ctr_metric_bundle(similarity_norm, paddle.cast(x=label, dtype='float32'))
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> # in model.py
+            >>> sqrerr, abserr, prob, q, pos, total = paddle.static.ctr_metric_bundle(similarity_norm, paddle.cast(x=label, dtype='float32'))
 
-          # in train.py, after train or infer
-          res = np.array(scope.find_var(sqrerr.name).get_tensor())
-          print("rmse: ", paddle.distributed.fleet.rmse(res, total_ins_num))
+            >>> # in train.py, after train or infer
+            >>> res = np.array(scope.find_var(sqrerr.name).get_tensor())
+            >>> print("rmse: ", paddle.distributed.fleet.rmse(res, total_ins_num))
     """
     if scope is None:
         scope = paddle.static.global_scope()
@@ -341,12 +347,13 @@ def mse(sqrerr, total_ins_num, scope=None, util=None):
     Example:
         .. code-block:: python
 
-          # in model.py
-          sqrerr, abserr, prob, q, pos, total = paddle.static.ctr_metric_bundle(similarity_norm, paddle.cast(x=label, dtype='float32'))
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> # in model.py
+            >>> sqrerr, abserr, prob, q, pos, total = paddle.static.ctr_metric_bundle(similarity_norm, paddle.cast(x=label, dtype='float32'))
 
-          # in train.py, after train or infer
-          metric = np.array(scope.find_var(sqrerr.name).get_tensor())
-          print("mse: ", paddle.distributed.fleet.mse(metric, total_ins_num))
+            >>> # in train.py, after train or infer
+            >>> metric = np.array(scope.find_var(sqrerr.name).get_tensor())
+            >>> print("mse: ", paddle.distributed.fleet.mse(metric, total_ins_num))
     """
     if scope is None:
         scope = paddle.static.global_scope()
@@ -390,23 +397,24 @@ def acc(correct, total, scope=None, util=None):
     Example:
         .. code-block:: python
 
-          # in model.py
-          correct = paddle.static.create_global_var(dtype='float32', shape=[1], value=0)
-          total = paddle.static.create_global_var(dtype='float32', shape=[1], value=0)
-          acc = paddle.metric.accuracy(predict, label, k=1, correct=correct, total=total)
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> # in model.py
+            >>> correct = paddle.static.create_global_var(dtype='float32', shape=[1], value=0)
+            >>> total = paddle.static.create_global_var(dtype='float32', shape=[1], value=0)
+            >>> acc = paddle.metric.accuracy(predict, label, k=1, correct=correct, total=total)
 
-          global_correct = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[1], value=0)
-          tmp1 = paddle.minimum(correct, global_correct)
-          paddle.assign(tmp1, global_correct)
+            >>> global_correct = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[1], value=0)
+            >>> tmp1 = paddle.minimum(correct, global_correct)
+            >>> paddle.assign(tmp1, global_correct)
 
-          global_total = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[1], value=0)
-          tmp2 = paddle.minimum(total, global_total)
-          paddle.assign(tmp2, global_total)
+            >>> global_total = paddle.static.create_global_var(persistable=True, dtype='float32', shape=[1], value=0)
+            >>> tmp2 = paddle.minimum(total, global_total)
+            >>> paddle.assign(tmp2, global_total)
 
-          # in train.py, after train or infer
-          correct_num = np.array(scope.find_var(correct.name).get_tensor())
-          total_num = np.array(scope.find_var(total.name).get_tensor())
-          print("accuracy: ", paddle.distributed.fleet.acc(correct_num, total_num))
+            >>> # in train.py, after train or infer
+            >>> correct_num = np.array(scope.find_var(correct.name).get_tensor())
+            >>> total_num = np.array(scope.find_var(total.name).get_tensor())
+            >>> print("accuracy: ", paddle.distributed.fleet.acc(correct_num, total_num))
     """
     if scope is None:
         scope = paddle.static.global_scope()

From aca219475dfcb0b23325378375c859fb5d3ff7d4 Mon Sep 17 00:00:00 2001
From: Android zhang <53324261+zade23@users.noreply.github.com>
Date: Mon, 25 Sep 2023 10:44:55 +0800
Subject: [PATCH 090/115]  [Docathon] Fix NO.8-NO.11 API label (#57614)

---
 python/paddle/base/layers/math_op_patch.py        | 2 +-
 python/paddle/incubate/optimizer/lars_momentum.py | 4 ++--
 python/paddle/incubate/optimizer/lbfgs.py         | 4 ++--
 python/paddle/nn/clip.py                          | 8 ++++----
 python/paddle/optimizer/adadelta.py               | 4 ++--
 python/paddle/optimizer/adam.py                   | 4 ++--
 python/paddle/optimizer/adamax.py                 | 4 ++--
 python/paddle/optimizer/adamw.py                  | 4 ++--
 python/paddle/optimizer/lbfgs.py                  | 4 ++--
 python/paddle/optimizer/momentum.py               | 4 ++--
 python/paddle/optimizer/optimizer.py              | 4 ++--
 python/paddle/optimizer/rmsprop.py                | 4 ++--
 python/paddle/optimizer/sgd.py                    | 4 ++--
 13 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py
index cba6b9a3b55de..ba327411264ea 100644
--- a/python/paddle/base/layers/math_op_patch.py
+++ b/python/paddle/base/layers/math_op_patch.py
@@ -241,7 +241,7 @@ def place(self):
     def astype(self, dtype):
         """
         **Notes**:
-            **The variable must be a** :ref:`api_base_Tensor`
+            **The variable must be a** :ref:`api_paddle_Tensor`
 
         Cast a variable to a specified data type.
 
diff --git a/python/paddle/incubate/optimizer/lars_momentum.py b/python/paddle/incubate/optimizer/lars_momentum.py
index 1c6ecc263e6f6..088721b929ee7 100644
--- a/python/paddle/incubate/optimizer/lars_momentum.py
+++ b/python/paddle/incubate/optimizer/lars_momentum.py
@@ -50,8 +50,8 @@ class LarsMomentumOptimizer(Optimizer):
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three cliping strategies
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
-            :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): This parameter is used by developers to print debugging information. \
             For details, please refer to :ref:`api_guide_Name`. Default is None.
         exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None.
diff --git a/python/paddle/incubate/optimizer/lbfgs.py b/python/paddle/incubate/optimizer/lbfgs.py
index 137b8eb7ccbdc..1e0d959f5ecc5 100644
--- a/python/paddle/incubate/optimizer/lbfgs.py
+++ b/python/paddle/incubate/optimizer/lbfgs.py
@@ -64,8 +64,8 @@ class LBFGS(Optimizer):
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \
             some derived class of ``GradientClipBase`` . There are three cliping strategies \
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` , \
-            :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name`.
             The default value is None.
diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py
index e262401fd15e5..5fda0adff5efa 100644
--- a/python/paddle/nn/clip.py
+++ b/python/paddle/nn/clip.py
@@ -950,16 +950,16 @@ def set_gradient_clip(clip, param_list=None, program=None):
         and it may be removed in future releases, so it is not recommended.
         It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
         this is a better method to clip gradient. There are three clipping strategies:
-         :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
-         :ref:`api_base_clip_GradientClipByValue` .
+         :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
+         :ref:`api_paddle_nn_ClipGradByValue` .
 
     To specify parameters that require gradient clip.
 
     Args:
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three cliping strategies
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
-            :ref:`api_base_clip_GradientClipByValue` ). Default value: None, and there is no
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default value: None, and there is no
             gradient clipping.
         param_list (list(Variable), optional): Parameters that require gradient clip.
                 It can be a list of parameter or a list of parameter's name.
diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py
index ae8e5d2dc6b26..d2a572fefb91d 100644
--- a/python/paddle/optimizer/adadelta.py
+++ b/python/paddle/optimizer/adadelta.py
@@ -61,8 +61,8 @@ class Adadelta(Optimizer):
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three cliping strategies
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
-            :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): The default value is None. Normally there is no need for user
                 to set this property. For more information, please refer to
                 :ref:`api_guide_Name` .
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 12e932c6fb218..a876b23cafac6 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -79,8 +79,8 @@ class Adam(Optimizer):
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three cliping strategies
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
-            :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
             The accumulators are updated at every step. Every element of the two moving-average
             is updated in both dense mode and sparse mode. If the size of parameter is very large,
diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py
index 354c5a9bb531a..68b92819f3680 100644
--- a/python/paddle/optimizer/adamax.py
+++ b/python/paddle/optimizer/adamax.py
@@ -74,8 +74,8 @@ class Adamax(Optimizer):
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three clipping strategies
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
-            :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name`.
             The default value is None.
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 6336f1914280f..ae746581cca79 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -77,8 +77,8 @@ class AdamW(Optimizer):
             Default: None.
         grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three clipping strategies
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
-            :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
             The accumulators are updated at every step. Every element of the two moving-average
             is updated in both dense mode and sparse mode. If the size of parameter is very large,
diff --git a/python/paddle/optimizer/lbfgs.py b/python/paddle/optimizer/lbfgs.py
index c2f9cb6b52263..4f36dab76e160 100644
--- a/python/paddle/optimizer/lbfgs.py
+++ b/python/paddle/optimizer/lbfgs.py
@@ -346,8 +346,8 @@ class LBFGS(Optimizer):
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \
             some derived class of ``GradientClipBase`` . There are three cliping strategies \
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` , \
-            :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name`.
             The default value is None.
diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py
index 5dd0a424778bb..2a8c095fd6adb 100644
--- a/python/paddle/optimizer/momentum.py
+++ b/python/paddle/optimizer/momentum.py
@@ -66,8 +66,8 @@ class Momentum(Optimizer):
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three clipping strategies
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
-            :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false.
         rescale_grad (float, optional): Multiply the gradient with `rescale_grad` before updating. \
             Often choose to be ``1.0/batch_size``.
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index d529b605d8447..d10adbce4d70a 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -115,8 +115,8 @@ class Optimizer:
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of \
             some derived class of ``GradientClipBase`` . There are three cliping strategies \
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` , \
-            :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` , \
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name`.
             The default value is None.
diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py
index 07bb27b46e6b3..2ce94f3471173 100644
--- a/python/paddle/optimizer/rmsprop.py
+++ b/python/paddle/optimizer/rmsprop.py
@@ -98,8 +98,8 @@ class RMSProp(Optimizer):
           Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
           some derived class of ``GradientClipBase`` . There are three clipping strategies
-          ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
-          :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+          ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
+          :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): This parameter is used by developers to print debugging information.
           For details, please refer to :ref:`api_guide_Name`. Default is None.
 
diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py
index e0edcbfc0e395..56c2c3ae19eb8 100644
--- a/python/paddle/optimizer/sgd.py
+++ b/python/paddle/optimizer/sgd.py
@@ -47,8 +47,8 @@ class SGD(Optimizer):
             Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three clipping strategies
-            ( :ref:`api_base_clip_GradientClipByGlobalNorm` , :ref:`api_base_clip_GradientClipByNorm` ,
-            :ref:`api_base_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
+            ( :ref:`api_paddle_nn_ClipGradByGlobalNorm` , :ref:`api_paddle_nn_ClipGradByNorm` ,
+            :ref:`api_paddle_nn_ClipGradByValue` ). Default None, meaning there is no gradient clipping.
         name (str, optional): The default value is None. Normally there is no need for user
                 to set this property. For more information, please refer to
                 :ref:`api_guide_Name` .

From c0e5e95ac82a908e4c8f73ba1952f35a3d5ae1f8 Mon Sep 17 00:00:00 2001
From: Android zhang <53324261+zade23@users.noreply.github.com>
Date: Mon, 25 Sep 2023 10:46:21 +0800
Subject: [PATCH 091/115] [CodeStyle][task 22] enable Ruff C417 rule in
 `python/paddle/base` (#57634)

* fix Ruff_C417

* remove pyproject.toml/C417
---
 pyproject.toml                          |  1 -
 python/paddle/base/backward.py          | 14 ++------------
 python/paddle/static/nn/control_flow.py |  4 +---
 test/legacy_test/test_lstm_cudnn_op.py  |  2 +-
 test/rnn/rnn_numpy.py                   |  2 +-
 5 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b62b503c06e96..088841222c7d0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -109,7 +109,6 @@ ignore = [
     "C408",
     "UP030",
     "C405",
-    "C417",
     "B004",
     "B009",
     "B016",
diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py
index 1c3c3a3f202ed..2bc6f4fd13dcf 100755
--- a/python/paddle/base/backward.py
+++ b/python/paddle/base/backward.py
@@ -345,22 +345,12 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
     for para, args in inputs.items():
         op_desc.set_input(
             para,
-            list(
-                map(
-                    lambda arg: arg.decode() if isinstance(arg, bytes) else arg,
-                    args,
-                )
-            ),
+            [arg.decode() if isinstance(arg, bytes) else arg for arg in args],
         )
     for para, args in outputs.items():
         op_desc.set_output(
             para,
-            list(
-                map(
-                    lambda arg: arg.decode() if isinstance(arg, bytes) else arg,
-                    args,
-                )
-            ),
+            [arg.decode() if isinstance(arg, bytes) else arg for arg in args],
         )
     op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
     op_device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
diff --git a/python/paddle/static/nn/control_flow.py b/python/paddle/static/nn/control_flow.py
index 0d0afcc71c150..e5603f733883c 100644
--- a/python/paddle/static/nn/control_flow.py
+++ b/python/paddle/static/nn/control_flow.py
@@ -1422,9 +1422,7 @@ def _select_input_infer_shape(first_shape, second_shape):
             f"the input shapes of select_input should have the same rank, but get {first_shape}, {second_shape}"
         )
         return second_shape
-    out_shape = list(
-        map(lambda a, b: a if a == b else -1, first_shape, second_shape)
-    )
+    out_shape = [a if a == b else -1 for a, b in zip(first_shape, second_shape)]
     return out_shape
 
 
diff --git a/test/legacy_test/test_lstm_cudnn_op.py b/test/legacy_test/test_lstm_cudnn_op.py
index 2d61b7c8f9a2d..ade1f61c0d5a9 100644
--- a/test/legacy_test/test_lstm_cudnn_op.py
+++ b/test/legacy_test/test_lstm_cudnn_op.py
@@ -134,7 +134,7 @@ def update_state(mask, new, old):
     if not isinstance(old, (tuple, list)):
         return np.where(mask, new, old)
     else:
-        return tuple(map(lambda x, y: np.where(mask, x, y), new, old))
+        return tuple(np.where(mask, x, y) for x, y in zip(new, old))
 
 
 def rnn(
diff --git a/test/rnn/rnn_numpy.py b/test/rnn/rnn_numpy.py
index d1a7ccbf02ecb..f303b460cdb9d 100644
--- a/test/rnn/rnn_numpy.py
+++ b/test/rnn/rnn_numpy.py
@@ -211,7 +211,7 @@ def update_state(mask, new, old):
     if not isinstance(old, (tuple, list)):
         return np.where(mask, new, old)
     else:
-        return tuple(map(lambda x, y: np.where(mask, x, y), new, old))
+        return tuple(np.where(mask, x, y) for x, y in zip(new, old))
 
 
 def rnn(

From d10e67073351a4409601dc45541850b623352145 Mon Sep 17 00:00:00 2001
From: megemini <megemini@outlook.com>
Date: Mon, 25 Sep 2023 10:49:32 +0800
Subject: [PATCH 092/115] [xdoctest][task cleanup 1-8] reformat example code
 with google style (#57670)

* [Change] xdoctest part 1

* [Change] fix code-block

* [Fix] fix load_from_prototxt

* Apply suggestions from code review

---------

Co-authored-by: Nyakku Shigure <sigure.qaq@gmail.com>
---
 paddle/fluid/pybind/eager_properties.cc       | 183 +++--
 python/paddle/base/dataset.py                 |   9 +-
 python/paddle/base/framework.py               |  50 +-
 python/paddle/base/reader.py                  | 386 ++++++-----
 python/paddle/decomposition/register.py       |  13 +-
 .../paddle/distributed/auto_parallel/api.py   |  70 +-
 .../fleet/base/distributed_strategy.py        | 643 +++++++++---------
 .../distributed/fleet/base/role_maker.py      |  34 +-
 8 files changed, 724 insertions(+), 664 deletions(-)

diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc
index 517c210830022..0ab91d632e4fc 100644
--- a/paddle/fluid/pybind/eager_properties.cc
+++ b/paddle/fluid/pybind/eager_properties.cc
@@ -51,12 +51,14 @@ Tensor's name.
 Examples:
     .. code-block:: python
 
-        import paddle
-
-        x = paddle.to_tensor(1.)
-        print(x.name)  # generated_tensor_0
-        x.name = 'test_tensor_name'
-        print(x.name)  # test_tensor_name
+        >>> import paddle
+
+        >>> x = paddle.to_tensor(1.)
+        >>> print(x.name)
+        generated_tensor_0
+        >>> x.name = 'test_tensor_name'
+        >>> print(x.name)
+        test_tensor_name
 )DOC");
 
 PyObject* tensor_properties_get_name(TensorObject* self, void* closure) {
@@ -84,10 +86,11 @@ Tensor's type.
 Examples:
     .. code-block:: python
 
-        import paddle
+        >>> import paddle
 
-        x = paddle.to_tensor(1.)
-        print(x.type) # VarType.LOD_TENSOR
+        >>> x = paddle.to_tensor(1.)
+        >>> print(x.type)
+        VarType.LOD_TENSOR
 )DOC");
 
 PyObject* tensor_properties_get_type(TensorObject* self, void* closure) {
@@ -123,20 +126,27 @@ For the Tensor whose stop_gradient is ``False`` , it will be leaf Tensor too if
 Examples:
     .. code-block:: python
 
-        import paddle
+        >>> import paddle
+
+        >>> x = paddle.to_tensor(1.)
+        >>> print(x.is_leaf)
+        True
+
+        >>> x = paddle.to_tensor(1., stop_gradient=True)
+        >>> y = x + 1
+        >>> print(x.is_leaf)
+        True
 
-        x = paddle.to_tensor(1.)
-        print(x.is_leaf) # True
+        >>> print(y.is_leaf)
+        True
 
-        x = paddle.to_tensor(1., stop_gradient=True)
-        y = x + 1
-        print(x.is_leaf) # True
-        print(y.is_leaf) # True
+        >>> x = paddle.to_tensor(1., stop_gradient=False)
+        >>> y = x + 1
+        >>> print(x.is_leaf)
+        True
 
-        x = paddle.to_tensor(1., stop_gradient=False)
-        y = x + 1
-        print(x.is_leaf) # True
-        print(y.is_leaf) # False
+        >>> print(y.is_leaf)
+        False
 )DOC");
 
 PyObject* tensor_properties_is_leaf(TensorObject* self, void* closure) {
@@ -165,12 +175,15 @@ Tensor's stop_gradient.
 Examples:
     .. code-block:: python
 
-        import paddle
+        >>> import paddle
 
-        x = paddle.to_tensor(1.)
-        print(x.stop_gradient) # True
-        x.stop_gradient = False
-        print(x.stop_gradient) # False
+        >>> x = paddle.to_tensor(1.)
+        >>> print(x.stop_gradient)
+        True
+
+        >>> x.stop_gradient = False
+        >>> print(x.stop_gradient)
+        False
 )DOC");
 
 PyObject* tensor_properties_get_stop_gradient(TensorObject* self,
@@ -192,14 +205,25 @@ Tensor's self.
 Examples:
     .. code-block:: python
 
-        import paddle
+        >>> import paddle
+
+        >>> x = paddle.to_tensor(1.)
+        >>> print(x)
+        Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+        1.)
 
-        x = paddle.to_tensor(1.)
-        print(x)
-        print(x.data)
-        x.data = paddle.to_tensor(2.)
-        print(x)
-        print(x.data)
+        >>> print(x.data)
+        Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+        1.)
+
+        >>> x.data = paddle.to_tensor(2.)
+        >>> print(x)
+        Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+        2.)
+
+        >>> print(x.data)
+        Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=True,
+        2.)
 )DOC");
 PyObject* tensor_properties_get_data(TensorObject* self, void* closure) {
   EAGER_TRY
@@ -234,14 +258,19 @@ Tensor's grad Tensor.
 Examples:
     .. code-block:: python
 
-      import paddle
+        >>> import paddle
 
-      x = paddle.to_tensor(1.0, stop_gradient=False)
-      y = x**2
-      y.backward()
-      print(x.grad)
-      x.grad = paddle.to_tensor(3.0)
-      print(x.grad)
+        >>> x = paddle.to_tensor(1.0, stop_gradient=False)
+        >>> y = x**2
+        >>> y.backward()
+        >>> print(x.grad)
+        Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=False,
+        2.)
+
+        >>> x.grad = paddle.to_tensor(3.0)
+        >>> print(x.grad)
+        Tensor(shape=[], dtype=float32, place=Place(cpu), stop_gradient=False,
+        3.)
 )DOC");
 PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) {
   EAGER_TRY
@@ -320,12 +349,15 @@ Tensor's persistable.
 Examples:
     .. code-block:: python
 
-      import paddle
+        >>> import paddle
+
+        >>> x = paddle.to_tensor(1.0, stop_gradient=False)
+        >>> print(x.persistable)
+        False
 
-      x = paddle.to_tensor(1.0, stop_gradient=False)
-      print(x.persistable) # False
-      x. persistable = True
-      print(x.persistable) # True
+        >>> x. persistable = True
+        >>> print(x.persistable)
+        True
 )DOC");
 
 PyObject* tensor_properties_get_persistable(TensorObject* self, void* closure) {
@@ -356,17 +388,18 @@ Get dist_attr property from shard tensor.
 Examples:
     .. code-block:: python
 
-        import paddle
-        import paddle.distributed as dist
+        >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+        >>> import paddle
+        >>> import paddle.distributed as dist
 
-        mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"])
-        dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y'])
+        >>> mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"])
+        >>> dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y'])
 
-        a = paddle.to_tensor([[1,2,3],
-                              [5,6,7]])
-        d_tensor = dist.shard_tensor(a, dist_attr=dist_attr)
+        >>> a = paddle.to_tensor([[1,2,3],
+        ...                       [5,6,7]])
+        >>> d_tensor = dist.shard_tensor(a, dist_attr=dist_attr)
 
-        print(d_tensor.dist_attr)
+        >>> print(d_tensor.dist_attr)
 
 )DOC");
 
@@ -421,10 +454,11 @@ Tensor's shape.
 Examples:
     .. code-block:: python
 
-      import paddle
+        >>> import paddle
 
-      x = paddle.to_tensor(1.0, stop_gradient=False)
-      print(x.shape)
+        >>> x = paddle.to_tensor(1.0, stop_gradient=False)
+        >>> print(x.shape)
+        []
 )DOC");
 
 PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) {
@@ -507,11 +541,12 @@ Tensor's strides.
 Examples:
     .. code-block:: python
 
-      import paddle
+        >>> import paddle
 
-      x = paddle.to_tensor([1, 2, 3])
-      y = x[1]
-      print(y.strides)
+        >>> x = paddle.to_tensor([1, 2, 3])
+        >>> y = x[1]
+        >>> print(y.strides)
+        []
 )DOC");
 
 PyObject* tensor_properties_get_strides(TensorObject* self, void* closure) {
@@ -544,11 +579,12 @@ The address of the first element relative to the offset of the video memory.
 Examples:
     .. code-block:: python
 
-      import paddle
+        >>> import paddle
 
-      x = paddle.to_tensor([1, 2, 3])
-      y = x[1]
-      print(y.offset)
+        >>> x = paddle.to_tensor([1, 2, 3])
+        >>> y = x[1]
+        >>> print(y.offset)
+        8
 )DOC");
 PyObject* tensor_properties_get_offset(TensorObject* self, void* closure) {
   EAGER_TRY
@@ -579,10 +615,11 @@ Tensor's memory layout.
 Examples:
     .. code-block:: python
 
-      import paddle
+        >>> import paddle
 
-      x = paddle.to_tensor([1, 2, 3])
-      print(x.layout)
+        >>> x = paddle.to_tensor([1, 2, 3])
+        >>> print(x.layout)
+        NCHW
 )DOC");
 PyObject* tensor_properties_get_layout(TensorObject* self, void* closure) {
   EAGER_TRY
@@ -613,10 +650,11 @@ The device Tensor's memory locate.
 Examples:
     .. code-block:: python
 
-      import paddle
+        >>> import paddle
 
-      x = paddle.to_tensor([1, 2, 3])
-      print(x.place)
+        >>> x = paddle.to_tensor([1, 2, 3])
+        >>> print(x.place)
+        Place(cpu)
 )DOC");
 PyObject* tensor_properties_get_place(TensorObject* self, void* closure) {
   EAGER_TRY
@@ -643,10 +681,11 @@ Tensor's data type.
 Examples:
     .. code-block:: python
 
-      import paddle
+        >>> import paddle
 
-      x = paddle.to_tensor([1, 2, 3])
-      print(x.dtype)
+        >>> x = paddle.to_tensor([1, 2, 3])
+        >>> print(x.dtype)
+        paddle.int64
 )DOC");
 PyObject* tensor_properties_get_dtype(TensorObject* self, void* closure) {
   EAGER_TRY
diff --git a/python/paddle/base/dataset.py b/python/paddle/base/dataset.py
index c15f6e8e6e68a..252ac2803be66 100644
--- a/python/paddle/base/dataset.py
+++ b/python/paddle/base/dataset.py
@@ -145,9 +145,9 @@ def set_fea_eval(self, record_candidate_size, fea_eval=True):
         Examples:
             .. code-block:: python
 
-            import paddle.base as base
-            dataset = base.DatasetFactory().create_dataset("InMemoryDataset")
-            dataset.set_fea_eval(1000000, True)
+                >>> import paddle.base as base
+                >>> dataset = base.DatasetFactory().create_dataset("InMemoryDataset")
+                >>> dataset.set_fea_eval(1000000, True)
 
         """
         if fea_eval:
@@ -1089,7 +1089,6 @@ def set_graph_config(self, config):
         Examples:
             .. code-block:: python
 
-                >>> # doctest: +SKIP
                 >>> import paddle.base as base
                 >>> from paddle.incubate.distributed.fleet.parameter_server.pslib import fleet
                 >>> dataset = base.DatasetFactory().create_dataset("InMemoryDataset")
@@ -1441,7 +1440,7 @@ def slots_shuffle(self, slots):
             .. code-block:: python
 
                 >>> import paddle.base as base
-                >>> dataset = base.DatasetFactory().create_dataset("InMemoryDataset")
+                >>> dataset = base.DatasetFactory().create_dataset("BoxPSDataset")
                 >>> dataset.set_merge_by_lineid()
                 >>> #suppose there is a slot 0
                 >>> dataset.slots_shuffle(['0'])
diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py
index 26cfadb44216d..d6ec848283fc8 100644
--- a/python/paddle/base/framework.py
+++ b/python/paddle/base/framework.py
@@ -997,10 +997,11 @@ def cuda_pinned_places(device_count=None):
     Examples:
         .. code-block:: python
 
-            import paddle.base as base
-            cuda_pinned_places_cpu_num = base.cuda_pinned_places()
-            # or
-            cuda_pinned_places = base.cuda_pinned_places(1)
+            >>> # doctest: +REQUIRES(env:GPU)
+            >>> import paddle.base as base
+            >>> cuda_pinned_places_cpu_num = base.cuda_pinned_places()
+            >>> # or
+            >>> cuda_pinned_places = base.cuda_pinned_places(1)
 
     """
     assert core.is_compiled_with_cuda(), "Not compiled with CUDA"
@@ -1929,6 +1930,7 @@ def stop_gradient(self):
         Examples:
             .. code-block:: python
 
+                >>> import paddle
                 >>> import paddle.base as base
                 >>> import numpy as np
 
@@ -1936,18 +1938,18 @@ def stop_gradient(self):
                 ...     value0 = np.arange(26).reshape(2, 13).astype("float32")
                 ...     value1 = np.arange(6).reshape(2, 3).astype("float32")
                 ...     value2 = np.arange(10).reshape(2, 5).astype("float32")
-                ...     linear = base.Linear(13, 5, dtype="float32")
-                ...     linear2 = base.Linear(3, 3, dtype="float32")
+                ...     linear = paddle.nn.Linear(13, 5)
+                ...     linear2 = paddle.nn.Linear(3, 3)
                 ...     a = base.dygraph.to_variable(value0)
                 ...     b = base.dygraph.to_variable(value1)
                 ...     c = base.dygraph.to_variable(value2)
                 ...     out1 = linear(a)
                 ...     out2 = linear2(b)
                 ...     out1.stop_gradient = True
-                ...     out = base.layers.concat(input=[out1, out2, c], axis=1)
+                ...     out = paddle.concat(x=[out1, out2, c], axis=1)
                 ...     out.backward()
                 ...     assert linear.weight.gradient() is None
-                ...     assert (out1.gradient() == 0).all()
+                ...     assert out1.gradient() is None
         """
         return self.desc.stop_gradient()
 
@@ -1994,6 +1996,7 @@ def is_parameter(self):
             .. code-block:: python
 
                 >>> import paddle
+                >>> paddle.enable_static()
                 >>> new_parameter = paddle.static.create_parameter(name="X",
                 ...                                     shape=[10, 23, 48],
                 ...                                     dtype='float32')
@@ -2846,10 +2849,15 @@ class Operator:
     Examples:
         .. code-block:: python
 
-            >>> import paddle.base as base
-            >>> cur_program = base.Program()
+            >>> import paddle
+
+            >>> paddle.enable_static()
+            >>> cur_program = paddle.static.Program()
             >>> cur_block = cur_program.current_block()
-            >>> # var1 += var2 + var3
+            >>> var1 = cur_block.create_var(name="var1", shape=[-1, 23, 48], dtype='float32')
+            >>> var2 = cur_block.create_var(name="var2", shape=[-1, 23, 48], dtype='float32')
+            >>> var3 = cur_block.create_var(name="var3", shape=[-1, 23, 48], dtype='float32')
+            >>> var1 += var2 + var3
             >>> cur_block.append_op(type="sum",
             ...                     inputs={"X": [var1, var2, var3]},
             ...                     outputs={"Out": [var1]})
@@ -3197,9 +3205,10 @@ def _to_readable_code(self, skip_op_callstack=True):
         Examples:
             .. code-block:: python
 
-                >>> import paddle.base as base
+                >>> import paddle
 
-                >>> cur_program = base.Program()
+                >>> paddle.enable_static()
+                >>> cur_program = paddle.static.Program()
                 >>> cur_block = cur_program.current_block()
                 >>> var = cur_block.create_var(name="X",
                 ...                            shape=[-1, 23, 48],
@@ -3928,9 +3937,10 @@ class Block:
     Examples:
         .. code-block:: python
 
-            >>> import paddle.base as base
+            >>> import paddle
 
-            >>> cur_program = base.Program()
+            >>> paddle.enable_static()
+            >>> cur_program = paddle.static.Program()
             >>> cur_block = cur_program.current_block()
             >>> var = cur_block.create_var(name="X",
             ...                            shape=[-1, 23, 48],
@@ -3967,9 +3977,10 @@ def _to_readable_code(self, skip_op_callstack=True):
         Examples:
             .. code-block:: python
 
-                >>> import paddle.base as base
+                >>> import paddle
 
-                >>> cur_program = base.Program()
+                >>> paddle.enable_static()
+                >>> cur_program = paddle.static.Program()
                 >>> cur_block = cur_program.current_block()
                 >>> new_var = cur_block.create_var(name="X",
                 ...                                shape=[-1, 23, 48],
@@ -7278,10 +7289,9 @@ def to_string(self, throw_on_error, with_details=False):
         Examples:
             .. code-block:: python
 
-                >>> import paddle.base as base
                 >>> import paddle
-
-                >>> prog = base.default_main_program()
+                >>> paddle.enable_static()
+                >>> prog = paddle.static.default_main_program()
                 >>> rlt = paddle.static.data("fake_data", shape=[-1,1,1], dtype='float32')
                 >>> debug_str = prog.to_string(throw_on_error=True, with_details=False)
                 >>> print(debug_str)
diff --git a/python/paddle/base/reader.py b/python/paddle/base/reader.py
index e749e707b65c6..8c2ddd16961da 100644
--- a/python/paddle/base/reader.py
+++ b/python/paddle/base/reader.py
@@ -217,199 +217,197 @@ def from_generator(
         Returns:
             loader (DataLoader): the created DataLoader object.
 
-        Examples 1:
-
+        Examples:
             .. code-block:: python
+                :name: example_1
 
-                '''
-                Example in static graph mode
-                '''
-                import numpy as np
-
-                import paddle
-                import paddle.static as static
-                import paddle.nn.functional as F
-
-
-                BATCH_NUM = 10
-                BATCH_SIZE = 16
-                EPOCH_NUM = 4
-
-                CLASS_NUM = 10
-
-                ITERABLE = True # whether the created DataLoader object is iterable
-                USE_GPU = False # whether to use GPU
-
-                DATA_FORMAT = 'batch_generator' # data format of data source user provides
-
-                paddle.enable_static()
-
-                def simple_net(image, label):
-                    fc_tmp = static.nn.fc(image, size=CLASS_NUM)
-                    cross_entropy = F.softmax_with_cross_entropy(image, label)
-                    loss = paddle.mean(cross_entropy)
-                    sgd = paddle.optimizer.SGD(learning_rate=1e-3)
-                    sgd.minimize(loss)
-                    return loss
-
-                def get_random_images_and_labels(image_shape, label_shape):
-                    image = np.random.random(size=image_shape).astype('float32')
-                    label = np.random.random(size=label_shape).astype('int64')
-                    return image, label
-
-                # If the data generator yields one sample each time,
-                # use DataLoader.set_sample_generator to set the data source.
-                def sample_generator_creator():
-                    def __reader__():
-                        for _ in range(BATCH_NUM * BATCH_SIZE):
-                            image, label = get_random_images_and_labels([784], [1])
-                            yield image, label
-
-                    return __reader__
-
-                # If the data generator yield list of samples each time,
-                # use DataLoader.set_sample_list_generator to set the data source.
-                def sample_list_generator_creator():
-                    def __reader__():
-                        for _ in range(BATCH_NUM):
-                            sample_list = []
-                            for _ in range(BATCH_SIZE):
-                                image, label = get_random_images_and_labels([784], [1])
-                                sample_list.append([image, label])
-
-                            yield sample_list
-
-                    return __reader__
-
-                # If the data generator yields a batch each time,
-                # use DataLoader.set_batch_generator to set the data source.
-                def batch_generator_creator():
-                    def __reader__():
-                        for _ in range(BATCH_NUM):
-                            batch_image, batch_label = get_random_images_and_labels([BATCH_SIZE, 784], [BATCH_SIZE, 1])
-                            yield batch_image, batch_label
-
-                    return __reader__
-
-                # If DataLoader is iterable, use for loop to train the network
-                def train_iterable(exe, prog, loss, loader):
-                    for _ in range(EPOCH_NUM):
-                        for data in loader():
-                            exe.run(prog, feed=data, fetch_list=[loss])
-
-                # If DataLoader is not iterable, use start() and reset() method to control the process
-                def train_non_iterable(exe, prog, loss, loader):
-                    for _ in range(EPOCH_NUM):
-                        loader.start() # call DataLoader.start() before each epoch starts
-                        try:
-                            while True:
-                                exe.run(prog, fetch_list=[loss])
-                        except paddle.core.EOFException:
-                            loader.reset() # call DataLoader.reset() after catching EOFException
-
-                def set_data_source(loader, places):
-                    if DATA_FORMAT == 'sample_generator':
-                        loader.set_sample_generator(sample_generator_creator(), batch_size=BATCH_SIZE, drop_last=True, places=places)
-                    elif DATA_FORMAT == 'sample_list_generator':
-                        loader.set_sample_list_generator(sample_list_generator_creator(), places=places)
-                    elif DATA_FORMAT == 'batch_generator':
-                        loader.set_batch_generator(batch_generator_creator(), places=places)
-                    else:
-                        raise ValueError('Unsupported data format')
-
-                image = static.data(name='image', shape=[None, 784], dtype='float32')
-                label = static.data(name='label', shape=[None, 1], dtype='int64')
-
-                # Define DataLoader
-                loader = paddle.base.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE)
-
-                # Define network
-                loss = simple_net(image, label)
-
-                places = static.cuda_places() if USE_GPU else static.cpu_places()
-                set_data_source(loader, places)
-
-                exe = static.Executor(places[0])
-                exe.run(static.default_startup_program())
-
-                prog = static.CompiledProgram(static.default_main_program())
-                if loader.iterable:
-                    train_iterable(exe, prog, loss, loader)
-                else:
-                    train_non_iterable(exe, prog, loss, loader)
+                >>> # Example in static graph mode
 
+                >>> import numpy as np
 
-        Examples 2:
+                >>> import paddle
+                >>> import paddle.static as static
+                >>> import paddle.nn.functional as F
 
-            .. code-block:: python
 
-                '''
-                Example in dynamic graph mode.
-                '''
-                import numpy as np
+                >>> BATCH_NUM = 10
+                >>> BATCH_SIZE = 16
+                >>> EPOCH_NUM = 4
+
+                >>> CLASS_NUM = 10
+
+                >>> ITERABLE = True # whether the created DataLoader object is iterable
+                >>> USE_GPU = False # whether to use GPU
 
-                import paddle
-                import paddle.nn as nn
-                import paddle.optimizer as opt
-                import paddle.distributed as dist
+                >>> DATA_FORMAT = 'batch_generator' # data format of data source user provides
 
-                BATCH_SIZE = 16
-                BATCH_NUM = 4
-                EPOCH_NUM = 4
+                >>> paddle.enable_static()
 
-                IMAGE_SIZE = 784
-                CLASS_NUM = 10
+                >>> def simple_net(image, label):
+                ...     fc_tmp = static.nn.fc(image, size=CLASS_NUM)
+                ...     cross_entropy = F.softmax_with_cross_entropy(image, label)
+                ...     loss = paddle.mean(cross_entropy)
+                ...     sgd = paddle.optimizer.SGD(learning_rate=1e-3)
+                ...     sgd.minimize(loss)
+                ...     return loss
+                ...
+                >>> def get_random_images_and_labels(image_shape, label_shape):
+                ...     image = np.random.random(size=image_shape).astype('float32')
+                ...     label = np.random.random(size=label_shape).astype('int64')
+                ...     return image, label
+                ...
+                >>> # If the data generator yields one sample each time,
+                >>> # use DataLoader.set_sample_generator to set the data source.
+                >>> def sample_generator_creator():
+                ...     def __reader__():
+                ...         for _ in range(BATCH_NUM * BATCH_SIZE):
+                ...             image, label = get_random_images_and_labels([784], [1])
+                ...             yield image, label
+                ...
+                ...     return __reader__
+                ...
+                >>> # If the data generator yield list of samples each time,
+                >>> # use DataLoader.set_sample_list_generator to set the data source.
+                >>> def sample_list_generator_creator():
+                ...     def __reader__():
+                ...         for _ in range(BATCH_NUM):
+                ...             sample_list = []
+                ...             for _ in range(BATCH_SIZE):
+                ...                 image, label = get_random_images_and_labels([784], [1])
+                ...                 sample_list.append([image, label])
+                ...
+                ...             yield sample_list
+                ...
+                ...     return __reader__
+                ...
+                >>> # If the data generator yields a batch each time,
+                >>> # use DataLoader.set_batch_generator to set the data source.
+                >>> def batch_generator_creator():
+                ...     def __reader__():
+                ...         for _ in range(BATCH_NUM):
+                ...             batch_image, batch_label = get_random_images_and_labels([BATCH_SIZE, 784], [BATCH_SIZE, 1])
+                ...             yield batch_image, batch_label
+                ...
+                ...     return __reader__
+                ...
+                >>> # If DataLoader is iterable, use for loop to train the network
+                >>> def train_iterable(exe, prog, loss, loader):
+                ...     for _ in range(EPOCH_NUM):
+                ...         for data in loader():
+                ...             exe.run(prog, feed=data, fetch_list=[loss])
+                ...
+                >>> # If DataLoader is not iterable, use start() and reset() method to control the process
+                >>> def train_non_iterable(exe, prog, loss, loader):
+                ...     for _ in range(EPOCH_NUM):
+                ...         loader.start() # call DataLoader.start() before each epoch starts
+                ...         try:
+                ...             while True:
+                ...                 exe.run(prog, fetch_list=[loss])
+                ...         except paddle.core.EOFException:
+                ...             loader.reset() # call DataLoader.reset() after catching EOFException
+                ...
+                >>> def set_data_source(loader, places):
+                ...     if DATA_FORMAT == 'sample_generator':
+                ...         loader.set_sample_generator(sample_generator_creator(), batch_size=BATCH_SIZE, drop_last=True, places=places)
+                ...     elif DATA_FORMAT == 'sample_list_generator':
+                ...         loader.set_sample_list_generator(sample_list_generator_creator(), places=places)
+                ...     elif DATA_FORMAT == 'batch_generator':
+                ...         loader.set_batch_generator(batch_generator_creator(), places=places)
+                ...     else:
+                ...         raise ValueError('Unsupported data format')
+                ...
+                >>> image = static.data(name='image', shape=[None, 784], dtype='float32')
+                >>> label = static.data(name='label', shape=[None, 1], dtype='int64')
 
-                USE_GPU = False # whether to use GPU
+                >>> # Define DataLoader
+                >>> loader = paddle.base.io.DataLoader.from_generator(feed_list=[image, label], capacity=16, iterable=ITERABLE)
 
-                def _get_random_images_and_labels(image_shape, label_shape):
-                        image = np.random.random(size=image_shape).astype('float32')
-                        label = np.random.random(size=label_shape).astype('int64')
-                        return image, label
+                >>> # Define network
+                >>> loss = simple_net(image, label)
 
-                def __reader__():
-                        for _ in range(BATCH_NUM):
-                            batch_image, batch_label = _get_random_images_and_labels(
-                                [BATCH_SIZE, IMAGE_SIZE], [BATCH_SIZE, CLASS_NUM])
-                            yield batch_image, batch_label
+                >>> places = static.cuda_places() if USE_GPU else static.cpu_places()
+                >>> set_data_source(loader, places)
 
-                def random_batch_reader():
-                    return __reader__
+                >>> exe = static.Executor(places[0])
+                >>> exe.run(static.default_startup_program())
 
-                class LinearNet(nn.Layer):
-                    def __init__(self):
-                        super().__init__()
-                        self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
+                >>> prog = static.CompiledProgram(static.default_main_program())
+                >>> if loader.iterable:
+                ...     train_iterable(exe, prog, loss, loader)
+                >>> else:
+                ...     train_non_iterable(exe, prog, loss, loader)
 
-                    @paddle.jit.to_static
-                    def forward(self, x):
-                        return self._linear(x)
+            .. code-block:: python
+                :name: example_2
 
-                # set device
-                paddle.set_device('gpu' if USE_GPU else 'cpu')
+                >>> # Example in dynamic graph mode.
 
-                # create network
-                layer = LinearNet()
-                dp_layer = paddle.DataParallel(layer)
-                loss_fn = nn.CrossEntropyLoss()
-                adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())
+                >>> import numpy as np
 
-                # create data loader
-                loader = paddle.base.io.DataLoader.from_generator(capacity=5)
-                loader.set_batch_generator(random_batch_reader())
+                >>> import paddle
+                >>> import paddle.nn as nn
+                >>> import paddle.optimizer as opt
+                >>> import paddle.distributed as dist
 
-                for epoch_id in range(EPOCH_NUM):
-                    for batch_id, (image, label) in enumerate(loader()):
-                        out = layer(image)
-                        loss = loss_fn(out, label)
+                >>> BATCH_SIZE = 16
+                >>> BATCH_NUM = 4
+                >>> EPOCH_NUM = 4
 
-                        loss.backward()
+                >>> IMAGE_SIZE = 784
+                >>> CLASS_NUM = 10
 
-                        adam.step()
-                        adam.clear_grad()
-                        print("Epoch {} batch {}: loss = {}".format(
-                            epoch_id, batch_id, np.mean(loss.numpy())))
+                >>> USE_GPU = False # whether to use GPU
 
+                >>> def _get_random_images_and_labels(image_shape):
+                ...         image = np.random.random(size=image_shape).astype('float32')
+                ...         label = np.random.randint(0, CLASS_NUM, size=BATCH_SIZE).astype('int64')
+                ...         return image, label
+                ...
+                >>> def __reader__():
+                ...         for _ in range(BATCH_NUM):
+                ...             batch_image, batch_label = _get_random_images_and_labels(
+                ...                 [BATCH_SIZE, IMAGE_SIZE])
+                ...             yield batch_image, batch_label
+                ...
+                >>> def random_batch_reader():
+                ...     return __reader__
+                ...
+                >>> class LinearNet(nn.Layer):
+                ...     def __init__(self):
+                ...         super().__init__()
+                ...         self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM)
+                ...
+                ...     @paddle.jit.to_static
+                ...     def forward(self, x):
+                ...         return self._linear(x)
+                ...
+                >>> # set device
+                >>> paddle.set_device('gpu' if USE_GPU else 'cpu')
+
+                >>> # doctest: +SKIP('`paddle.jit.to_static` can not run in xdoctest')
+                >>> # create network
+                >>> layer = LinearNet()
+                >>> dp_layer = paddle.DataParallel(layer)
+                >>> loss_fn = nn.CrossEntropyLoss()
+                >>> adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())
+
+                >>> # create data loader
+                >>> loader = paddle.base.io.DataLoader.from_generator(capacity=5)
+                >>> loader.set_batch_generator(random_batch_reader())
+
+                >>> for epoch_id in range(EPOCH_NUM):
+                ...     for batch_id, (image, label) in enumerate(loader()):
+                ...         out = layer(image)
+                ...         loss = loss_fn(out, label)
+                ...
+                ...         loss.backward()
+                ...
+                ...         adam.step()
+                ...         adam.clear_grad()
+                ...         print("Epoch {} batch {}: loss = {}".format(
+                ...             epoch_id, batch_id, np.mean(loss.numpy())))
+                ...
+                >>> # doctest: -SKIP
         """
         if in_dygraph_mode():
             return DygraphGeneratorLoader(
@@ -1154,6 +1152,7 @@ class PyReader(DataLoaderBase):
            the reader manually.
 
         .. code-block:: python
+            :name: example_1
 
             >>> import paddle
             >>> import paddle.base as base
@@ -1172,7 +1171,7 @@ class PyReader(DataLoaderBase):
             ...         input=predict, label=label,
             ...         reduction='none', use_softmax=False
             ...     )
-            ...
+
             >>> def reader_creator_random_image_and_label(height, width):
             ...     def reader():
             ...         for i in range(ITER_NUM):
@@ -1182,14 +1181,14 @@ class PyReader(DataLoaderBase):
             ...             fake_label = np.ones([1])
             ...             yield fake_image, fake_label
             ...     return reader
-            ...
+
             >>> image = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
             >>> label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
 
             >>> reader = base.io.PyReader(feed_list=[image, label],
             ...                             capacity=4,
             ...                             iterable=False)
-            ...
+
             >>> user_defined_reader = reader_creator_random_image_and_label(784, 784)
             >>> reader.decorate_sample_list_generator(
             ...     paddle.batch(user_defined_reader, batch_size=BATCH_SIZE))
@@ -1204,7 +1203,6 @@ class PyReader(DataLoaderBase):
             ...         except base.core.EOFException:
             ...             reader.reset()
             ...             break
-            ...
 
         2. If iterable=True, the created PyReader object is decoupled with
            the program. No operator would be inserted into the program.
@@ -1213,6 +1211,7 @@ class PyReader(DataLoaderBase):
            object into :code:`Executor.run(feed=...)`.
 
         .. code-block:: python
+            :name: example_2
 
             >>> import paddle
             >>> import paddle.base as base
@@ -1231,7 +1230,7 @@ class PyReader(DataLoaderBase):
             ...         input=predict, label=label,
             ...         reduction='none', use_softmax=False
             ...     )
-            ...
+
             >>> def reader_creator_random_image(height, width):
             ...     def reader():
             ...         for i in range(ITER_NUM):
@@ -1239,7 +1238,7 @@ class PyReader(DataLoaderBase):
             ...             fake_label = np.ones([1])
             ...             yield fake_image, fake_label
             ...     return reader
-            ...
+
             >>> image = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
             >>> label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
             >>> reader = base.io.PyReader(feed_list=[image, label], capacity=4, iterable=True, return_list=False)
@@ -1248,7 +1247,7 @@ class PyReader(DataLoaderBase):
             >>> reader.decorate_sample_list_generator(
             ...     paddle.batch(user_defined_reader, batch_size=BATCH_SIZE),
             ...         base.core.CPUPlace())
-            ...
+
             >>> loss = network(image, label)
             >>> executor = base.Executor(base.CPUPlace())
             >>> executor.run(base.default_startup_program())
@@ -1256,12 +1255,12 @@ class PyReader(DataLoaderBase):
             >>> for _ in range(EPOCH_NUM):
             ...     for data in reader():
             ...         executor.run(feed=data, fetch_list=[loss])
-            ...
 
         3. If return_list=True, the return values would be presented as list instead of dict.
            This is usually used in dygraph mode.
 
         .. code-block:: python
+            :name: example_3
 
             >>> import paddle
             >>> import paddle.base as base
@@ -1276,7 +1275,7 @@ class PyReader(DataLoaderBase):
             ...             yield np.random.uniform(low=0, high=255, size=[height, width]), \
             ...                 np.random.random_integers(low=0, high=9, size=[1])
             ...     return reader
-            ...
+
             >>> place = base.CPUPlace()
             >>> with base.dygraph.guard(place):
             ...     py_reader = base.io.PyReader(capacity=2, return_list=True)
@@ -1333,12 +1332,12 @@ def start(self):
                 >>> def generator():
                 ...     for i in range(5):
                 ...         yield np.random.uniform(low=0, high=255, size=[784, 784]),
-                ...
+
                 >>> image = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
                 >>> reader = base.io.PyReader(feed_list=[image], capacity=4, iterable=False)
                 >>> reader.decorate_sample_list_generator(
                 ...     paddle.batch(generator, batch_size=BATCH_SIZE))
-                ...
+
                 >>> executor = base.Executor(base.CPUPlace())
                 >>> executor.run(base.default_startup_program())
                 >>> for i in range(3):
@@ -1349,7 +1348,6 @@ def start(self):
                 ...         except base.core.EOFException:
                 ...             reader.reset()
                 ...             break
-                ...
         '''
         self._loader.start()
 
@@ -1372,12 +1370,12 @@ def reset(self):
                 >>> def generator():
                 ...     for i in range(5):
                 ...         yield np.random.uniform(low=0, high=255, size=[784, 784]),
-                ...
+
                 >>> image = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
                 >>> reader = base.io.PyReader(feed_list=[image], capacity=4, iterable=False)
                 >>> reader.decorate_sample_list_generator(
                 ...     paddle.batch(generator, batch_size=BATCH_SIZE))
-                ...
+
                 >>> executor = base.Executor(base.CPUPlace())
                 >>> executor.run(base.default_startup_program())
                 >>> for i in range(3):
@@ -1388,7 +1386,6 @@ def reset(self):
                 ...         except base.core.EOFException:
                 ...             reader.reset()
                 ...             break
-                ...
         '''
         self._loader.reset()
 
@@ -1435,7 +1432,7 @@ def decorate_sample_generator(
                 ...         input=predict, label=label,
                 ...         reduction='none', use_softmax=False
                 ...     )
-                ...
+
                 >>> def random_image_and_label_generator(height, width):
                 ...     def generator():
                 ...         for i in range(ITER_NUM):
@@ -1445,7 +1442,7 @@ def decorate_sample_generator(
                 ...             fake_label = np.array([1])
                 ...             yield fake_image, fake_label
                 ...     return generator
-                ...
+
                 >>> image = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
                 >>> label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
                 >>> reader = base.io.PyReader(feed_list=[image, label], capacity=4, iterable=True)
@@ -1461,7 +1458,6 @@ def decorate_sample_generator(
                 >>> for _ in range(EPOCH_NUM):
                 ...     for data in reader():
                 ...         executor.run(feed=data, fetch_list=[loss])
-                ...
         '''
         self._loader.set_sample_generator(
             sample_generator, batch_size, drop_last, places
@@ -1502,7 +1498,7 @@ def decorate_sample_list_generator(self, reader, places=None):
                 ...         input=predict, label=label,
                 ...         reduction='none', use_softmax=False
                 ...     )
-                ...
+
                 >>> def random_image_and_label_generator(height, width):
                 ...     def generator():
                 ...         for i in range(ITER_NUM):
@@ -1512,7 +1508,7 @@ def decorate_sample_list_generator(self, reader, places=None):
                 ...             fake_label = np.ones([1])
                 ...             yield fake_image, fake_label
                 ...     return generator
-                ...
+
                 >>> image = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
                 >>> label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
                 >>> reader = base.io.PyReader(feed_list=[image, label], capacity=4, iterable=True)
@@ -1521,7 +1517,7 @@ def decorate_sample_list_generator(self, reader, places=None):
                 >>> reader.decorate_sample_list_generator(
                 ...     paddle.batch(user_defined_generator, batch_size=BATCH_SIZE),
                 ...     base.core.CPUPlace())
-                ...
+
                 >>> loss = network(image, label)
                 >>> executor = base.Executor(base.core.CPUPlace())
                 >>> executor.run(base.default_startup_program())
@@ -1529,7 +1525,6 @@ def decorate_sample_list_generator(self, reader, places=None):
                 >>> for _ in range(EPOCH_NUM):
                 ...     for data in reader():
                 ...         executor.run(feed=data, fetch_list=[loss])
-                ...
         '''
         self._loader.set_sample_list_generator(reader, places)
 
@@ -1568,7 +1563,7 @@ def decorate_batch_generator(self, reader, places=None):
                 ...         input=predict, label=label,
                 ...         reduction='none', use_softmax=False
                 ...     )
-                ...
+
                 >>> def random_image_and_label_generator(height, width):
                 ...     def generator():
                 ...         for i in range(ITER_NUM):
@@ -1580,7 +1575,7 @@ def decorate_batch_generator(self, reader, places=None):
                 ...             batch_label = batch_label.astype('int64')
                 ...             yield batch_image, batch_label
                 ...     return generator
-                ...
+
                 >>> image = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
                 >>> label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
                 >>> reader = base.io.PyReader(feed_list=[image, label], capacity=4, iterable=True)
@@ -1595,7 +1590,6 @@ def decorate_batch_generator(self, reader, places=None):
                 >>> for _ in range(EPOCH_NUM):
                 ...     for data in reader():
                 ...         executor.run(feed=data, fetch_list=[loss])
-                ...
         '''
         self._loader.set_batch_generator(reader, places)
 
diff --git a/python/paddle/decomposition/register.py b/python/paddle/decomposition/register.py
index ba8adc54f6562..5d976f2d8e0b3 100644
--- a/python/paddle/decomposition/register.py
+++ b/python/paddle/decomposition/register.py
@@ -50,13 +50,14 @@ def register_decomp(op_type):
 
     Examples:
         .. code-block:: python
-            @register_decomp('softmax')
-            def softmax(x, axis):
-                molecular = exp(x)
-                denominator = broadcast_to(sum(molecular, axis=axis, keepdim=True), x.shape)
-                res = divide(molecular, denominator)
-                return res
 
+            >>> from paddle.decomposition import register
+            >>> @register.register_decomp('softmax')
+            >>> def softmax(x, axis):
+            ...     molecular = exp(x)
+            ...     denominator = broadcast_to(sum(molecular, axis=axis, keepdim=True), x.shape)
+            ...     res = divide(molecular, denominator)
+            ...     return res
     """
     if not isinstance(op_type, str):
         raise TypeError(f'op_type must be str, but got {type(op_type)}.')
diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py
index c62e1ebf0b66b..0865e52cad7a0 100644
--- a/python/paddle/distributed/auto_parallel/api.py
+++ b/python/paddle/distributed/auto_parallel/api.py
@@ -34,16 +34,16 @@ class DistAttr(core.TensorDistAttr):
         sharding_specs(list[str|None]): The specification describing how to shard the Tensor.
 
     Examples:
+        .. code-block:: python
 
-    .. code-block:: python
+            >>> import paddle
+            >>> import paddle.distributed as dist
 
-        import paddle
-        import paddle.distributed as dist
+            >>> mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=['x', 'y'])
+            >>> dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y'])
 
-        mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"])
-        dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y'])
+            >>> print(dist_attr)
 
-        print(dist_attr)
     """
 
     def __init__(self, mesh, sharding_specs):
@@ -109,22 +109,24 @@ def shard_tensor(
         Tensor: A Tensor constructed from ``data`` with distributed attributes.
 
     Examples:
+        .. code-block:: python
 
-    .. code-block:: python
+            >>> import paddle
+            >>> import paddle.distributed as dist
 
-        import paddle
-        import paddle.distributed as dist
+            >>> mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=['x', 'y'])
+            >>> dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y'])
 
-        mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"])
-        dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y'])
+            >>> # dense tensor
+            >>> a = paddle.to_tensor([[1,2,3],
+            ...                       [5,6,7]])
 
-        # dense tensor
-        a = paddle.to_tensor([[1,2,3],
-                              [5,6,7]])
-        # distributed tensor
-        d_tensor = dist.shard_tensor(a, dist_attr=dist_attr)
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> # distributed tensor
+            >>> d_tensor = dist.shard_tensor(a, dist_attr=dist_attr)
+
+            >>> print(d_tensor)
 
-        print(d_tensor)
     """
     # 1. create dense tensor
     # `paddle.to_tensor` supports both dynamic and static mode
@@ -166,7 +168,6 @@ def dtensor_from_fn(fn, dist_attr, *args, **kwargs):
         Tensor: A Tensor constructed from ``fn`` with distributed attributes.
 
     Examples:
-
         .. code-block:: python
 
             >>> import paddle
@@ -177,6 +178,7 @@ def dtensor_from_fn(fn, dist_attr, *args, **kwargs):
             >>> # Call the function dtensor_from_fn with dist_attr parameter
             >>> d_tensor = dist.dtensor_from_fn(paddle.ones, dist_attr=dist_attr, shape=[1])
             >>> print(d_tensor)
+
     """
     tensor = fn(*args, **kwargs)
     return shard_tensor(tensor, dist_attr=dist_attr)
@@ -194,28 +196,30 @@ def reshard(dist_tensor, dist_attr):
         Tensor: A Distributed Tensor reshared with distributed attributes.
 
     Examples:
+        .. code-block:: python
 
-    .. code-block:: python
+            >>> import paddle
+            >>> import paddle.distributed as dist
+
+            >>> mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=['x', 'y'])
+            >>> dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y'])
 
-        import paddle
-        import paddle.distributed as dist
+            >>> out_mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=['x', 'y'])
+            >>> out_dist_attr = dist.DistAttr(mesh=out_mesh, sharding_specs=[None, None])
 
-        mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"])
-        dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y'])
+            >>> # dense tensor
+            >>> a = paddle.to_tensor([[1,2,3],
+            ...                       [5,6,7]])
 
-        out_mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"])
-        out_dist_attr = dist.DistAttr(mesh=out_mesh, sharding_specs=[None, None])
+            >>> # doctest: +REQUIRES(env:DISTRIBUTED)
+            >>> # distributed tensor
+            >>> d_tensor = dist.shard_tensor(a, dist_attr=dist_attr)
 
-        # dense tensor
-        a = paddle.to_tensor([[1,2,3],
-                              [5,6,7]])
-        # distributed tensor
-        d_tensor = dist.shard_tensor(a, dist_attr=dist_attr)
+            >>> out_d_tensor = dist.reshard(d_tensor, out_dist_attr)
 
-        out_d_tensor = dist.reshard(d_tensor, out_dist_attr)
+            >>> print(d_tensor)
+            >>> print(out_d_tensor)
 
-        print(d_tensor)
-        print(out_d_tensor)
     """
 
     if paddle.framework.in_dynamic_mode():
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index 09e2ef82162bc..4750c6bca66fc 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -174,12 +174,12 @@ def save_to_prototxt(self, output):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.dgc = True
-                strategy.recompute = True
-                strategy.recompute_configs = {"checkpoints": ["x"]}
-                strategy.save_to_prototxt("dist_strategy.prototxt")
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.dgc = True
+                >>> strategy.recompute = True
+                >>> strategy.recompute_configs = {"checkpoints": ["x"]}
+                >>> strategy.save_to_prototxt("dist_strategy.prototxt")
 
         """
         with open(output, "w") as fout:
@@ -193,9 +193,14 @@ def load_from_prototxt(self, pb_file):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.load_from_prototxt("dist_strategy.prototxt")
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.dgc = True
+                >>> strategy.recompute = True
+                >>> strategy.recompute_configs = {"checkpoints": ["x"]}
+                >>> strategy.save_to_prototxt("dist_strategy.prototxt")
+
+                >>> strategy.load_from_prototxt("dist_strategy.prototxt")
 
         """
         with open(pb_file, 'r') as f:
@@ -211,14 +216,14 @@ def execution_strategy(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                exe_strategy = paddle.static.ExecutionStrategy()
-                exe_strategy.num_threads = 10
-                exe_strategy.num_iteration_per_drop_scope = 10
-                exe_strategy.num_iteration_per_run = 10
+                >>> import paddle
+                >>> exe_strategy = paddle.static.ExecutionStrategy()
+                >>> exe_strategy.num_threads = 10
+                >>> exe_strategy.num_iteration_per_drop_scope = 10
+                >>> exe_strategy.num_iteration_per_run = 10
 
-                strategy = paddle.distributed.fleet.DistributedStrategy()
-                strategy.execution_strategy = exe_strategy
+                >>> strategy = paddle.distributed.fleet.DistributedStrategy()
+                >>> strategy.execution_strategy = exe_strategy
 
         """
         execution_strategy = paddle.static.ExecutionStrategy()
@@ -253,19 +258,19 @@ def build_strategy(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                build_strategy = paddle.static.BuildStrategy()
-                build_strategy.enable_sequential_execution = True
-                build_strategy.fuse_elewise_add_act_ops = True
-                build_strategy.fuse_bn_act_ops = True
-                build_strategy.enable_auto_fusion = True
-                build_strategy.fuse_relu_depthwise_conv = True
-                build_strategy.fuse_broadcast_ops = True
-                build_strategy.fuse_all_optimizer_ops = True
-                build_strategy.enable_inplace = True
+                >>> import paddle
+                >>> build_strategy = paddle.static.BuildStrategy()
+                >>> build_strategy.enable_sequential_execution = True
+                >>> build_strategy.fuse_elewise_add_act_ops = True
+                >>> build_strategy.fuse_bn_act_ops = True
+                >>> build_strategy.enable_auto_fusion = True
+                >>> build_strategy.fuse_relu_depthwise_conv = True
+                >>> build_strategy.fuse_broadcast_ops = True
+                >>> build_strategy.fuse_all_optimizer_ops = True
+                >>> build_strategy.enable_inplace = True
 
-                strategy = paddle.distributed.fleet.DistributedStrategy()
-                strategy.build_strategy = build_strategy
+                >>> strategy = paddle.distributed.fleet.DistributedStrategy()
+                >>> strategy.build_strategy = build_strategy
 
         """
 
@@ -302,9 +307,9 @@ def gradient_scale_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.gradient_scale_configs = {'scale_strategy': 'avg'}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.gradient_scale_configs = {'scale_strategy': 'avg'}
 
         Note that, strategy must be in 'avg', 'sum' or 'customized'
 
@@ -333,15 +338,15 @@ def a_sync(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                role_maker = fleet.PaddleCloudRoleMaker()
-                fleet.init(role_maker)
+                >>> import paddle.distributed.fleet as fleet
+                >>> role_maker = fleet.PaddleCloudRoleMaker()
+                >>> fleet.init(role_maker)
 
-                strategy = fleet.DistributedStrategy()
-                strategy.a_sync = True  # by default this is True
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.a_sync = True  # by default this is True
 
-                # code block for defining loss and local optimizer
-                # sgd = fleet.distributed_optimizer(optimizer, strategy)
+                >>> # code block for defining loss and local optimizer
+                >>> # sgd = fleet.distributed_optimizer(optimizer, strategy)
 
         """
         return self.strategy.a_sync
@@ -385,17 +390,17 @@ def a_sync_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                role_maker = fleet.PaddleCloudRoleMaker()
-                fleet.init(role_maker)
+                >>> import paddle.distributed.fleet as fleet
+                >>> role_maker = fleet.PaddleCloudRoleMaker()
+                >>> fleet.init(role_maker)
 
-                strategy = fleet.DistributedStrategy()
-                strategy.a_sync = True  # by default this is True
-                configs = {"k_steps": 1024, "send_queue_size": 32}
-                strategy.a_sync_configs = configs
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.a_sync = True  # by default this is True
+                >>> configs = {"k_steps": 1024, "send_queue_size": 32}
+                >>> strategy.a_sync_configs = configs
 
-                # code block for defining loss and local optimizer
-                # sgd = fleet.distributed_optimizer(optimizer, strategy)
+                >>> # code block for defining loss and local optimizer
+                >>> # sgd = fleet.distributed_optimizer(optimizer, strategy)
 
         """
         return get_msg_dict(self.strategy.a_sync_configs)
@@ -426,16 +431,16 @@ def trainer_desc_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                role_maker = fleet.PaddleCloudRoleMaker()
-                fleet.init(role_maker)
+                >>> import paddle.distributed.fleet as fleet
+                >>> role_maker = fleet.PaddleCloudRoleMaker()
+                >>> fleet.init(role_maker)
 
-                strategy = fleet.DistributedStrategy()
-                configs = {"dump_fields_path": "./dump_data", "dump_fields": ["xxx", "yyy"]}
-                strategy.trainer_desc_configs = configs
+                >>> strategy = fleet.DistributedStrategy()
+                >>> configs = {"dump_fields_path": "./dump_data", "dump_fields": ["xxx", "yyy"]}
+                >>> strategy.trainer_desc_configs = configs
 
-                # code block for defining loss and local optimizer
-                # sgd = fleet.distributed_optimizer(optimizer, strategy)
+                >>> # code block for defining loss and local optimizer
+                >>> # sgd = fleet.distributed_optimizer(optimizer, strategy)
 
         """
         return get_msg_dict(self.strategy.trainer_desc_configs)
@@ -450,15 +455,15 @@ def adam_d2sum(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                role_maker = fleet.PaddleCloudRoleMaker()
-                fleet.init(role_maker)
+                >>> import paddle.distributed.fleet as fleet
+                >>> role_maker = fleet.PaddleCloudRoleMaker()
+                >>> fleet.init(role_maker)
 
-                strategy = fleet.DistributedStrategy()
-                strategy.adam_d2sum = True  # by default this is False
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.adam_d2sum = True  # by default this is False
 
-                # code block for defining loss and local optimizer
-                # sgd = fleet.distributed_optimizer(optimizer, strategy)
+                >>> # code block for defining loss and local optimizer
+                >>> # sgd = fleet.distributed_optimizer(optimizer, strategy)
 
         """
         return self.strategy.adam_d2sum
@@ -501,14 +506,14 @@ def fs_client_param(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                role_maker = fleet.PaddleCloudRoleMaker()
-                fleet.init(role_maker)
-                strategy = fleet.DistributedStrategy()
-                configs = {"uri": "xxx", "user": "xxx", passwd: "xxx"}
-                strategy.fs_client_param = configs
-                # code block for defining loss and local optimizer
-                # sgd = fleet.distributed_optimizer(optimizer, strategy)
+                >>> import paddle.distributed.fleet as fleet
+                >>> role_maker = fleet.PaddleCloudRoleMaker()
+                >>> fleet.init(role_maker)
+                >>> strategy = fleet.DistributedStrategy()
+                >>> configs = {"uri": "xxx", "user": "xxx", "passwd": "xxx"}
+                >>> strategy.fs_client_param = configs
+                >>> # code block for defining loss and local optimizer
+                >>> # sgd = fleet.distributed_optimizer(optimizer, strategy)
 
         """
         return self.strategy.fs_client_param
@@ -880,11 +885,11 @@ def amp(self):
 
         Examples:
 
-          .. code-block:: python
+            .. code-block:: python
 
-            import paddle.distributed.fleet as fleet
-            strategy = fleet.DistributedStrategy()
-            strategy.amp = True # by default this is false
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.amp = True # by default this is false
 
         """
         return self.strategy.amp
@@ -928,27 +933,29 @@ def amp_configs(self):
             use_fp16_guard(bool): Whether to use `fp16_guard` when constructing the program.
                    Default True. Only takes effect when `use_pure_fp16` is turned on.
 
-        Examples 1:
+        Examples:
             .. code-block:: python
+                :name:example_1
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.amp = True
-                strategy.amp_configs = {
-                    "init_loss_scaling": 32768,
-                    "custom_white_list": ['conv2d']}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.amp = True
+                >>> strategy.amp_configs = {
+                ...     "init_loss_scaling": 32768,
+                ...     "custom_white_list": ['conv2d']
+                ... }
 
-        Examples 2:
             .. code-block:: python
+                :name:example_2
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.amp = True
-                # pure fp16
-                strategy.amp_configs = {
-                    "init_loss_scaling": 32768,
-                    "use_pure_fp16": True
-                }
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.amp = True
+                >>> # pure fp16
+                >>> strategy.amp_configs = {
+                ...     "init_loss_scaling": 32768,
+                ...     "use_pure_fp16": True
+                ... }
 
         """
         return get_msg_dict(self.strategy.amp_configs)
@@ -969,9 +976,9 @@ def asp(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.asp = True # by default this is false
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.asp = True # by default this is false
 
         """
         return self.strategy.asp
@@ -992,11 +999,11 @@ def qat(self):
 
         Examples:
 
-          .. code-block:: python
+            .. code-block:: python
 
-            import paddle.distributed.fleet as fleet
-            strategy = fleet.DistributedStrategy()
-            strategy.qat = True # by default this is false
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.qat = True # by default this is false
 
         """
         return self.strategy.qat
@@ -1019,16 +1026,20 @@ def qat_configs(self):
             not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope,
                 the corresponding op will not be quantized.
             algo(str): Other quantization training algorithm.
-        Exampless:
-          .. code-block:: python
-            import paddle.distributed.fleet as fleet
-            strategy = fleet.DistributedStrategy()
-            strategy.qat = True
-            strategy.qat_configs = {
-                "channel_wise_abs_max": True,
-                "weight_bits": 8,
-                "activation_bits: 8,
-                "not_quant_pattern": ['skip_quant']}
+
+        Examples:
+            .. code-block:: python
+
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.qat = True
+                >>> strategy.qat_configs = {
+                ...     "channel_wise_abs_max": True,
+                ...     "weight_bits": 8,
+                ...     "activation_bits": 8,
+                ...     "not_quant_pattern": ['skip_quant']
+                ... }
+
         """
         return get_msg_dict(self.strategy.qat_configs)
 
@@ -1046,11 +1057,11 @@ def recompute(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.recompute = True
-                # suppose x and y are names of checkpoint tensors for recomputation
-                strategy.recompute_configs = {"checkpoints": ["x", "y"]}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.recompute = True
+                >>> # suppose x and y are names of checkpoint tensors for recomputation
+                >>> strategy.recompute_configs = {"checkpoints": ["x", "y"]}
 
         """
         return self.strategy.recompute
@@ -1065,9 +1076,9 @@ def sync_nccl_allreduce(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.sync_nccl_allreduce = True
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.sync_nccl_allreduce = True
 
         """
         return self.strategy.sync_nccl_allreduce
@@ -1091,9 +1102,9 @@ def use_hierarchical_allreduce(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.use_hierarchical_allreduce = True
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.use_hierarchical_allreduce = True
 
         """
         return self.strategy.use_hierarchical_allreduce
@@ -1118,9 +1129,9 @@ def hierarchical_allreduce_inter_nranks(self):
         Example:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.hierarchical_allreduce_inter_nranks = 8
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.hierarchical_allreduce_inter_nranks = 8
 
         """
         return self.strategy.hierarchical_allreduce_inter_nranks
@@ -1146,9 +1157,9 @@ def sync_batch_norm(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.sync_batch_norm = True
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.sync_batch_norm = True
 
         """
 
@@ -1172,9 +1183,9 @@ def fuse_all_reduce_ops(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.fuse_all_reduce_ops = False
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.fuse_all_reduce_ops = False
 
         """
         return self.strategy.fuse_all_reduce_ops
@@ -1198,9 +1209,9 @@ def fuse_grad_size_in_MB(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.fuse_grad_size_in_MB = 50
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.fuse_grad_size_in_MB = 50
 
         """
         return self.strategy.fuse_grad_size_in_MB
@@ -1226,9 +1237,9 @@ def last_comm_group_size_MB(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.last_comm_group_size_MB = 2
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.last_comm_group_size_MB = 2
 
         """
         return self.strategy.last_comm_group_size_MB
@@ -1253,9 +1264,9 @@ def find_unused_parameters(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.find_unused_parameters = True
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.find_unused_parameters = True
 
         """
 
@@ -1296,9 +1307,9 @@ def nccl_comm_num(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.nccl_comm_num = 2
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.nccl_comm_num = 2
 
         """
 
@@ -1342,13 +1353,14 @@ def recompute_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.recompute = True
-                strategy.recompute_configs = {
-                    "checkpoints": ["x", "y"],
-                    "enable_offload": True,
-                    "checkpoint_shape": [100, 512, 1024] }
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.recompute = True
+                >>> strategy.recompute_configs = {
+                ...     "checkpoints": ["x", "y"],
+                ...     "enable_offload": True,
+                ...     "checkpoint_shape": [100, 512, 1024]
+                ... }
 
         """
         return get_msg_dict(self.strategy.recompute_configs)
@@ -1377,9 +1389,9 @@ def sharding(self):
         Examples:
             .. code-block:: python
 
-                import paddle.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.sharding = True
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.sharding = True
 
         """
         return self.strategy.sharding
@@ -1435,17 +1447,17 @@ def sharding_configs(self):
         Examples:
             .. code-block:: python
 
-                # sharding-DP, 2 nodes with 8 gpus per node
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.sharding = True
-                strategy.sharding_configs = {
-                    "sharding_segment_strategy": "segment_broadcast_MB",
-                    "segment_broadcast_MB": 32,
-                    "sharding_degree": 8,
-                    "dp_degree": 2,
-                    "gradient_merge_acc_step": 4,
-                    }
+                >>> # sharding-DP, 2 nodes with 8 gpus per node
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.sharding = True
+                >>> strategy.sharding_configs = {
+                ...     "sharding_segment_strategy": "segment_broadcast_MB",
+                ...     "segment_broadcast_MB": 32,
+                ...     "sharding_degree": 8,
+                ...     "dp_degree": 2,
+                ...     "gradient_merge_acc_step": 4,
+                ... }
 
         """
         return get_msg_dict(self.strategy.sharding_configs)
@@ -1467,9 +1479,9 @@ def without_graph_optimization(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.without_graph_optimization = True
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.without_graph_optimization = True
 
         """
         return self.strategy.without_graph_optimization
@@ -1495,9 +1507,9 @@ def _calc_comm_same_stream(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.calc_comm_same_stream = True
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy._calc_comm_same_stream = True
 
         """
         return self.strategy.calc_comm_same_stream
@@ -1523,9 +1535,9 @@ def fuse_grad_merge(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.fuse_param_grad = True
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.fuse_grad_merge = True
 
         """
         return self.strategy.fuse_grad_merge
@@ -1547,10 +1559,10 @@ def fuse_grad_size_in_num(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
+                >>> import paddle.distributed.fleet as fleet
 
-                strategy = fleet.DistributedStrategy()
-                strategy.fuse_grad_size_in_num = 2
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.fuse_grad_size_in_num = 2
 
         """
         return self.strategy.fuse_grad_size_in_num
@@ -1577,9 +1589,9 @@ def pipeline(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.pipeline = True
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.pipeline = True
 
         """
         return self.strategy.pipeline
@@ -1637,10 +1649,10 @@ def pipeline_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.pipeline = True
-                strategy.pipeline_configs = {"micro_batch_size": 12}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.pipeline = True
+                >>> strategy.pipeline_configs = {"micro_batch_size": 12}
 
         """
 
@@ -1663,9 +1675,9 @@ def tensor_parallel(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.tensor_parallel = True
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.tensor_parallel = True
 
         """
         return self.strategy.tensor_parallel
@@ -1695,11 +1707,11 @@ def tensor_parallel_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.tensor_parallel = True
-                strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4,
-                                                    "tensor_init_seed": 123}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.tensor_parallel = True
+                >>> strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4,
+                ...                                     "tensor_init_seed": 123}
 
         """
         return get_msg_dict(self.strategy.tensor_parallel_configs)
@@ -1739,13 +1751,14 @@ def hybrid_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.hybrid_configs = {
-                    "dp_degree": 1,
-                    "mp_degree": 2,
-                    "pp_degree": 1,
-                    "order":['dp','pp','sharding', 'sep', 'mp']}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.hybrid_configs = {
+                ...     "dp_degree": 1,
+                ...     "mp_degree": 2,
+                ...     "pp_degree": 1,
+                ...     "order":['dp','pp','sharding', 'sep', 'mp']
+                ... }
 
         """
         return get_msg_dict(self.strategy.hybrid_configs)
@@ -1786,13 +1799,12 @@ def localsgd(self):
         For more details, please refer to
         `Don't Use Large Mini-Batches, Use Local SGD <https://arxiv.org/pdf/1808.07217.pdf>`_.
 
-
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.localsgd = True # by default this is false
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.localsgd = True # by default this is false
 
         """
         return self.strategy.localsgd
@@ -1819,11 +1831,11 @@ def localsgd_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.localsgd = True
-                strategy.localsgd_configs = {"k_steps": 4,
-                                            "begin_step": 30}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.localsgd = True
+                >>> strategy.localsgd_configs = {"k_steps": 4,
+                ...                             "begin_step": 30}
 
         """
 
@@ -1848,9 +1860,9 @@ def adaptive_localsgd(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.adaptive_localsgd = True # by default this is false
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.adaptive_localsgd = True # by default this is false
 
         """
         return self.strategy.adaptive_localsgd
@@ -1880,11 +1892,11 @@ def adaptive_localsgd_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.adaptive_localsgd = True
-                strategy.adaptive_localsgd_configs = {"init_k_steps": 1,
-                                                    "begin_step": 30}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.adaptive_localsgd = True
+                >>> strategy.adaptive_localsgd_configs = {"init_k_steps": 1,
+                ...                                       "begin_step": 30}
 
         """
 
@@ -1912,9 +1924,9 @@ def dgc(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.dgc = True # by default this is false
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.dgc = True # by default this is false
 
         """
         return self.strategy.dgc
@@ -1949,10 +1961,10 @@ def dgc_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.dgc = True
-                strategy.dgc_configs = {"rampup_begin_step": 1252}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.dgc = True
+                >>> strategy.dgc_configs = {"rampup_begin_step": 1252}
 
         """
         return get_msg_dict(self.strategy.dgc_configs)
@@ -1973,10 +1985,10 @@ def fp16_allreduce(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
+                >>> import paddle.distributed.fleet as fleet
 
-                strategy = fleet.DistributedStrategy()
-                strategy.fp16_allreduce = True # by default this is false
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.fp16_allreduce = True # by default this is false
 
         """
         return self.strategy.fp16_allreduce
@@ -2004,10 +2016,10 @@ def gradient_merge(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.gradient_merge = True
-                strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.gradient_merge = True
+                >>> strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
 
         """
         return self.strategy.gradient_merge
@@ -2034,10 +2046,10 @@ def gradient_merge_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.gradient_merge = True
-                strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.gradient_merge = True
+                >>> strategy.gradient_merge_configs = {"k_steps": 4, "avg": True}
 
         """
         return get_msg_dict(self.strategy.gradient_merge_configs)
@@ -2063,9 +2075,9 @@ def lars(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.lars = True # by default this is false
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.lars = True # by default this is false
 
         """
         return self.strategy.lars
@@ -2095,15 +2107,15 @@ def lars_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.lars = True
-                strategy.lars_configs = {
-                            "lars_coeff": 0.01,
-                            "lars_weight_decay": 0.0005,
-                            "epsilon": 0,
-                            "exclude_from_weight_decay": ['batch_norm', '.b_0']
-                        }
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.lars = True
+                >>> strategy.lars_configs = {
+                ...             "lars_coeff": 0.01,
+                ...             "lars_weight_decay": 0.0005,
+                ...             "epsilon": 0,
+                ...             "exclude_from_weight_decay": ['batch_norm', '.b_0']
+                ... }
 
         """
         return get_msg_dict(self.strategy.lars_configs)
@@ -2128,9 +2140,9 @@ def lamb(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.lamb = True # by default this is false
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.lamb = True # by default this is false
 
         """
 
@@ -2158,13 +2170,13 @@ def lamb_configs(self):
         Examples:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.lamb = True
-                strategy.lamb_configs = {
-                        'lamb_weight_decay': 0.01,
-                        'exclude_from_weight_decay': [],
-                    }
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.lamb = True
+                >>> strategy.lamb_configs = {
+                ...         'lamb_weight_decay': 0.01,
+                ...         'exclude_from_weight_decay': [],
+                ... }
 
         """
         return get_msg_dict(self.strategy.lamb_configs)
@@ -2207,17 +2219,17 @@ def auto(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-                import paddle.distributed.fleet as fleet
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> import paddle.distributed.fleet as fleet
 
-                strategy = fleet.DistributedStrategy()
-                strategy.auto = True
-                # if set other strategy at the same time, auto will not apply
-                # strategy.amp = True
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.auto = True
+                >>> # if set other strategy at the same time, auto will not apply
+                >>> # strategy.amp = True
 
-                optimizer = paddle.optimizer.SGD(learning_rate=0.01)
-                optimizer = fleet.distributed_optimizer(optimizer, strategy)
+                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+                >>> optimizer = fleet.distributed_optimizer(optimizer, strategy)
 
         """
         return self.strategy.auto
@@ -2243,17 +2255,17 @@ def semi_auto(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-                import paddle.distributed.fleet as fleet
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> import paddle.distributed.fleet as fleet
 
-                strategy = fleet.DistributedStrategy()
-                strategy.semi_auto = True
-                # if set other strategy at the same time, auto will not apply
-                # strategy.amp = True
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.semi_auto = True
+                >>> # if set other strategy at the same time, auto will not apply
+                >>> # strategy.amp = True
 
-                optimizer = paddle.optimizer.SGD(learning_rate=0.01)
-                optimizer = fleet.distributed_optimizer(optimizer, strategy)
+                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+                >>> optimizer = fleet.distributed_optimizer(optimizer, strategy)
 
         """
         return self.strategy.semi_auto
@@ -2276,12 +2288,12 @@ def auto_search(self):
         Examples:
             .. code-block:: python
 
-                import paddle
+                >>> import paddle
 
-                paddle.enable_static()
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.auto_search = True
+                >>> paddle.enable_static()
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.auto_search = True
 
         """
         return self.strategy.auto_search
@@ -2303,12 +2315,12 @@ def split_data(self):
         Examples:
             .. code-block:: python
 
-                import paddle
+                >>> import paddle
 
-                paddle.enable_static()
-                import paddle.distributed.fleet as fleet
-                strategy = fleet.DistributedStrategy()
-                strategy.split_data = True
+                >>> paddle.enable_static()
+                >>> import paddle.distributed.fleet as fleet
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.split_data = True
 
         """
         return self.strategy.split_data
@@ -2359,15 +2371,16 @@ def qat_configs(self):
         Exampless:
             .. code-block:: python
 
-                import paddle.distributed.fleet as fleet
+                >>> import paddle.distributed.fleet as fleet
 
-                strategy = fleet.DistributedStrategy()
-                strategy.qat = True
-                strategy.qat_configs = {
-                    "channel_wise_abs_max": True,
-                    "weight_bits": 8,
-                    "activation_bits: 8,
-                    "not_quant_pattern": ['skip_quant']}
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.qat = True
+                >>> strategy.qat_configs = {
+                ...     "channel_wise_abs_max": True,
+                ...     "weight_bits": 8,
+                ...     "activation_bits": 8,
+                ...     "not_quant_pattern": ['skip_quant']
+                ... }
 
         """
         return get_msg_dict(self.strategy.qat_configs)
@@ -2389,15 +2402,15 @@ def heter_ccl_mode(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                import paddle.distributed.fleet as fleet
+                >>> import paddle
+                >>> import paddle.distributed.fleet as fleet
 
-                strategy = fleet.DistributedStrategy()
-                strategy.heter_ccl_mode = True
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.heter_ccl_mode = True
 
-                # for initialize parallel env, only need to call
-                paddle.distributed.init_parallel_env()
-                # then the heterogenous context will be created.
+                >>> # for initialize parallel env, only need to call
+                >>> paddle.distributed.init_parallel_env()
+                >>> # then the heterogenous context will be created.
 
         """
         return self.strategy.heter_ccl_mode
@@ -2422,15 +2435,15 @@ def cudnn_exhaustive_search(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-                import paddle.distributed.fleet as fleet
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> import paddle.distributed.fleet as fleet
 
-                strategy = fleet.DistributedStrategy()
-                strategy.cudnn_exhaustive_search = False
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.cudnn_exhaustive_search = False
 
-                optimizer = paddle.optimizer.SGD(learning_rate=0.01)
-                optimizer = fleet.distributed_optimizer(optimizer, strategy)
+                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+                >>> optimizer = fleet.distributed_optimizer(optimizer, strategy)
 
         """
         return self.strategy.cudnn_exhaustive_search
@@ -2458,15 +2471,15 @@ def conv_workspace_size_limit(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-                import paddle.distributed.fleet as fleet
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> import paddle.distributed.fleet as fleet
 
-                strategy = fleet.DistributedStrategy()
-                strategy.conv_workspace_size_limit = 1024
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.conv_workspace_size_limit = 1024
 
-                optimizer = paddle.optimizer.SGD(learning_rate=0.01)
-                optimizer = fleet.distributed_optimizer(optimizer, strategy)
+                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+                >>> optimizer = fleet.distributed_optimizer(optimizer, strategy)
 
         """
         return self.strategy.conv_workspace_size_limit
@@ -2492,15 +2505,15 @@ def cudnn_batchnorm_spatial_persistent(self):
         Examples:
             .. code-block:: python
 
-                import paddle
-                paddle.enable_static()
-                import paddle.distributed.fleet as fleet
+                >>> import paddle
+                >>> paddle.enable_static()
+                >>> import paddle.distributed.fleet as fleet
 
-                strategy = fleet.DistributedStrategy()
-                strategy.cudnn_batchnorm_spatial_persistent = True
+                >>> strategy = fleet.DistributedStrategy()
+                >>> strategy.cudnn_batchnorm_spatial_persistent = True
 
-                optimizer = paddle.optimizer.SGD(learning_rate=0.01)
-                optimizer = fleet.distributed_optimizer(optimizer, strategy)
+                >>> optimizer = paddle.optimizer.SGD(learning_rate=0.01)
+                >>> optimizer = fleet.distributed_optimizer(optimizer, strategy)
 
         """
         return self.strategy.cudnn_batchnorm_spatial_persistent
diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index 7b9cf269dcd26..ddc6c411598c3 100755
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -551,20 +551,20 @@ class PaddleCloudRoleMaker(RoleMakerBase):
     Examples:
         .. code-block:: python
 
-            import os
-            import paddle.distributed.fleet as fleet
+            >>> import os
+            >>> import paddle.distributed.fleet as fleet
 
-            os.environ["PADDLE_PSERVER_NUMS"] = "2"
-            os.environ["PADDLE_TRAINERS_NUM"] = "2"
+            >>> os.environ["PADDLE_PSERVER_NUMS"] = "2"
+            >>> os.environ["PADDLE_TRAINERS_NUM"] = "2"
 
-            os.environ["POD_IP"] = "127.0.0.1"
-            os.environ["PADDLE_PORT"] = "36001"
-            os.environ["TRAINING_ROLE"] = "PSERVER"
-            os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
+            >>> os.environ["POD_IP"] = "127.0.0.1"
+            >>> os.environ["PADDLE_PORT"] = "36001"
+            >>> os.environ["TRAINING_ROLE"] = "PSERVER"
+            >>> os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
 
-            os.environ["PADDLE_TRAINER_ID"] = "0"
+            >>> os.environ["PADDLE_TRAINER_ID"] = "0"
 
-            fleet.PaddleCloudRoleMaker(is_collective=False)
+            >>> fleet.PaddleCloudRoleMaker(is_collective=False)
 
     """
 
@@ -1211,14 +1211,14 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker):
     Examples:
         .. code-block:: python
 
-            import paddle.distributed.fleet as fleet
-            from paddle.distributed.fleet.base.role_maker import Role
+            >>> import paddle.distributed.fleet as fleet
+            >>> from paddle.distributed.fleet.base.role_maker import Role
 
-            fleet.UserDefinedRoleMaker(
-                current_id=0,
-                role=Role.SERVER,
-                worker_num=2,
-                server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
+            >>> fleet.UserDefinedRoleMaker(
+            ...     current_id=0,
+            ...     role=Role.SERVER,
+            ...     worker_num=2,
+            ...     server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"])
     """
 
     def __init__(self, is_collective=False, init_gloo=False, **kwargs):

From d9d00a1005a7ba68f9e14234bc89b764be94156a Mon Sep 17 00:00:00 2001
From: Yiqun Liu <Xreki@users.noreply.github.com>
Date: Mon, 25 Sep 2023 10:56:40 +0800
Subject: [PATCH 093/115] Remove reduntant attn_gemm.h. (#57671)

* Remove reduntant attn_gemm.h.

* Add namespace to the call.
---
 paddle/fluid/operators/fused/attn_gemm.h      | 295 ------------------
 .../operators/fused/fused_gate_attention.h    |   1 +
 .../fused/fused_gate_attention_op.cu          |  34 +-
 .../fused/fused_multi_transformer_op.cu       |  38 +--
 .../fused/fused_multi_transformer_op.cu.h     |   2 +-
 paddle/phi/kernels/fusion/gpu/attn_gemm.h     |  26 +-
 6 files changed, 52 insertions(+), 344 deletions(-)
 delete mode 100644 paddle/fluid/operators/fused/attn_gemm.h

diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h
deleted file mode 100644
index 277e29c4d59ce..0000000000000
--- a/paddle/fluid/operators/fused/attn_gemm.h
+++ /dev/null
@@ -1,295 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h"
-#include "paddle/phi/kernels/funcs/broadcast_function.h"
-#include "paddle/phi/kernels/funcs/elementwise_functor.h"
-#include "paddle/phi/kernels/funcs/fused_gemm_epilogue.h"
-#include "paddle/phi/kernels/primitive/kernel_primitives.h"
-
-namespace paddle {
-namespace operators {
-
-// support gemm-nt and gemm-nn, which is used in fused_attention_op.
-template <typename T>
-class AttnMatMul {
- public:
-  // (m, n, k) = bsz_seq, output_size, input_size
-  AttnMatMul(const phi::GPUContext& dev_ctx,
-             bool transA,
-             bool transB,
-             int bsz_seq,
-             int output_size,
-             int input_size,
-             bool compute_bias)
-      : dev_ctx_(dev_ctx),
-        transA_(transA),
-        transB_(transB),
-        bsz_seq_(bsz_seq),
-        output_size_(output_size),
-        input_size_(input_size),
-        compute_bias_(compute_bias) {}
-
-  void ComputeForward(const phi::DenseTensor* weight,
-                      const phi::DenseTensor* input,
-                      const phi::DenseTensor* bias,
-                      phi::DenseTensor* output,
-                      phi::DenseTensor* bias_out,
-                      bool fused = false) {
-    VLOG(6) << "input.shape={" << input->dims() << "}, weight.shape={"
-            << weight->dims() << "}, output.shape={" << output->dims()
-            << "}, batch_size=" << bsz_seq_ << ", output_size=" << output_size_
-            << ", input_size=" << input_size_ << ", transA=" << transA_
-            << ", transB=" << transB_ << ", compute_bias=" << compute_bias_
-            << ", fused=" << fused;
-
-#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
-    if (compute_bias_ && fused) {
-      PADDLE_ENFORCE_EQ(
-          !output || output == bias_out,
-          true,
-          phi::errors::InvalidArgument(
-              "The output (= input * weight) is expected to be nullptr or the "
-              "same as bias_out when fused is true."));
-
-      phi::funcs::LinearWithCublasLt<T>::Run(
-          dev_ctx_,
-          input,                                      // x
-          weight,                                     // y
-          bias_out,                                   // out
-          static_cast<const void*>(bias->data<T>()),  // bias
-          nullptr,
-          bsz_seq_,      // M
-          output_size_,  // N
-          input_size_,   // K
-          transA_,
-          transB_,
-          phi::funcs::MatmulFusedType::kMatmulBias);
-
-      return;
-    }
-#endif
-
-    // Note: for blas.GEMM API in Paddle, it treats all inputs as row-major.
-    // here: (transa, transb): nt, input * weight.
-    CBLAS_TRANSPOSE transA = transA_ ? CblasTrans : CblasNoTrans;
-    CBLAS_TRANSPOSE transB = transB_ ? CblasTrans : CblasNoTrans;
-    T alpha = static_cast<T>(1.0);
-    T beta = static_cast<T>(0.0);
-
-    // (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out)
-    auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx_);
-    blas.GEMM(transA,
-              transB,
-              bsz_seq_,
-              output_size_,
-              input_size_,
-              alpha,
-              input->data<T>(),
-              weight->data<T>(),
-              beta,
-              output->data<T>());
-    if (compute_bias_) {
-      // bias_out = output + bias
-      std::vector<const phi::DenseTensor*> ins = {output, bias};
-      std::vector<phi::DenseTensor*> outs = {bias_out};
-      phi::funcs::BroadcastKernel<T>(
-          dev_ctx_, ins, &outs, phi::funcs::AddFunctor<T>());
-    }
-  }
-
-  void ComputeBackward(const phi::DenseTensor* input,
-                       const phi::DenseTensor* weight,
-                       const phi::DenseTensor* d_output,
-                       phi::DenseTensor* d_input,
-                       phi::DenseTensor* d_weight,
-                       phi::DenseTensor* d_bias,
-                       bool use_addto = false,
-                       bool fused = false) {
-#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
-    if (compute_bias_ && fused) {
-      phi::funcs::ComputeFusedGemmEpilogueBackward<T>(dev_ctx_,
-                                                      d_output,
-                                                      input,
-                                                      weight,
-                                                      nullptr,
-                                                      bsz_seq_,      // M
-                                                      output_size_,  // N
-                                                      input_size_,   // K
-                                                      transA_,
-                                                      transB_,
-                                                      "none",
-                                                      d_input,
-                                                      d_weight,
-                                                      d_bias,
-                                                      use_addto);
-      return;
-    }
-#endif
-
-    T alpha = static_cast<T>(1.0);
-    T beta_dA = use_addto ? static_cast<T>(1.0) : static_cast<T>(0.0);
-    T beta_dB = static_cast<T>(0.0);
-
-    auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx_);
-    if (!transA_) {
-      // forward: gemm-nt
-      if (transB_) {
-        // backward: gemm-tn, dB = (dC)^T * A
-        if (d_weight) {
-          int dB_m = output_size_;
-          int dB_n = input_size_;
-          int dB_k = bsz_seq_;
-
-          T* dB_output_ptr = d_weight->data<T>();
-          blas.GEMM(CblasTrans,
-                    CblasNoTrans,
-                    dB_m,
-                    dB_n,
-                    dB_k,
-                    alpha,
-                    d_output->data<T>(),
-                    input->data<T>(),
-                    beta_dB,
-                    dB_output_ptr);
-        }
-
-        // backward: gemm-nn, dA = dC * B
-        if (d_input) {
-          int dA_m = bsz_seq_;
-          int dA_n = input_size_;
-          int dA_k = output_size_;
-
-          T* dA_output_ptr = d_input->data<T>();
-          blas.GEMM(CblasNoTrans,
-                    CblasNoTrans,
-                    dA_m,
-                    dA_n,
-                    dA_k,
-                    alpha,
-                    d_output->data<T>(),
-                    weight->data<T>(),
-                    beta_dA,
-                    dA_output_ptr);
-        }
-      } else {  // fw: gemm-nn
-        // backward: gemm-tn, dB = A^T * dC
-        if (d_weight) {
-          int dB_m = input_size_;
-          int dB_n = output_size_;
-          int dB_k = bsz_seq_;
-
-          T* dB_output_ptr = d_weight->data<T>();
-          blas.GEMM(CblasTrans,
-                    CblasNoTrans,
-                    dB_m,
-                    dB_n,
-                    dB_k,
-                    alpha,
-                    input->data<T>(),
-                    d_output->data<T>(),
-                    beta_dB,
-                    dB_output_ptr);
-        }
-
-        // backward: gemm-nt, dA = dC * B^T
-        if (d_input) {
-          int dA_m = bsz_seq_;
-          int dA_n = input_size_;
-          int dA_k = output_size_;
-
-          T* dA_output_ptr = d_input->data<T>();
-          blas.GEMM(CblasNoTrans,
-                    CblasTrans,
-                    dA_m,
-                    dA_n,
-                    dA_k,
-                    alpha,
-                    d_output->data<T>(),
-                    weight->data<T>(),
-                    beta_dA,
-                    dA_output_ptr);
-        }
-      }
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "AttnMatMul wrapper do not support (transA=T, transB=T/N)"
-          "parameters."));
-    }
-    if (compute_bias_ && d_bias) {
-      // reduce: {0, 1, 2, 3, 4} -> {2, 3, 4} or {0, 1, 2} -> {2} or {0,1,2,3}
-      // -> {3} or {0,1,2,3,4} -> {3,4}
-      const auto input_dims = d_output->dims();
-      const auto output_dims = d_bias->dims();
-      bool support_case_1 =
-          (input_dims.size() == 5 && output_dims.size() == 3 &&
-           (input_dims[2] == output_dims[0]) &&
-           (input_dims[3] == output_dims[1]) &&
-           (input_dims[4] == output_dims[2]));
-      bool support_case_2 =
-          (input_dims.size() == 3 && output_dims.size() == 1 &&
-           (input_dims[2] == output_dims[0]));
-      bool support_case_3 =
-          (input_dims.size() == 4 && output_dims.size() == 1 &&
-           input_dims[3] == output_dims[0]);
-      bool support_case_4 =
-          (input_dims.size() == 5 && output_dims.size() == 2 &&
-           input_dims[3] == output_dims[0] && input_dims[4] == output_dims[1]);
-
-      gpuStream_t stream = dev_ctx_.stream();
-      if (support_case_1 || support_case_2) {
-        TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-            dev_ctx_,
-            *d_output,
-            d_bias,
-            kps::IdentityFunctor<T>(),
-            {0, 1},
-            stream);
-      } else if (support_case_3 || support_case_4) {
-        TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-            dev_ctx_,
-            *d_output,
-            d_bias,
-            kps::IdentityFunctor<T>(),
-            {0, 1, 2},
-            stream);
-      } else {
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Only support reduce when the input dims are [0,1,2,3,4] and "
-            "output is [2,3,4]"
-            "or input is [0,1,2] and output is [2]."));
-      }
-    }
-  }
-
- private:
-  const phi::GPUContext& dev_ctx_;
-
-  bool transA_;
-  bool transB_;
-
-  int bsz_seq_;
-  int output_size_;
-  int input_size_;
-
-  int compute_bias_;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h
index 5cbc4788a0c68..89f17f24b74a1 100644
--- a/paddle/fluid/operators/fused/fused_gate_attention.h
+++ b/paddle/fluid/operators/fused/fused_gate_attention.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/funcs/transpose_function.cu.h"
+#include "paddle/phi/kernels/fusion/gpu/attn_gemm.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu
index e2cdb513feada..9caca507c08bb 100644
--- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/fused/attn_gemm.h"
 #include "paddle/fluid/operators/fused/fused_gate_attention.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/fusion/gpu/attn_gemm.h"
 
 namespace paddle {
 namespace operators {
@@ -73,8 +73,8 @@ void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = 3 * config.num_heads * config.head_dim;
   int k = config.q_dim;
-  auto qkv_compute =
-      AttnMatMul<T>(ctx.cuda_device_context(), false, true, m, n, k, false);
+  auto qkv_compute = phi::fusion::AttnMatMul<T>(
+      ctx.cuda_device_context(), false, true, m, n, k, false);
   qkv_compute.ComputeForward(qkv_weight, query, nullptr, qkv_out, nullptr);
 }
 
@@ -95,8 +95,8 @@ void ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = 3 * config.num_heads * config.head_dim;
   int k = config.q_dim;
-  auto qkv_compute =
-      AttnMatMul<T>(ctx.cuda_device_context(), false, true, m, n, k, false);
+  auto qkv_compute = phi::fusion::AttnMatMul<T>(
+      ctx.cuda_device_context(), false, true, m, n, k, false);
   qkv_compute.ComputeBackward(query,
                               qkv_weight,
                               qkv_out_grad,
@@ -125,7 +125,7 @@ void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx,
   int q_m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int q_n = config.num_heads * config.head_dim;
   int q_k = config.q_dim;
-  auto q_compute = AttnMatMul<T>(
+  auto q_compute = phi::fusion::AttnMatMul<T>(
       ctx.cuda_device_context(), false, false, q_m, q_n, q_k, false);
   q_compute.ComputeForward(query_weight, query, nullptr, query_out, nullptr);
 
@@ -136,7 +136,7 @@ void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx,
   int kv_m = config.batch_size * config.seq_len_m * config.m_size;
   int kv_n = config.num_heads * config.head_dim;
   int kv_k = config.kv_dim;
-  auto kv_compute = AttnMatMul<T>(
+  auto kv_compute = phi::fusion::AttnMatMul<T>(
       ctx.cuda_device_context(), false, false, kv_m, kv_n, kv_k, false);
   kv_compute.ComputeForward(key_weight, key, nullptr, key_out, nullptr);
 
@@ -165,7 +165,7 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx,
   int kv_m = config.batch_size * config.seq_len_m * config.m_size;
   int kv_n = config.num_heads * config.head_dim;
   int kv_k = config.kv_dim;
-  auto kv_compute = AttnMatMul<T>(
+  auto kv_compute = phi::fusion::AttnMatMul<T>(
       ctx.cuda_device_context(), false, false, kv_m, kv_n, kv_k, false);
   kv_compute.ComputeBackward(
       key, key_weight, key_out_grad, key_grad, key_weight_grad, nullptr, false);
@@ -193,7 +193,7 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx,
   int q_m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int q_n = config.num_heads * config.head_dim;
   int q_k = config.q_dim;
-  auto q_compute = AttnMatMul<T>(
+  auto q_compute = phi::fusion::AttnMatMul<T>(
       ctx.cuda_device_context(), false, false, q_m, q_n, q_k, false);
   q_compute.ComputeBackward(query,
                             query_weight,
@@ -221,8 +221,8 @@ void ComputeGatingLinearForward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.num_heads * config.head_dim;
   int k = config.q_dim;
-  auto gate_linear =
-      AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
+  auto gate_linear = phi::fusion::AttnMatMul<T>(
+      ctx.cuda_device_context(), false, false, m, n, k, true);
   gate_linear.ComputeForward(gate_weight,
                              query,
                              gate_bias,
@@ -258,8 +258,8 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.num_heads * config.head_dim;
   int k = config.q_dim;
-  auto gate_linear =
-      AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
+  auto gate_linear = phi::fusion::AttnMatMul<T>(
+      ctx.cuda_device_context(), false, false, m, n, k, true);
   gate_linear.ComputeForward(gate_weight,
                              query,
                              gate_bias,
@@ -307,8 +307,8 @@ void ComputeOutputLinearForward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.q_dim;
   int k = config.num_heads * config.head_dim;
-  auto out_linear =
-      AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
+  auto out_linear = phi::fusion::AttnMatMul<T>(
+      ctx.cuda_device_context(), false, false, m, n, k, true);
   out_linear.ComputeForward(out_linear_weight,
                             fmha_or_gate_out,
                             out_linear_bias,
@@ -342,8 +342,8 @@ void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx,
   int m = config.batch_size * config.seq_len_m * config.seq_len_r;
   int n = config.q_dim;
   int k = config.num_heads * config.head_dim;
-  auto out_linear =
-      AttnMatMul<T>(ctx.cuda_device_context(), false, false, m, n, k, true);
+  auto out_linear = phi::fusion::AttnMatMul<T>(
+      ctx.cuda_device_context(), false, false, m, n, k, true);
   out_linear.ComputeBackward(input,
                              out_linear_weight,
                              out_grad,
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
index aa7c4cb4fd9f8..e3158d74df629 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu
@@ -109,13 +109,13 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     // (transA, transB, compute_bias) = (false, trans_qkvw, false)
     // Since we fused QKVBias into QKVBiasAddTransposeSplit kernel, here we set
     // compute_bias as false.
-    auto qkv_compute = AttnMatMul<T>(dev_ctx,
-                                     false,
-                                     trans_qkvw,
-                                     token_num,
-                                     output_size,
-                                     input_size,
-                                     /*compute_bias=*/false);
+    auto qkv_compute = phi::fusion::AttnMatMul<T>(dev_ctx,
+                                                  false,
+                                                  trans_qkvw,
+                                                  token_num,
+                                                  output_size,
+                                                  input_size,
+                                                  /*compute_bias=*/false);
 
     phi::DenseTensor qkv_out;
     qkv_out.Resize({{token_num, 3, num_head, dim_head}});
@@ -219,7 +219,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     auto out_linear_biases = ctx.MultiInput<phi::DenseTensor>("OutLinearBias");
     int ring_id = ctx.Attr<int>("ring_id");
     // (transA, transB, compute_bias) = (false, false, false)
-    auto out_linear_compute = AttnMatMul<T>(
+    auto out_linear_compute = phi::fusion::AttnMatMul<T>(
         dev_ctx, false, false, token_num, dim_embed, hidden_size, false);
 
     // 5. ln(residual + bias)
@@ -260,7 +260,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     auto ffn2_weights = ctx.MultiInput<phi::DenseTensor>("FFN2Weight");
     auto ffn2_biases = ctx.MultiInput<phi::DenseTensor>("FFN2Bias");
 
-    auto ffn2_linear_compute = AttnMatMul<T>(
+    auto ffn2_linear_compute = phi::fusion::AttnMatMul<T>(
         dev_ctx, false, false, token_num, dim_embed, dim_ffn, false);
 
     // 8. ffn2 Layernorm residual bias
@@ -775,13 +775,13 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     // (transA, transB, compute_bias) = (false, trans_qkvw, false)
     // Since we fused QKVBias into QKVBiasAddTransposeSplit kernel, here we
     // set compute_bias as false.
-    auto qkv_compute = AttnMatMul<T>(dev_ctx,
-                                     false,
-                                     trans_qkvw,
-                                     token_num,
-                                     output_size,
-                                     input_size,
-                                     /*compute_bias=*/false);
+    auto qkv_compute = phi::fusion::AttnMatMul<T>(dev_ctx,
+                                                  false,
+                                                  trans_qkvw,
+                                                  token_num,
+                                                  output_size,
+                                                  input_size,
+                                                  /*compute_bias=*/false);
 
     phi::DenseTensor qkv_out;
     qkv_out.Resize({{token_num, 3, num_head, dim_head}});
@@ -885,7 +885,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     auto out_linear_biases = ctx.MultiInput<phi::DenseTensor>("OutLinearBias");
     int ring_id = ctx.Attr<int>("ring_id");
     // (transA, transB, compute_bias) = (false, false, false)
-    auto out_linear_compute = AttnMatMul<T>(
+    auto out_linear_compute = phi::fusion::AttnMatMul<T>(
         dev_ctx, false, false, token_num, dim_embed, hidden_size, false);
 
     // 5. ln(residual + bias)
@@ -912,7 +912,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     auto ffn1_weight_dim = ffn1_weights[0]->dims();
 
     int dim_ffn = ffn1_weight_dim[1];
-    auto ffn1_linear_compute = AttnMatMul<T>(
+    auto ffn1_linear_compute = phi::fusion::AttnMatMul<T>(
         dev_ctx, false, false, token_num, dim_ffn, dim_embed, false);
     phi::DenseTensor ffn1_out;
     ffn1_out.Resize({{token_num, dim_ffn}});
@@ -934,7 +934,7 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel<T> {
     // 8. ffn2 matmul
     auto ffn2_weights = ctx.MultiInput<phi::DenseTensor>("FFN2Weight");
     auto ffn2_biases = ctx.MultiInput<phi::DenseTensor>("FFN2Bias");
-    auto ffn2_linear_compute = AttnMatMul<T>(
+    auto ffn2_linear_compute = phi::fusion::AttnMatMul<T>(
         dev_ctx, false, false, token_num, dim_embed, dim_ffn, false);
 
     // 9. ffn2 residual bias
diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
index a81a38ed3877f..ba12bdc8b9d7f 100644
--- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
+++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h
@@ -28,7 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/fused/attention_layer_norm.h"
-#include "paddle/fluid/operators/fused/attn_gemm.h"
 #include "paddle/fluid/operators/fused/fmha_ref.h"
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
@@ -39,6 +38,7 @@ limitations under the License. */
 #include "paddle/phi/core/flags.h"
 #include "paddle/phi/kernels/funcs/fused_gemm_epilogue.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/fusion/gpu/attn_gemm.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/distributed/collective/process_group.h"
diff --git a/paddle/phi/kernels/fusion/gpu/attn_gemm.h b/paddle/phi/kernels/fusion/gpu/attn_gemm.h
index a96601dddacac..8b83ddab93b9b 100644
--- a/paddle/phi/kernels/fusion/gpu/attn_gemm.h
+++ b/paddle/phi/kernels/fusion/gpu/attn_gemm.h
@@ -73,18 +73,20 @@ class AttnMatMul {
           phi::errors::InvalidArgument(
               "The output (= input * weight) is expected to be nullptr or the "
               "same as bias_out when fused is true."));
-      phi::funcs::ComputeFusedGemmEpilogueForward<T>(dev_ctx_,
-                                                     input,
-                                                     weight,
-                                                     bias,
-                                                     bsz_seq_,      // M
-                                                     output_size_,  // N
-                                                     input_size_,   // K
-                                                     transA_,
-                                                     transB_,
-                                                     "none",
-                                                     bias_out,
-                                                     nullptr);
+
+      phi::funcs::LinearWithCublasLt<T>::Run(
+          dev_ctx_,
+          input,                                      // x
+          weight,                                     // y
+          bias_out,                                   // out
+          static_cast<const void*>(bias->data<T>()),  // bias
+          nullptr,
+          bsz_seq_,      // M
+          output_size_,  // N
+          input_size_,   // K
+          transA_,
+          transB_,
+          phi::funcs::MatmulFusedType::kMatmulBias);
       return;
     }
 #endif

From 1fcefb10a07111ce0b582137462636dfc24f9854 Mon Sep 17 00:00:00 2001
From: yinwei <1871465933@qq.com>
Date: Mon, 25 Sep 2023 10:57:49 +0800
Subject: [PATCH 094/115] fix solve_grad_kernel_impl SumKernel Parameters
 (#57672)

---
 paddle/phi/kernels/impl/solve_grad_kernel_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/phi/kernels/impl/solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
index 9c5394e002201..7386e8beb22cb 100644
--- a/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h
@@ -65,7 +65,7 @@ struct ReduceSumForSolvelGrad<GPUContext, T> {
                   const std::vector<int>& reduce_dims,
                   bool keep_dims) {
     phi::SumKernel<T, GPUContext>(
-        dev_ctx, input, reduce_dims, input.dtype(), false, output);
+        dev_ctx, input, reduce_dims, output->dtype(), keep_dims, output);
   }
 };
 #endif

From 6a25e9dafbb6ddd78c378cb8f6a4b2c2cd5335de Mon Sep 17 00:00:00 2001
From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com>
Date: Mon, 25 Sep 2023 11:01:26 +0800
Subject: [PATCH 095/115] [Prim][PIR]Fix addn prim (#57675)

* support prim backward

* fix jit support pir prim

* move new_ir to pir

* add test case

* fix prim pir addn
---
 .../rule/vjp/generated/generated_vjp.cc.j2    |   4 +-
 python/paddle/decomposition/decomp.py         |  33 ++++-
 python/paddle/decomposition/rules.py          |   8 ++
 .../jit/dy2static/newir_partial_program.py    |   8 +-
 .../jit/dy2static/program_translator.py       |  26 ++--
 test/legacy_test/test_sum_op.py               |   3 +-
 test/prim/CMakeLists.txt                      |   2 +-
 .../{new_ir_prim => pir_prim}/CMakeLists.txt  |   5 +-
 .../test_custom_vjp_trait.py                  |   0
 .../test_decomp_op.py                         |   0
 test/prim/pir_prim/test_pir_prim_flags.py     | 128 ++++++++++++++++++
 .../test_prim_custom_vjp.py                   |   0
 .../test_prim_jit.py                          |   0
 .../test_prim_program.py                      |   0
 .../test_prim_simpnet.py                      |   0
 .../test_vjp_prim.py                          |   0
 16 files changed, 191 insertions(+), 26 deletions(-)
 rename test/prim/{new_ir_prim => pir_prim}/CMakeLists.txt (82%)
 rename test/prim/{new_ir_prim => pir_prim}/test_custom_vjp_trait.py (100%)
 rename test/prim/{new_ir_prim => pir_prim}/test_decomp_op.py (100%)
 create mode 100644 test/prim/pir_prim/test_pir_prim_flags.py
 rename test/prim/{new_ir_prim => pir_prim}/test_prim_custom_vjp.py (100%)
 rename test/prim/{new_ir_prim => pir_prim}/test_prim_jit.py (100%)
 rename test/prim/{new_ir_prim => pir_prim}/test_prim_program.py (100%)
 rename test/prim/{new_ir_prim => pir_prim}/test_prim_simpnet.py (100%)
 rename test/prim/{new_ir_prim => pir_prim}/test_vjp_prim.py (100%)

diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
index 6737a73d69eb5..18da2b7f05dc1 100644
--- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
+++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2
@@ -27,7 +27,9 @@ for (auto arg: stop_gradients) {
   vjp_res.push_back(std::vector<paddle::Tensor>(arg.size()));
 }
   {% if 'composite' in api and api.name in vjp_comp_white_list %}
-if (paddle::prim::StaticCompositeContext::Instance().IsBwdPrimEnabled()) {
+std::string op_name = "{{api.name}}";
+auto need_skip = paddle::prim::StaticCompositeContext::Instance().CheckSkipCompOps(op_name);
+if (paddle::prim::StaticCompositeContext::Instance().IsBwdPrimEnabled() && !need_skip) {
 {% filter indent(2, True) %}{{body_prim(api)}}{% endfilter %}
 } else {
 {% filter indent(2, True) %}{{body_unprim(api)}}{% endfilter %}
diff --git a/python/paddle/decomposition/decomp.py b/python/paddle/decomposition/decomp.py
index 57e641e9c36a8..bfb9b6e9ba2c6 100644
--- a/python/paddle/decomposition/decomp.py
+++ b/python/paddle/decomposition/decomp.py
@@ -38,6 +38,10 @@ def _prepare_python_api_arguments(op):
     op (Operator): The target operator.
     """
     op_inputs = [x.source() for x in op.operands()]
+    # The inputs of PIR op builtin.combine will be restored as list of tensor.
+    if op.name() in ["builtin.combine"]:
+        return (op_inputs,)
+
     op_attrs_dict = op.attrs()
     op_attrs_name = op.get_attr_names()
     op_attrs = [op_attrs_dict[x] for x in op_attrs_name]
@@ -198,15 +202,28 @@ def _decompose_subgraph(block, orig_vars, dst_vars, op_filter):
 
     if isinstance(block, Block):
         ops_list = block.ops
-        for op in ops_list:
+        temp_op = None
+        temp_inputs = None
+        for idx, op in enumerate(ops_list):
             op_name = op.name()
             decom_rule = register.get_decomp_rule(op_name)
             lower = decom_rule and op_filter(op)
 
+            if op.name() == "builtin.combine":
+                temp_op = op
+                temp_inputs = _prepare_python_api_arguments(op)
+
             if lower:
                 core.prim_config["composite_ops_record"].add(op_name)
-                input_args = _prepare_python_api_arguments(op)
-                pir.set_insertion_point(op)
+                if (
+                    temp_op is not None
+                    and ops_list[idx - 1].name() == "builtin.combine"
+                ):
+                    input_args = temp_inputs
+                    pir.set_insertion_point(temp_op)
+                else:
+                    input_args = _prepare_python_api_arguments(op)
+                    pir.set_insertion_point(op)
                 orig_outs = op.results()
                 new_outs = _build_tensor_tuple(decom_rule(*input_args))
 
@@ -217,6 +234,16 @@ def _decompose_subgraph(block, orig_vars, dst_vars, op_filter):
 
                 op.replace_all_uses_with(new_outs)
                 block.remove_op(op)
+
+                if temp_op is not None:
+                    remove_op = True
+                    for item in temp_op.results():
+                        if item.has_one_use():
+                            remove_op = False
+                            break
+                    if remove_op:
+                        block.remove_op(temp_op)
+                    temp_op = None
         return
 
     elif isinstance(block, typing.Sequence):
diff --git a/python/paddle/decomposition/rules.py b/python/paddle/decomposition/rules.py
index 173e9fe87f23f..544600f4b49ff 100644
--- a/python/paddle/decomposition/rules.py
+++ b/python/paddle/decomposition/rules.py
@@ -143,3 +143,11 @@ def layernorm_composite(x, scale, bias, epsilon, begin_norm_axis):
     if is_amp:
         out = cast(out, dtype)
     return out, mean_, variance
+
+
+@register_decomp('pd_op.add_n')
+def sum_composite(x):
+    ans = x[0]
+    for xi in x[1:]:
+        ans = xi + ans
+    return ans
diff --git a/python/paddle/jit/dy2static/newir_partial_program.py b/python/paddle/jit/dy2static/newir_partial_program.py
index ba73811e68ee9..5bd7216b54b62 100644
--- a/python/paddle/jit/dy2static/newir_partial_program.py
+++ b/python/paddle/jit/dy2static/newir_partial_program.py
@@ -958,14 +958,8 @@ def create_out(var_id):
             else:
                 tensor_type = paddle.dtype(8)  # SELECT ROW TENSOR
 
-            # TODO(xiongkun): more elegent way to do it.
-
-            ir_dtype_2_tensor_dtype = {
-                10: paddle.dtype(5),
-            }
-
             out = core.eager.Tensor(
-                ir_dtype_2_tensor_dtype[int(var.dtype)],
+                framework.paddle_type_to_proto_type[var.dtype],
                 var.shape,
                 "",
                 tensor_type,
diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py
index 5f8756ce7f150..42d0049f8a368 100644
--- a/python/paddle/jit/dy2static/program_translator.py
+++ b/python/paddle/jit/dy2static/program_translator.py
@@ -1495,26 +1495,30 @@ def before_append_backward(self, forward_program, src_vars):
                 dst_vars = decomposition.decompose(
                     forward_program, src_vars, blacklist=self.custom_vjps
                 )
-            return forward_program, dst_vars
+                return forward_program, dst_vars
+            return forward_program, src_vars
 
     def after_append_backward(self, whole_program, src_vars, forward_end_idx):
         with backend_guard(self.backend):
-            backward_length = (
-                len(whole_program.global_block().ops) - forward_end_idx
-            )
             if core._is_fwd_prim_enabled() and len(self.custom_vjps) != 0:
-                # only process backward part of block
-                dst_vars = decomposition.decompose(whole_program, src_vars)
-            new_start_index = (
-                len(whole_program.global_block().ops) - backward_length
-            )
-            return whole_program, new_start_index, dst_vars
+                backward_length = (
+                    len(whole_program.global_block().ops) - forward_end_idx
+                )
+                dst_vars = decomposition.decompose(
+                    whole_program, src_vars, whitelist=self.custom_vjps
+                )
+                new_start_index = (
+                    len(whole_program.global_block().ops) - backward_length
+                )
+                return whole_program, new_start_index, dst_vars
+            return whole_program, forward_end_idx, src_vars
 
     def after_infer(self, infer_program, src_vars):
         with backend_guard(self.backend):
             if core._is_fwd_prim_enabled():
                 dst_vars = decomposition.decompose(infer_program, src_vars)
-            return infer_program, dst_vars
+                return infer_program, dst_vars
+            return infer_program, src_vars
 
 
 class ProgramCache:
diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py
index c154625fb51f4..40b78711bf789 100644
--- a/test/legacy_test/test_sum_op.py
+++ b/test/legacy_test/test_sum_op.py
@@ -62,6 +62,7 @@ def test_check_output(self):
             check_prim=True,
             check_cinn=True,
             check_new_ir=True,
+            check_prim_pir=True,
         )
 
     def test_check_grad(self):
@@ -70,8 +71,8 @@ def test_check_grad(self):
             'Out',
             check_prim=True,
             check_cinn=True,
-            check_prim_pir=True,
             check_new_ir=True,
+            check_prim_pir=True,
         )
 
 
diff --git a/test/prim/CMakeLists.txt b/test/prim/CMakeLists.txt
index 867a7552763bb..05c3fcf2de154 100644
--- a/test/prim/CMakeLists.txt
+++ b/test/prim/CMakeLists.txt
@@ -12,4 +12,4 @@ add_subdirectory(prim)
 add_subdirectory(model)
 add_subdirectory(composite_ops)
 add_subdirectory(process)
-add_subdirectory(new_ir_prim)
+add_subdirectory(pir_prim)
diff --git a/test/prim/new_ir_prim/CMakeLists.txt b/test/prim/pir_prim/CMakeLists.txt
similarity index 82%
rename from test/prim/new_ir_prim/CMakeLists.txt
rename to test/prim/pir_prim/CMakeLists.txt
index e1cbcd60f8ee4..c31e7254ff60c 100644
--- a/test/prim/new_ir_prim/CMakeLists.txt
+++ b/test/prim/pir_prim/CMakeLists.txt
@@ -1,5 +1,6 @@
-set(TEST_PRIM_PURE_NEW_IR_CASES test_prim_program test_prim_simpnet
-                                test_prim_custom_vjp test_prim_jit)
+set(TEST_PRIM_PURE_NEW_IR_CASES
+    test_prim_program test_prim_simpnet test_prim_custom_vjp test_prim_jit
+    test_pir_prim_flags)
 
 foreach(target ${TEST_PRIM_PURE_NEW_IR_CASES})
   py_test_modules(${target} MODULES ${target} ENVS GLOG_v=1
diff --git a/test/prim/new_ir_prim/test_custom_vjp_trait.py b/test/prim/pir_prim/test_custom_vjp_trait.py
similarity index 100%
rename from test/prim/new_ir_prim/test_custom_vjp_trait.py
rename to test/prim/pir_prim/test_custom_vjp_trait.py
diff --git a/test/prim/new_ir_prim/test_decomp_op.py b/test/prim/pir_prim/test_decomp_op.py
similarity index 100%
rename from test/prim/new_ir_prim/test_decomp_op.py
rename to test/prim/pir_prim/test_decomp_op.py
diff --git a/test/prim/pir_prim/test_pir_prim_flags.py b/test/prim/pir_prim/test_pir_prim_flags.py
new file mode 100644
index 0000000000000..4bee4da74a4d1
--- /dev/null
+++ b/test/prim/pir_prim/test_pir_prim_flags.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.nn.functional as F
+from paddle.base import core
+from paddle.decomposition import decompose
+
+
+class TestPrimBlacklistFlags(unittest.TestCase):
+    def not_in_blacklist(self):
+        inputs = np.random.random([2, 3, 4]).astype("float32")
+        paddle.enable_static()
+        core._set_prim_forward_enabled(True)
+        startup_program = paddle.static.Program()
+        main_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program, startup_program):
+            x = paddle.static.data(
+                'x', shape=inputs.shape, dtype=str(inputs.dtype)
+            )
+            y = F.gelu(x)
+
+            fwd_ops = [op.name() for op in main_program.global_block().ops]
+            # Ensure that tanh in original block
+            self.assertTrue('pd_op.gelu' in fwd_ops)
+
+            [y] = decompose(main_program, [y])
+
+            fwd_ops_new = [op.name() for op in main_program.global_block().ops]
+            # Ensure that tanh is splitted into small ops
+            self.assertTrue('pd_op.gelu' not in fwd_ops_new)
+
+        exe = paddle.static.Executor()
+        exe.run(startup_program)
+        _ = exe.run(main_program, feed={'x': inputs}, fetch_list=[y])
+        paddle.disable_static()
+        core._set_prim_forward_enabled(False)
+
+    def in_blacklist(self):
+        inputs = np.random.random([2, 3, 4]).astype("float32")
+        paddle.enable_static()
+        core._set_prim_forward_enabled(True)
+        startup_program = paddle.static.Program()
+        main_program = paddle.static.Program()
+        with paddle.static.program_guard(main_program, startup_program):
+            x = paddle.static.data(
+                'x', shape=inputs.shape, dtype=str(inputs.dtype)
+            )
+            y = F.gelu(x)
+
+            fwd_ops = [op.name() for op in main_program.global_block().ops]
+            # Ensure that tanh in original block
+            self.assertTrue('pd_op.gelu' in fwd_ops)
+
+            _ = decompose(main_program, [y])
+
+            fwd_ops_new = [op.name() for op in main_program.global_block().ops]
+            # Ensure that tanh is splitted into small ops
+            self.assertTrue('pd_op.gelu' in fwd_ops_new)
+
+        exe = paddle.static.Executor()
+        exe.run(startup_program)
+        _ = exe.run(main_program, feed={'x': inputs}, fetch_list=[y])
+        paddle.disable_static()
+        core._set_prim_forward_enabled(False)
+
+    def test_prim_forward_blacklist(self):
+        self.not_in_blacklist()
+        core._set_prim_forward_blacklist("pd_op.gelu")
+        self.in_blacklist()
+
+
+class PrimeNet(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        x1 = paddle.tanh(x)
+        x2 = paddle.exp(x)
+        x3 = x1 + x2
+        res = paddle.nn.functional.gelu(x3)
+        return res
+
+
+class TestPrimBackwardBlacklistFlags(unittest.TestCase):
+    def train(self):
+        x = paddle.randn([2, 4])
+        x.stop_gradient = False
+        net = PrimeNet()
+        net = paddle.jit.to_static(net)
+        out = net(x)
+        loss = paddle.mean(out)
+        loss.backward()
+        self.check_prim(net)
+
+    def check_prim(self, net):
+        block = net.forward.program_cache.last()[-1][
+            -1
+        ].train_program.global_block()
+        ops = [op.name() for op in block.ops]
+        self.assertTrue('pd_op.tanh_grad' in ops)
+        self.assertTrue('pd_op.exp_grad' in ops)
+        self.assertTrue('pd_op.gelu_grad' not in ops)
+
+    def test_prim_backward_blacklist(self):
+        core._set_prim_all_enabled(True)
+        core._set_prim_backward_blacklist("tanh_grad", "exp_grad")
+        self.train()
+        core._set_prim_all_enabled(False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/prim/new_ir_prim/test_prim_custom_vjp.py b/test/prim/pir_prim/test_prim_custom_vjp.py
similarity index 100%
rename from test/prim/new_ir_prim/test_prim_custom_vjp.py
rename to test/prim/pir_prim/test_prim_custom_vjp.py
diff --git a/test/prim/new_ir_prim/test_prim_jit.py b/test/prim/pir_prim/test_prim_jit.py
similarity index 100%
rename from test/prim/new_ir_prim/test_prim_jit.py
rename to test/prim/pir_prim/test_prim_jit.py
diff --git a/test/prim/new_ir_prim/test_prim_program.py b/test/prim/pir_prim/test_prim_program.py
similarity index 100%
rename from test/prim/new_ir_prim/test_prim_program.py
rename to test/prim/pir_prim/test_prim_program.py
diff --git a/test/prim/new_ir_prim/test_prim_simpnet.py b/test/prim/pir_prim/test_prim_simpnet.py
similarity index 100%
rename from test/prim/new_ir_prim/test_prim_simpnet.py
rename to test/prim/pir_prim/test_prim_simpnet.py
diff --git a/test/prim/new_ir_prim/test_vjp_prim.py b/test/prim/pir_prim/test_vjp_prim.py
similarity index 100%
rename from test/prim/new_ir_prim/test_vjp_prim.py
rename to test/prim/pir_prim/test_vjp_prim.py

From 9a70032ab64e9081c2facbc14ef357302af7d3da Mon Sep 17 00:00:00 2001
From: Ruibin Cheung <beinggod@foxmail.com>
Date: Mon, 25 Sep 2023 11:37:37 +0800
Subject: [PATCH 096/115] [PIR] Migrate paddle.bitwise_and into pir (#57677)

---
 python/paddle/tensor/logic.py       | 4 ++--
 test/legacy_test/test_bitwise_op.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index 7d20accc5a43c..e0d051ea509f5 100755
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -1155,7 +1155,7 @@ def bitwise_and(x, y, out=None, name=None):
             Tensor(shape=[3], dtype=int64, place=Place(cpu), stop_gradient=True,
             [0, 2, 1])
     """
-    if in_dynamic_mode() and out is None:
+    if in_dynamic_or_pir_mode() and out is None:
         return _C_ops.bitwise_and(x, y)
     return _bitwise_op(
         op_name="bitwise_and", x=x, y=y, name=name, out=out, binary_op=True
@@ -1175,7 +1175,7 @@ def bitwise_and_(x, y, name=None):
                 out_shape, x.shape
             )
         )
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.bitwise_and_(x, y)
 
 
diff --git a/test/legacy_test/test_bitwise_op.py b/test/legacy_test/test_bitwise_op.py
index 7f60c00901f92..a5040b434b260 100644
--- a/test/legacy_test/test_bitwise_op.py
+++ b/test/legacy_test/test_bitwise_op.py
@@ -43,7 +43,7 @@ def setUp(self):
         self.outputs = {'Out': out}
 
     def test_check_output(self):
-        self.check_output(check_cinn=True)
+        self.check_output(check_cinn=True, check_new_ir=True)
 
     def test_check_grad(self):
         pass

From 50bb3e17ca902a951da846bb5714cfcf1f31d209 Mon Sep 17 00:00:00 2001
From: ming1753 <61511741+ming1753@users.noreply.github.com>
Date: Mon, 25 Sep 2023 11:40:21 +0800
Subject: [PATCH 097/115] fix RunWithExternalStream contex switch bug (#57629)

* fix RunWithExternalStream contex switch bug
---
 .../fluid/inference/api/analysis_predictor.cc |  3 +
 paddle/phi/api/include/context_pool.h         |  2 +
 paddle/phi/api/lib/context_pool.cc            | 12 ++++
 .../api/analysis_predictor_tester.cc          | 55 +++++++++++++++++++
 4 files changed, 72 insertions(+)

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 70da22a3240e9..f30e2c560b57f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -56,6 +56,7 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/api/include/context_pool.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/common/backend.h"
 #include "paddle/phi/common/data_type.h"
@@ -2219,6 +2220,8 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
           UpdatePrivateDeviceContext(gpu_context, gpu_resource, place_);
           return std::unique_ptr<phi::DeviceContext>(gpu_context);
         }));
+    auto &pool = paddle::experimental::DeviceContextPool::Instance();
+    pool.SyncDeviceContext(place_);
   }
 
   return ZeroCopyRun();
diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h
index 7afe17ba8419d..6b6fe290d6d28 100644
--- a/paddle/phi/api/include/context_pool.h
+++ b/paddle/phi/api/include/context_pool.h
@@ -71,6 +71,8 @@ class PADDLE_API DeviceContextPool {
 
   phi::DeviceContext* GetMutable(const Place& place);
 
+  void SyncDeviceContext(const Place& place);
+
   template <AllocationType T>
   const typename DefaultDeviceContextType<T>::TYPE* Get(const Place& place) {
     return reinterpret_cast<const typename DefaultDeviceContextType<T>::TYPE*>(
diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc
index 292bd8a7e47aa..8066147025117 100644
--- a/paddle/phi/api/lib/context_pool.cc
+++ b/paddle/phi/api/lib/context_pool.cc
@@ -26,6 +26,18 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
+void DeviceContextPool::SyncDeviceContext(const Place& place) {
+  if (!phi::DeviceContextPool::IsInitialized()) {
+    phi::memory_utils::InitDevices();
+  }
+  // only when we need the specific DeviceContext, get and cache it
+  auto* dev_ctx = phi::DeviceContextPool::Instance().Get(place);
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    context_map_[place] = dev_ctx;
+  }
+}
+
 DeviceContextPool& DeviceContextPool::Instance() {
   static DeviceContextPool g_device_context_pool;
   return g_device_context_pool;
diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc
index 35c07c3a83790..f32d509d62d8b 100644
--- a/test/cpp/inference/api/analysis_predictor_tester.cc
+++ b/test/cpp/inference/api/analysis_predictor_tester.cc
@@ -663,6 +663,61 @@ TEST(Predictor, Streams) {
     CHECK_NE(stream, stream2);
   }
 }
+
+TEST(Tensor, RunWithExternalStream) {
+  Config config;
+  config.SetModel(FLAGS_dirname);
+  config.EnableUseGpu(100, 0);
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  config.SetExecStream(stream);
+  auto predictor = CreatePredictor(config);
+
+  auto w0 = predictor->GetInputHandle("firstw");
+  auto w1 = predictor->GetInputHandle("secondw");
+  auto w2 = predictor->GetInputHandle("thirdw");
+  auto w3 = predictor->GetInputHandle("forthw");
+
+  std::vector<std::vector<int64_t>> input_data(4, {0, 1, 2, 3});
+  std::vector<int64_t*> input_gpu(4, nullptr);
+
+  for (size_t i = 0; i < 4; ++i) {
+    cudaMalloc(reinterpret_cast<void**>(&input_gpu[i]), 4 * sizeof(int64_t));
+    cudaMemcpy(input_gpu[i],
+               input_data[i].data(),
+               4 * sizeof(int64_t),
+               cudaMemcpyHostToDevice);
+  }
+
+  w0->ShareExternalData<int64_t>(input_gpu[0], {4, 1}, PlaceType::kGPU);
+  w1->ShareExternalData<int64_t>(input_gpu[1], {4, 1}, PlaceType::kGPU);
+  w2->ShareExternalData<int64_t>(input_gpu[2], {4, 1}, PlaceType::kGPU);
+  w3->ShareExternalData<int64_t>(input_gpu[3], {4, 1}, PlaceType::kGPU);
+
+  auto out = predictor->GetOutputHandle("fc_1.tmp_2");
+  auto out_shape = out->shape();
+  float* out_data = nullptr;
+  auto out_size =
+      std::accumulate(
+          out_shape.begin(), out_shape.end(), 1, std::multiplies<int>()) *
+      sizeof(float);
+  cudaMalloc(reinterpret_cast<void**>(out_data), out_size * sizeof(float));
+  out->ShareExternalData<float>(out_data, out_shape, PlaceType::kGPU);
+
+  cudaStream_t external_stream;
+  cudaStreamCreate(&external_stream);
+  Config tmp_config(config);
+  tmp_config.SetExecStream(external_stream);
+  predictor->Run();
+  paddle_infer::experimental::InternalUtils::RunWithExternalStream(
+      predictor.get(), external_stream);
+
+  PlaceType place;
+  int size = 0;
+  out->data<float>(&place, &size);
+  LOG(INFO) << "output size: " << size / sizeof(float);
+  predictor->TryShrinkMemory();
+}
 #endif
 
 TEST(AnalysisPredictor, OutputTensorHookFunc) {

From 395ffbfa7157a8d588c33c0a731184146d5bc024 Mon Sep 17 00:00:00 2001
From: yangguohao <70266361+yangguohao@users.noreply.github.com>
Date: Mon, 25 Sep 2023 11:59:59 +0800
Subject: [PATCH 098/115] =?UTF-8?q?=E3=80=90Complex=20OP=E3=80=91No.30=20c?=
 =?UTF-8?q?omplex=20stanh=20op=20(#57639)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../phi/kernels/cpu/activation_grad_kernel.cc |  2 +-
 paddle/phi/kernels/cpu/activation_kernel.cc   |  2 +-
 paddle/phi/kernels/funcs/activation_functor.h | 52 +++++++++++++++++++
 .../phi/kernels/gpu/activation_grad_kernel.cu |  2 +-
 paddle/phi/kernels/gpu/activation_kernel.cu   |  2 +-
 test/legacy_test/test_activation_op.py        | 18 ++++++-
 6 files changed, 73 insertions(+), 5 deletions(-)

diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
index be44d968548f4..ee3e2b6b39e8b 100644
--- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -303,7 +303,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(silu_grad, SiluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(stanh_grad, STanhGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(rsqrt_grad, RsqrtGradKernel)
diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc
index a2daac870c63e..9bee7c9f11365 100644
--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -197,7 +197,7 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel)
 PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel)
 PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(silu, SiluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
-PD_REGISTER_ACTIVATION_KERNEL(stanh, STanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(stanh, STanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
 PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index e5aaf1aeb8d34..f186d35428350 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -599,6 +599,32 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
+template <typename T>
+struct STanhGradFunctor<ComplexType<T>>
+    : public BaseActivationFunctor<ComplexType<T>> {
+  float scale_a;
+  float scale_b;
+  typename BaseActivationFunctor<ComplexType<T>>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const {
+    auto a = static_cast<ComplexType<T>>(scale_a);  // NOLINT
+    auto b = static_cast<ComplexType<T>>(scale_b);
+    auto temp = (a * x).tanh() * (a * x).tanh();
+    dx.device(d) =
+        dout *
+        (a * b * (static_cast<ComplexType<T>>(1) - temp)).unaryExpr(Conj<T>());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
 template <typename T>
 struct Tangent {
   HOSTDEVICE T operator()(const T& val) const { return tan(val); }
@@ -3578,6 +3604,32 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
+template <typename T>
+struct CudaSTanhGradFunctor<ComplexType<T>>
+    : public BaseActivationFunctor<ComplexType<T>> {
+  ComplexType<T> one = static_cast<ComplexType<T>>(1.0f);
+  float scale_a;
+  float scale_b;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+
+  // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x))
+  __device__ __forceinline__ ComplexType<T> operator()(
+      const ComplexType<T> arg_dout, const ComplexType<T> arg_x) const {
+    ComplexType<T> dout = static_cast<ComplexType<T>>(arg_dout);
+    ComplexType<T> x = static_cast<ComplexType<T>>(arg_x);
+    ComplexType<T> a = static_cast<ComplexType<T>>(scale_a);
+    ComplexType<T> b = static_cast<ComplexType<T>>(scale_b);
+    ComplexType<T> temp = tanh(a * x);
+    return static_cast<ComplexType<T>>(dout *
+                                       conj(a * b * (one - temp * temp)));
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
 template <typename T>
 struct CudaSoftplusFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index 36d7d3ae1baf8..ff1552370a55c 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -379,7 +379,7 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad,
                                    ThresholdedReluGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(relu6_grad, Relu6GradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel)
-PD_REGISTER_ACTIVATION_GRAD_KERNEL(stanh_grad, STanhGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(stanh_grad, STanhGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(reciprocal_grad, ReciprocalGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_grad, SoftplusGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(softplus_double_grad,
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index a506415d36bab..a14f32599552a 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -246,7 +246,7 @@ PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
 PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
-PD_REGISTER_ACTIVATION_KERNEL(stanh, StanhKernel)
+PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(stanh, StanhKernel)
 PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
 PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py
index 915a10fdc180b..caf2390d3fec7 100644
--- a/test/legacy_test/test_activation_op.py
+++ b/test/legacy_test/test_activation_op.py
@@ -3694,7 +3694,13 @@ def setUp(self):
         scale_b = self.get_scale_b()
 
         np.random.seed(1024)
-        x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
+        if self.dtype is np.complex64 or self.dtype is np.complex128:
+            x = (
+                np.random.uniform(0.1, 1, self.shape)
+                + 1j * np.random.uniform(0.1, 1, self.shape)
+            ).astype(self.dtype)
+        else:
+            x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
         # The same reason with TestAbs
         out = ref_stanh(x, scale_a, scale_b)
 
@@ -3724,6 +3730,16 @@ def init_shape(self):
         self.shape = []
 
 
+class TestSTanhComplex64(TestSTanh):
+    def init_dtype(self):
+        self.dtype = np.complex64
+
+
+class TestSTanhComplex128(TestSTanh):
+    def init_dtype(self):
+        self.dtype = np.complex128
+
+
 class TestSTanhAPI(unittest.TestCase):
     # test paddle.nn.stanh
     def get_scale_a(self):

From 94b6d70a5cdc82076a54a96876ac73947a8a3079 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Mon, 25 Sep 2023 14:25:47 +0800
Subject: [PATCH 099/115] [PIR]Perfect unittest of reshape, scale, erf,
 greater_equal (#57593)

* perfect unittest

* fix windows bugs

* fix api benchmark

* fix doc
---
 .../pir/dialect/op_generator/op_build_gen.py  |   1 +
 paddle/fluid/pybind/eager_utils.cc            |   2 +-
 python/paddle/tensor/manipulation.py          | 102 ++++++++++--------
 python/paddle/utils/layers_utils.py           |   8 +-
 test/legacy_test/test_compare_op.py           |  10 +-
 test/legacy_test/test_erf_op.py               |  10 +-
 test/legacy_test/test_reshape_op.py           |  33 ++++--
 test/legacy_test/test_scale_op.py             |  16 ++-
 8 files changed, 105 insertions(+), 77 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py
index ae2a60b8b866c..e24902c712c1a 100644
--- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py
@@ -19,6 +19,7 @@
     'SplitWithNumInferMeta',
     'ConcatInferMeta',
     'ReduceIntArrayAxisInferMeta',
+    'ReshapeWithXShapeInferMeta',
     'SliceRawInferMeta',
 }
 
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 1c32c8e9b6a94..1045b1c660884 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -1584,7 +1584,7 @@ std::vector<pir::Value> CastPyArg2VectorOfValue(PyObject* obj,
       }
     }
   } else if (PyObject_TypeCheck(obj, g_ir_opresult_pytype)) {
-    return {::pybind11::handle(obj).cast<pir::Value>()};
+    return {::pybind11::handle(obj).cast<pir::OpResult>()};
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "%s(): argument (position %d) must be "
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 933ea0a7651a0..46849cbde953d 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3588,28 +3588,79 @@ def reshape(x, shape, name=None):
             # the value is [10.]
 
     """
-    if in_dynamic_or_pir_mode():
+
+    def get_attr_shape(list_shape):
+        unk_dim_idx = -1
+        attrs_shape = []
+        for dim_idx, dim_size in enumerate(list_shape):
+            if isinstance(dim_size, (Variable, paddle.pir.OpResult)):
+                attrs_shape.append(-1)
+            else:
+                attrs_shape.append(dim_size)
+                if dim_size == -1:
+                    assert unk_dim_idx == -1, (
+                        "Only one dimension value of 'shape' in reshape can "
+                        "be -1. But received shape[%d] is also -1.\n"
+                        "\n\t# N = x.shape()[2]\t\t# N is an int. "
+                        "(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t"
+                        "# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])"
+                        "\t# z.shape is [-1, -1, 4]\n\n"
+                        "    If your target shape in Reshape represents dynamic shape, "
+                        "please turn it into a Tensor under @to_static. See above example for details."
+                        % dim_idx
+                    )
+                    unk_dim_idx = dim_idx
+                elif dim_size == 0:
+                    assert dim_idx < len(x.shape), (
+                        "The index of 0 in `shape` must be less than "
+                        "the input tensor X's dimensions. "
+                        "But received shape[%d] = 0, X's dimensions = %d."
+                        % (dim_idx, len(x.shape))
+                    )
+                else:
+                    assert dim_size > 0, (
+                        "Each dimension value of 'shape' in reshape must not "
+                        "be negative except one unknown dimension. "
+                        "But received shape[%d] = %s."
+                        % (dim_idx, str(dim_size))
+                    )
+        return attrs_shape
+
+    if in_dynamic_mode():
         if isinstance(shape, (list, tuple)):
             new_shape = []
             for ele in shape:
                 if isinstance(ele, core.eager.Tensor):
                     new_shape.append(ele.item())
-                elif isinstance(ele, paddle.pir.OpResult):
-                    new_shape.append(-1)
                 else:
                     new_shape.append(ele)
-
             if new_shape == x.shape:
                 out = x
             else:
                 out = _C_ops.reshape(x, new_shape)
-        elif isinstance(shape, (core.eager.Tensor, paddle.pir.OpResult)):
+        elif isinstance(shape, core.eager.Tensor):
+            shape.stop_gradient = True
+            out = _C_ops.reshape(x, shape)
+        else:
+            raise ValueError(
+                "shape must be an instance of `list`, `tuple` `Variable`,"
+                f" got '{type(shape)}.'"
+            )
+        return out
+    elif in_pir_mode():
+        if isinstance(shape, (list, tuple)):
+            if paddle.utils._contain_var(shape):
+                new_shape = paddle.utils._convert_to_tensor_list(shape)
+            else:
+                new_shape = get_attr_shape(shape)
+            out = _C_ops.reshape(x, new_shape)
+        elif isinstance(shape, paddle.pir.OpResult):
             shape.stop_gradient = True
             out = _C_ops.reshape(x, shape)
         else:
             raise ValueError(
-                "shape must be an instance of `list`, `tuple` `Variable(in dygraph mode)` or `OpResult(in pir mode)`,"
-                " got '{}.'".format(type(shape))
+                "shape must be an instance of `list`, `tuple` `OpResult(in pir mode)`,"
+                f" got '{type(shape)}.'"
             )
 
         return out
@@ -3631,43 +3682,6 @@ def reshape(x, shape, name=None):
         )
         check_type(shape, 'shape', (list, tuple, Variable), 'reshape')
 
-        def get_attr_shape(list_shape):
-            unk_dim_idx = -1
-            attrs_shape = []
-            for dim_idx, dim_size in enumerate(list_shape):
-                if isinstance(dim_size, Variable):
-                    attrs_shape.append(-1)
-                else:
-                    attrs_shape.append(dim_size)
-                    if dim_size == -1:
-                        assert unk_dim_idx == -1, (
-                            "Only one dimension value of 'shape' in reshape can "
-                            "be -1. But received shape[%d] is also -1.\n"
-                            "\n\t# N = x.shape()[2]\t\t# N is an int. "
-                            "(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t"
-                            "# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])"
-                            "\t# z.shape is [-1, -1, 4]\n\n"
-                            "    If your target shape in Reshape represents dynamic shape, "
-                            "please turn it into a Tensor under @to_static. See above example for details."
-                            % dim_idx
-                        )
-                        unk_dim_idx = dim_idx
-                    elif dim_size == 0:
-                        assert dim_idx < len(x.shape), (
-                            "The index of 0 in `shape` must be less than "
-                            "the input tensor X's dimensions. "
-                            "But received shape[%d] = 0, X's dimensions = %d."
-                            % (dim_idx, len(x.shape))
-                        )
-                    else:
-                        assert dim_size > 0, (
-                            "Each dimension value of 'shape' in reshape must not "
-                            "be negative except one unknown dimension. "
-                            "But received shape[%d] = %s."
-                            % (dim_idx, str(dim_size))
-                        )
-            return attrs_shape
-
         inputs = {"X": x}
         attrs = {}
         if isinstance(shape, Variable):
diff --git a/python/paddle/utils/layers_utils.py b/python/paddle/utils/layers_utils.py
index c49cb7bc42239..d6de149dbd148 100644
--- a/python/paddle/utils/layers_utils.py
+++ b/python/paddle/utils/layers_utils.py
@@ -370,10 +370,10 @@ def _is_symmetric_padding(padding, data_dim):
 
 def _contain_var(list_or_tuple):
     """
-    Check whether list or tuple contains variable.
+    Check whether list or tuple contains variable / OpResult.
     """
     for item in list_or_tuple:
-        if isinstance(item, Variable):
+        if isinstance(item, (Variable, paddle.pir.OpResult)):
             return True
     return False
 
@@ -432,13 +432,13 @@ def _get_shape_tensor(list_shape):
 
 def _convert_to_tensor_list(old_list, dtype="int32"):
     """
-    Converts all elements of a list to Variable.
+    Converts all elements of a list to Variable / OpResult.
     """
     from paddle.tensor import fill_constant
 
     new_list_tensor = []
     for ele in old_list:
-        if isinstance(ele, Variable):
+        if isinstance(ele, (Variable, paddle.pir.OpResult)):
             ele.stop_gradient = True
             new_list_tensor.append(ele)
         else:
diff --git a/test/legacy_test/test_compare_op.py b/test/legacy_test/test_compare_op.py
index fbf1f05da94f3..3d29e25248554 100755
--- a/test/legacy_test/test_compare_op.py
+++ b/test/legacy_test/test_compare_op.py
@@ -61,7 +61,9 @@ def test_errors(self):
     create_test_class('less_than', _type_name, lambda _a, _b: _a < _b)
     create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b)
     create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b)
-    create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b)
+    create_test_class(
+        'greater_equal', _type_name, lambda _a, _b: _a >= _b, True
+    )
     create_test_class('equal', _type_name, lambda _a, _b: _a == _b)
     create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b)
 
@@ -443,7 +445,7 @@ def test_attr_name(self):
 
 
 # add bf16 tests
-def create_bf16_case(op_type, callback):
+def create_bf16_case(op_type, callback, check_new_ir=False):
     class TestCompareOpBF16Op(op_test.OpTest):
         def setUp(self):
             self.op_type = op_type
@@ -460,7 +462,7 @@ def setUp(self):
             self.outputs = {'Out': real_result}
 
         def test_check_output(self):
-            self.check_output(check_cinn=True)
+            self.check_output(check_cinn=True, check_new_ir=check_new_ir)
 
     cls_name = f"BF16TestCase_{op_type}"
     TestCompareOpBF16Op.__name__ = cls_name
@@ -470,7 +472,7 @@ def test_check_output(self):
 create_bf16_case('less_than', lambda _a, _b: _a < _b)
 create_bf16_case('less_equal', lambda _a, _b: _a <= _b)
 create_bf16_case('greater_than', lambda _a, _b: _a > _b)
-create_bf16_case('greater_equal', lambda _a, _b: _a >= _b)
+create_bf16_case('greater_equal', lambda _a, _b: _a >= _b, True)
 create_bf16_case('equal', lambda _a, _b: _a == _b)
 create_bf16_case('not_equal', lambda _a, _b: _a != _b)
 
diff --git a/test/legacy_test/test_erf_op.py b/test/legacy_test/test_erf_op.py
index ee86ed155470e..24f32175151d6 100644
--- a/test/legacy_test/test_erf_op.py
+++ b/test/legacy_test/test_erf_op.py
@@ -93,10 +93,10 @@ def setUp(self):
         self.outputs = {'Out': y_ref}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
 
 
 @unittest.skipIf(
@@ -121,11 +121,13 @@ def setUp(self):
 
     def test_check_output(self):
         place = paddle.base.core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_new_ir=True)
 
     def test_check_grad(self):
         place = paddle.base.core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X'], 'Out', check_prim=True)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', check_prim=True, check_new_ir=True
+        )
 
 
 if __name__ == '__main__':
diff --git a/test/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py
index 0a9132ca55b49..748db17342be2 100755
--- a/test/legacy_test/test_reshape_op.py
+++ b/test/legacy_test/test_reshape_op.py
@@ -123,10 +123,16 @@ def init_data(self):
         self.infered_shape = (12, 10)
 
     def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
+        self.check_output(no_check_set=['XShape'], check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out", check_prim=True, check_prim_pir=True)
+        self.check_grad(
+            ["X"],
+            "Out",
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
+        )
 
 
 class TestReshapeFP16Op(OpTest):
@@ -151,10 +157,16 @@ def init_data(self):
         self.infered_shape = (12, 10)
 
     def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
+        self.check_output(no_check_set=['XShape'], check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out", check_prim=True, check_prim_pir=True)
+        self.check_grad(
+            ["X"],
+            "Out",
+            check_prim=True,
+            check_prim_pir=True,
+            check_new_ir=True,
+        )
 
 
 class TestReshapeOpDimInfer1(TestReshapeOp):
@@ -195,10 +207,10 @@ def init_data(self):
         self.actual_shape = (2, 3, 20)
 
     def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
+        self.check_output(no_check_set=['XShape'], check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        self.check_grad(["X"], "Out", check_new_ir=True)
 
 
 # Situation 3: have shape(list, have tensor), no actual shape(Tensor)
@@ -232,10 +244,10 @@ def init_data(self):
         self.shape = (-1, -1)
 
     def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
+        self.check_output(no_check_set=['XShape'], check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        self.check_grad(["X"], "Out", check_new_ir=True)
 
 
 class TestReshapeOpDimInfer1_attr_ShapeTensor(TestReshapeOp_attr_ShapeTensor):
@@ -278,10 +290,10 @@ def init_data(self):
         self.infered_shape = (10, 10)
 
     def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
+        self.check_output(no_check_set=['XShape'], check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        self.check_grad(["X"], "Out", check_new_ir=True)
 
 
 class TestReshapeOpDimInfer1_attr_OnlyShape(TestReshapeOp_attr_OnlyShape):
@@ -334,6 +346,7 @@ def test_check_output(self):
             base.core.CPUPlace(),
             atol=1e-5,
             no_check_set=['XShape'],
+            check_new_ir=True,
         )
 
     def test_check_grad(self):
diff --git a/test/legacy_test/test_scale_op.py b/test/legacy_test/test_scale_op.py
index 812a926dfd393..a6cea49a2bce3 100644
--- a/test/legacy_test/test_scale_op.py
+++ b/test/legacy_test/test_scale_op.py
@@ -66,10 +66,10 @@ def init_dtype_type(self):
         pass
 
     def test_check_output(self):
-        self.check_output(check_cinn=True)
+        self.check_output(check_cinn=True, check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_new_ir=True)
 
 
 class TestScaleOpSelectedRows(unittest.TestCase):
@@ -150,10 +150,10 @@ def init_dtype_type(self):
         self.dtype = np.float16
 
     def test_check_output(self):
-        self.check_output(check_cinn=True)
+        self.check_output(check_cinn=True, check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        self.check_grad(["X"], "Out", check_new_ir=True)
 
 
 @unittest.skipIf(
@@ -172,14 +172,10 @@ def setUp(self):
         self.outputs = {'Out': convert_float_to_uint16(out)}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(
-            ['X'],
-            'Out',
-            numeric_grad_delta=0.8,
-        )
+        self.check_grad(['X'], 'Out', numeric_grad_delta=0.8, check_new_ir=True)
 
 
 @unittest.skipIf(

From 1c5abfa9a00414fc0efdab8a9adbf68f042c395f Mon Sep 17 00:00:00 2001
From: yinwei <1871465933@qq.com>
Date: Mon, 25 Sep 2023 14:32:54 +0800
Subject: [PATCH 100/115] update sigmod_cross_entry_with_logits_*.cu for
 compiling optimize (#57631)

---
 .../kernels/gpu/sigmoid_cross_entropy_with_logits.h   | 11 -----------
 .../sigmoid_cross_entropy_with_logits_grad_kernel.cu  |  7 +++----
 .../gpu/sigmoid_cross_entropy_with_logits_kernel.cu   |  6 ++----
 3 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
index 307b51a1ca119..dc6d8312e06c7 100644
--- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h
@@ -51,15 +51,4 @@ struct NonzeroFunctor {
   }
 };
 
-template <typename T>
-struct DivFunctor {
-  const T norm_;
-  HOSTDEVICE inline DivFunctor(const T norm) : norm_(norm) {}
-
-  HOSTDEVICE inline T operator()(T loss) {
-    loss /= norm_;
-    return loss;
-  }
-};
-
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
index 7a70e74b41e8e..a6e627a5fb4bf 100644
--- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu
@@ -16,6 +16,7 @@
 
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
+#include "paddle/phi/kernels/scale_kernel.h"
 
 namespace phi {
 
@@ -144,10 +145,8 @@ void SigmoidCrossEntropyWithLogitsGradKernel(
     auto eps = static_cast<T>(1e-5);
     *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
 
-    std::vector<const DenseTensor *> div_ins = {in_grad};
-    std::vector<DenseTensor *> div_outs = {in_grad};
-    auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
-    phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs, div_functor);
+    phi::ScaleKernel<T>(
+        dev_ctx, *in_grad, (1.0 / *norm_cpu_ptr), 0.0f, false, in_grad);
 
     delete norm_tensor;
   }
diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
index dcad2bdbc7804..966c85506a128 100644
--- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
+++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu
@@ -16,6 +16,7 @@
 
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
+#include "paddle/phi/kernels/scale_kernel.h"
 
 namespace phi {
 
@@ -146,10 +147,7 @@ void SigmoidCrossEntropyWithLogitsKernel(
     auto eps = static_cast<T>(1e-5);
     *norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
 
-    std::vector<const DenseTensor *> div_ins = {out};
-    std::vector<DenseTensor *> div_outs = {out};
-    auto div_functor = DivFunctor<T>(*norm_cpu_ptr);
-    phi::funcs::ElementwiseKernel<T>(dev_ctx, div_ins, &div_outs, div_functor);
+    phi::ScaleKernel<T>(dev_ctx, *out, 1.0 / (*norm_cpu_ptr), 0.0f, false, out);
 
     delete norm_tensor;
   }

From f53a032b3cb6b22dd5f83d75081454568380f5b9 Mon Sep 17 00:00:00 2001
From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com>
Date: Mon, 25 Sep 2023 14:33:57 +0800
Subject: [PATCH 101/115] modify unsqueeze (#57681)

---
 .../dialect/op_generator/vjp_interface_gen_op_list.py  |  4 ++++
 paddle/fluid/primitive/codegen/gen.py                  |  2 ++
 paddle/phi/api/yaml/backward.yaml                      | 10 +++++-----
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
index b00c67aa06b24..c70a87e826b77 100644
--- a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
+++ b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
@@ -60,6 +60,8 @@
     'stack',
     'poisson',
     'gumbel_softmax',
+    'squeeze',
+    'unsqueeze',
     'tril',
     'triu',
 ]
@@ -100,6 +102,8 @@
     'stack',
     'poisson',
     'gumbel_softmax',
+    'squeeze',
+    'unsqueeze',
     'tril',
     'triu',
 ]
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index b67f3e83de952..0d1f444df2bef 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -78,6 +78,8 @@
     'unsqueeze_grad',
     'poisson_grad',
     'gumbel_softmax_grad',
+    'squeeze_grad',
+    'unsqueeze_grad',
 ]
 
 
diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml
index 66f5056320950..b9695e2909b11 100644
--- a/paddle/phi/api/yaml/backward.yaml
+++ b/paddle/phi/api/yaml/backward.yaml
@@ -2494,15 +2494,15 @@
   inplace : (out_grad -> x_grad)
 
 - backward_op : unsqueeze_double_grad
-  forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x)
-  args : (Tensor grad_x_grad, IntArray axes)
+  forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axis) -> Tensor(grad_x)
+  args : (Tensor grad_x_grad, IntArray axis)
   output : Tensor(grad_out_grad), Tensor(xshape)
-  invoke : unsqueeze(grad_x_grad, axes)
+  invoke : unsqueeze(grad_x_grad, axis)
   intermediate : xshape
 
 - backward_op : unsqueeze_grad
-  forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape)
-  args : (Tensor xshape, Tensor out_grad, IntArray axes)
+  forward : unsqueeze(Tensor x, IntArray axis) -> Tensor(out), Tensor(xshape)
+  args : (Tensor xshape, Tensor out_grad, IntArray axis)
   output : Tensor(x_grad)
   infer_meta :
     func : KernelWithXShapeInferMeta

From 390ec493d9c4d7cf643f00fa66021cf8fb80b467 Mon Sep 17 00:00:00 2001
From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com>
Date: Mon, 25 Sep 2023 15:01:28 +0800
Subject: [PATCH 102/115] [Semi-Auto] Adapt reshape spmd rule to phi (#57573)

* adapt reshape rule spmd rule to phi

* fix the bug when op attribute is vector<int64_t> type

* add two more unit test cases
---
 .../spmd_rules/reshape_spmd_rule.h            |  41 -----
 .../auto_parallel/spmd_rules/rules.h          |   4 -
 .../auto_parallel/inferspmd_utils.cc          |  23 ++-
 .../infermeta}/spmd_rules/dim_trans.cc        |  19 +-
 .../infermeta}/spmd_rules/dim_trans.h         |  11 +-
 .../infermeta/spmd_rules/reshape.cc}          | 165 +++++++++---------
 paddle/phi/infermeta/spmd_rules/reshape.h     |  32 ++++
 paddle/phi/infermeta/spmd_rules/rules.h       |   6 +
 .../spmd_rules/test_reshape_rule.py           | 121 ++++++++-----
 9 files changed, 235 insertions(+), 187 deletions(-)
 delete mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h
 rename paddle/{fluid/distributed/auto_parallel => phi/infermeta}/spmd_rules/dim_trans.cc (95%)
 rename paddle/{fluid/distributed/auto_parallel => phi/infermeta}/spmd_rules/dim_trans.h (94%)
 rename paddle/{fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.cc => phi/infermeta/spmd_rules/reshape.cc} (56%)
 create mode 100644 paddle/phi/infermeta/spmd_rules/reshape.h

diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h
deleted file mode 100644
index 737455e0be6c8..0000000000000
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iterator>
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
-
-namespace paddle {
-namespace distributed {
-namespace auto_parallel {
-
-class ReshapeSPMDRule : public SPMDRuleBase {
- public:
-  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-  InferForward(const std::vector<DistTensorSpec>& input_specs,
-               const paddle::framework::AttributeMap& attrs) override;
-
-  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-  InferBackward(const std::vector<DistTensorSpec>& input_specs,
-                const std::vector<DistTensorSpec>& output_specs,
-                const paddle::framework::AttributeMap& attrs) override;
-};
-}  // namespace auto_parallel
-}  // namespace distributed
-}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
index 71f939ffd3785..46806ce4daab7 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
@@ -18,7 +18,6 @@
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h"
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h"
@@ -50,9 +49,6 @@ REGISTER_SPMD_RULE(split_with_num, SplitSPMDRule);
 // transpose rule
 REGISTER_SPMD_RULE(transpose, TransposeSPMDRule);
 
-// reshape rule
-REGISTER_SPMD_RULE(reshape, ReshapeSPMDRule);
-
 }  // namespace auto_parallel
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
index 24030b5d0ffa8..485e2f09a42e9 100644
--- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
+++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
@@ -86,7 +86,28 @@ std::vector<int> InferSpmdContext::AttrAt(size_t idx) const {
   } catch (paddle::bad_variant_access const& e) {
     PADDLE_THROW(phi::errors::InvalidArgument(
         "Attribute cast error in InferSpmd Context, the input attr type is "
-        "`%s`, but the expected attribute type is `bool`.",
+        "`%s`, but the expected attribute type is `std::vector<int>`.",
+        attrs_.at(idx).type().name()));
+  }
+}
+
+template <>
+std::vector<int64_t> InferSpmdContext::AttrAt(size_t idx) const {
+  try {
+    auto attr = attrs_.at(idx);
+    if (attr.type() == typeid(std::vector<bool>)) {
+      std::vector<bool> val = PADDLE_GET_CONST(std::vector<bool>, attr);
+      return std::vector<int64_t>(val.begin(), val.end());
+    } else if (attr.type() == typeid(std::vector<int>)) {
+      std::vector<int> val = PADDLE_GET_CONST(std::vector<int>, attr);
+      return std::vector<int64_t>(val.begin(), val.end());
+    } else {
+      return PADDLE_GET_CONST(std::vector<int64_t>, attr);
+    }
+  } catch (paddle::bad_variant_access const& e) {
+    PADDLE_THROW(phi::errors::InvalidArgument(
+        "Attribute cast error in InferSpmd Context, the input attr type is "
+        "`%s`, but the expected attribute type is `std::vector<int64_t>`.",
         attrs_.at(idx).type().name()));
   }
 }
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.cc b/paddle/phi/infermeta/spmd_rules/dim_trans.cc
similarity index 95%
rename from paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.cc
rename to paddle/phi/infermeta/spmd_rules/dim_trans.cc
index 56aab1ec6093f..d781cc415ae4c 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.cc
+++ b/paddle/phi/infermeta/spmd_rules/dim_trans.cc
@@ -12,17 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h"
+#include "paddle/phi/infermeta/spmd_rules/dim_trans.h"
 #include <assert.h>
 #include <cstdio>
 #include <numeric>
 #include <set>
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
 #include "paddle/phi/core/enforce.h"
 
-namespace paddle {
+namespace phi {
 namespace distributed {
-namespace auto_parallel {
 
 static std::vector<DimTrans*> all_dim_trans;
 
@@ -289,10 +288,11 @@ void GetUsedInputDim(DimTrans* dim_trans, std::set<int64_t>* seen_dims) {
 }
 
 std::vector<std::vector<int64_t>> InferFromDimTrans(
-    const DistTensorSpec& input_spec, const std::vector<DimTrans*>& dim_trans) {
-  const std::vector<int64_t>& input_shape = input_spec.shape();
-  const std::vector<int64_t>& input_dims_mapping = input_spec.dims_mapping();
-  const ProcessMesh& mesh = input_spec.dist_attr().process_mesh();
+    const DistMetaTensor& input, const std::vector<DimTrans*>& dim_trans) {
+  std::vector<int64_t> input_shape = phi::vectorize(input.dims());
+  const std::vector<int64_t>& input_dims_mapping =
+      input.dist_attr().dims_mapping();
+  const ProcessMesh& mesh = input.dist_attr().process_mesh();
   const std::vector<int64_t>& mesh_shape = mesh.shape();
 
   std::set<int64_t> sharded_input_dims;
@@ -354,6 +354,5 @@ std::vector<std::vector<int64_t>> InferFromDimTrans(
   return {new_input_dims_mapping, out_dims_mapping};
 }
 
-}  // namespace auto_parallel
 }  // namespace distributed
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h b/paddle/phi/infermeta/spmd_rules/dim_trans.h
similarity index 94%
rename from paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h
rename to paddle/phi/infermeta/spmd_rules/dim_trans.h
index f196a0266d5d4..58ce07d0095c1 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h
+++ b/paddle/phi/infermeta/spmd_rules/dim_trans.h
@@ -17,11 +17,11 @@ limitations under the License. */
 #include <iostream>
 #include <vector>
 
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+#include "paddle/phi/core/distributed/type_defs.h"
 
-namespace paddle {
+namespace phi {
 namespace distributed {
-namespace auto_parallel {
 
 // This is a base class to describe how each dimension in output tensor
 // is transformed from input tensor's axes. The transformation includes
@@ -153,8 +153,7 @@ DimTrans* make_split(DimTrans* dim,
 // leftmost output split axis can be sharded when its shape can be divisible
 // by the mesh dimension.
 std::vector<std::vector<int64_t>> InferFromDimTrans(
-    const DistTensorSpec& input_spec, const std::vector<DimTrans*>& dim_trans);
+    const DistMetaTensor& input_spec, const std::vector<DimTrans*>& dim_trans);
 
-}  // namespace auto_parallel
 }  // namespace distributed
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.cc b/paddle/phi/infermeta/spmd_rules/reshape.cc
similarity index 56%
rename from paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.cc
rename to paddle/phi/infermeta/spmd_rules/reshape.cc
index 5e0c2c5a92c5b..4c95b846c87d0 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.cc
+++ b/paddle/phi/infermeta/spmd_rules/reshape.cc
@@ -12,14 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h"
+#include "paddle/phi/infermeta/spmd_rules/reshape.h"
 #include <numeric>
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dim_trans.h"
+
+#include "glog/logging.h"
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
 #include "paddle/phi/core/distributed/auto_parallel/utils.h"
+#include "paddle/phi/infermeta/spmd_rules/dim_trans.h"
+#include "paddle/phi/infermeta/spmd_rules/utils.h"
 
-namespace paddle {
+namespace phi {
 namespace distributed {
-namespace auto_parallel {
 
 using phi::distributed::auto_parallel::str_join;
 
@@ -71,9 +76,9 @@ std::vector<DimTrans*> MakeReshapeDimTrans(
   std::vector<int64_t> inferred_tgt_shape =
       InferTargetShape(tgt_shape, total_elem_num_src);
 
-  int64_t src_idx = 0, tgt_idx = 0;
-  int64_t s, t;
-  int64_t src_len, tgt_len;
+  int src_idx = 0, tgt_idx = 0;
+  int s, t;
+  int src_len, tgt_len;
   src_len = static_cast<int64_t>(src_shape.size());
   tgt_len = static_cast<int64_t>(inferred_tgt_shape.size());
   while (src_idx < src_len || tgt_idx < tgt_len) {
@@ -135,29 +140,27 @@ std::vector<DimTrans*> MakeReshapeDimTrans(
   return ret;
 }
 
-//
-std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-paddle::distributed::auto_parallel::ReshapeSPMDRule::InferForward(
-    const std::vector<DistTensorSpec>& input_specs,
-    const paddle::framework::AttributeMap& attrs) {
-  // step0: Verify Input Args Based on Reshape Logic
-  int64_t ninputs = static_cast<int64_t>(input_specs.size());
+SpmdInfo ReshapeInferSpmd(const DistMetaTensor& x,
+                          const std::vector<int64_t>& shape) {
+  // Step0: Verify input args based on reshape logic
+  auto src_shape = phi::vectorize(x.dims());
+  int x_ndim = src_shape.size();
+  auto x_dist_attr_src = x.dist_attr();
+  std::vector<int64_t> x_dims_mapping = x_dist_attr_src.dims_mapping();
   PADDLE_ENFORCE_EQ(
-      ninputs,
-      1,
-      phi::errors::InvalidArgument("The size of InputSpec in reshape must "
-                                   "be equal to 1, but got [%d].",
-                                   ninputs));
-  VerifySpecs(input_specs, "reshape");
-
-  // step1: build the transformation from
-  // original shape to target shape
-  std::vector<int64_t> src_shape = input_specs[0].shape();
-  std::vector<int64_t> tgt_shape =
-      ExtractAttr<std::vector<int64_t>>("shape", attrs);
+      x_ndim,
+      x_dims_mapping.size(),
+      phi::errors::InvalidArgument("The Tensor X's rank [%d] and X's "
+                                   "dims_mapping size [%d] are not matched.",
+                                   x_ndim,
+                                   x_dims_mapping.size()));
+
+  // Step1: Build the transformation from
+  // the original shape to the target shape
 
   // handle the '0' values in target shape, '0' indicates
   // that the target shape is equal to the source shape
+  std::vector<int64_t> tgt_shape(shape);
   for (int64_t i = 0, n = static_cast<int64_t>(tgt_shape.size()); i < n; i++) {
     if (tgt_shape[i] == 0) {
       tgt_shape[i] = src_shape[i];
@@ -166,96 +169,88 @@ paddle::distributed::auto_parallel::ReshapeSPMDRule::InferForward(
 
   std::vector<DimTrans*> trans = MakeReshapeDimTrans(src_shape, tgt_shape);
 
-  // step2: infer the dims mapping of input (if reshard is
+  // Step2: Infer the dims mapping of input (if reshard is
   // needed) and output from the dimension transformation.
   std::vector<std::vector<int64_t>> dims_mapping_vec =
-      InferFromDimTrans(input_specs[0], trans);
+      InferFromDimTrans(x, trans);
 
-  // step3: update the dist attributes of input
-  // and output with the inferred dims mapping
-  TensorDistAttr new_input_dist_attr(input_specs[0].dist_attr());
-  new_input_dist_attr.set_dims_mapping(dims_mapping_vec[0]);
-  TensorDistAttr output_dist_attr(input_specs[0].dist_attr());
-  output_dist_attr.set_dims_mapping(dims_mapping_vec[1]);
+  // Step3: Update the dist attributes of input
+  // and output with the inferred dims mapping.
+  TensorDistAttr x_dist_attr_dst(x_dist_attr_src);
+  x_dist_attr_dst.set_dims_mapping(dims_mapping_vec[0]);
+  TensorDistAttr out_dist_attr(x_dist_attr_src);
+  out_dist_attr.set_dims_mapping(dims_mapping_vec[1]);
 
-  VLOG(4) << "Reshape: input_shape: [" << str_join(src_shape)
-          << "] output_shape: [" << str_join(tgt_shape) << "]";
+  VLOG(4) << "ReshapeInferSpmd: X shape: [" << str_join(src_shape)
+          << "] Out shape: [" << str_join(tgt_shape) << "]";
   VLOG(4) << "Transformation from input to output:";
   for (int64_t i = 0, n = static_cast<int64_t>(trans.size()); i < n; i++) {
     DimTrans* t = trans[i];
-    VLOG(4) << "\tOutput axis " << i << ": " << t->to_string();
+    VLOG(4) << "\tOut axis[" << i << "]: " << t->to_string();
   }
-  VLOG(4) << "input_dims_mapping: [" << str_join(dims_mapping_vec[0])
-          << "] output_dims_mapping: [" << str_join(dims_mapping_vec[1])
+  VLOG(4) << "X dims_mapping_src: [" << str_join(x_dims_mapping)
+          << "] dims_mapping_dst: [" << str_join(dims_mapping_vec[0])
+          << "]\n Out dims_mapping: [" << str_join(dims_mapping_vec[1])
           << "]\n\n";
 
   CleanUp();
 
-  return {{new_input_dist_attr}, {output_dist_attr}};
+  return {{x_dist_attr_dst}, {out_dist_attr}};
 }
 
-std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-paddle::distributed::auto_parallel::ReshapeSPMDRule::InferBackward(
-    const std::vector<DistTensorSpec>& input_specs,
-    const std::vector<DistTensorSpec>& output_specs,
-    const paddle::framework::AttributeMap& attrs) {
-  // step0: Verify Input Args Based on Reshape Logic
-  int64_t ninputs = input_specs.size();
-  int64_t noutputs = output_specs.size();
+SpmdInfo ReshapeInferSpmdReverse(const DistMetaTensor& x,
+                                 const DistMetaTensor& out,
+                                 const std::vector<int64_t>& shape) {
+  // Step0: Verify input args based on reshape logic
+  auto x_shape = phi::vectorize(x.dims());
+  auto out_shape = phi::vectorize(out.dims());
+  int out_ndim = out_shape.size();
+  auto out_dist_attr_src = out.dist_attr();
+  std::vector<int64_t> out_dims_mapping = out_dist_attr_src.dims_mapping();
   PADDLE_ENFORCE_EQ(
-      ninputs,
-      1,
-      phi::errors::InvalidArgument("The size of InputSpec in reshape must "
-                                   "be equal to 1, but got [%d].",
-                                   ninputs));
-  PADDLE_ENFORCE_EQ(
-      noutputs,
-      1,
-      phi::errors::InvalidArgument("The size of OutputSpec in reshape must "
-                                   "be equal to 1, but got [%d].",
-                                   noutputs));
-  VerifySpecs(output_specs, "reshape");
-
-  // step1: build the transformation from the output shape
-  // to original shape. Inferbackward infers the dims mapping
+      out_ndim,
+      out_dims_mapping.size(),
+      phi::errors::InvalidArgument("The Tensor Out's rank [%d] and Out's "
+                                   "dims_mapping size [%d] are not matched.",
+                                   out_ndim,
+                                   out_dims_mapping.size()));
+
+  // Step1: Build the transformation from the output shape
+  // to original shape. This function infers the dims mapping
   // from output to input, we first get the transformation
   // from output to input so that we can infer the dims mapping
   // with the map from output axes to input axes.
-  // Shapes in Inferbackward don't contain -1 or 0, so they will
-  // not be modified and we can use ref here.
-  const std::vector<int64_t>& output_shape = output_specs[0].shape();
-  const std::vector<int64_t>& input_shape = input_specs[0].shape();
+  // Shapes in InferSpmdReverse don't contain -1 or 0, so they will
+  // not be modified and we can directly use them.
+  std::vector<DimTrans*> trans = MakeReshapeDimTrans(out_shape, x_shape);
 
-  std::vector<DimTrans*> trans = MakeReshapeDimTrans(output_shape, input_shape);
-
-  // step2: infer the dims mapping of input with
+  // Step2: Infer the dims mapping of input with
   // output's dims_mapping and the transformation.
   std::vector<std::vector<int64_t>> dims_mapping_vec =
-      InferFromDimTrans(output_specs[0], trans);
+      InferFromDimTrans(out, trans);
 
-  // step3: update the dist attributes of input
+  // Step3: Update the dist attributes of input
   // and output with the inferred dims mapping
-  TensorDistAttr new_output_dist_attr(output_specs[0].dist_attr());
-  new_output_dist_attr.set_dims_mapping(dims_mapping_vec[0]);
-  TensorDistAttr input_dist_attr(input_specs[0].dist_attr());
-  input_dist_attr.set_dims_mapping(dims_mapping_vec[1]);
+  TensorDistAttr out_dist_attr_dst(out_dist_attr_src);
+  out_dist_attr_dst.set_dims_mapping(dims_mapping_vec[0]);
+  TensorDistAttr x_dist_attr(x.dist_attr());
+  x_dist_attr.set_dims_mapping(dims_mapping_vec[1]);
 
-  VLOG(4) << "Reshape Inferbackward: output_shape: [" << str_join(output_shape)
-          << "] input_shape: [" << str_join(input_shape) << "]";
+  VLOG(4) << "ReshapeInferSpmdReverse: Out shape: [" << str_join(out_shape)
+          << "] X shape: [" << str_join(x_shape) << "]";
   VLOG(4) << "Transformation from output to input:";
   for (int64_t i = 0, n = trans.size(); i < n; i++) {
     DimTrans* t = trans[i];
-    VLOG(4) << "\tInput axis " << i << ": " << t->to_string();
+    VLOG(4) << "\tX axis[" << i << "]: " << t->to_string();
   }
-  VLOG(4) << "input_dims_mapping: [" << str_join(dims_mapping_vec[1])
-          << "] output_dims_mapping: [" << str_join(dims_mapping_vec[0])
-          << "]\n\n";
+  VLOG(4) << "Out dims_mapping_src: [" << str_join(out_dims_mapping) << "] "
+          << "dims_mapping_dst: [" << str_join(dims_mapping_vec[0]) << "]";
+  VLOG(4) << "X dims_mapping: [" << str_join(dims_mapping_vec[1]) << "]\n\n";
 
   CleanUp();
 
-  return {{input_dist_attr}, {new_output_dist_attr}};
+  return {{x_dist_attr}, {out_dist_attr_dst}};
 }
 
-}  // namespace auto_parallel
 }  // namespace distributed
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/reshape.h b/paddle/phi/infermeta/spmd_rules/reshape.h
new file mode 100644
index 0000000000000..394f31c2b8cf3
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/reshape.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+#include "paddle/phi/core/distributed/type_defs.h"
+
+namespace phi {
+namespace distributed {
+
+SpmdInfo ReshapeInferSpmd(const DistMetaTensor& x,
+                          const std::vector<int64_t>& shape);
+
+SpmdInfo ReshapeInferSpmdReverse(const DistMetaTensor& x,
+                                 const DistMetaTensor& out,
+                                 const std::vector<int64_t>& shape);
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h
index cb01b8996a8c9..0d14f7da7abe9 100644
--- a/paddle/phi/infermeta/spmd_rules/rules.h
+++ b/paddle/phi/infermeta/spmd_rules/rules.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/phi/infermeta/spmd_rules/matmul.h"
 #include "paddle/phi/infermeta/spmd_rules/reduction.h"
 #include "paddle/phi/infermeta/spmd_rules/replicated.h"
+#include "paddle/phi/infermeta/spmd_rules/reshape.h"
 
 /**
  * Design Notes:
@@ -464,5 +465,10 @@ PD_REGISTER_SPMD_RULE(
     PD_INFER_SPMD(phi::distributed::LayerNormInferSpmd),
     PD_INFER_SPMD(phi::distributed::LayerNormInferSpmdReverse));
 
+// reshape rule
+PD_REGISTER_SPMD_RULE(reshape,
+                      PD_INFER_SPMD(phi::distributed::ReshapeInferSpmd),
+                      PD_INFER_SPMD(phi::distributed::ReshapeInferSpmdReverse));
+
 }  // namespace distributed
 }  // namespace phi
diff --git a/test/auto_parallel/spmd_rules/test_reshape_rule.py b/test/auto_parallel/spmd_rules/test_reshape_rule.py
index dd7c248ca42fb..a370580682d8c 100644
--- a/test/auto_parallel/spmd_rules/test_reshape_rule.py
+++ b/test/auto_parallel/spmd_rules/test_reshape_rule.py
@@ -13,18 +13,19 @@
 # limitations under the License.
 
 import unittest
+from collections import OrderedDict
 
-from paddle.distributed.auto_parallel.static.completion import get_spmd_rule
 from paddle.distributed.auto_parallel.static.dist_attribute import (
     DistTensorSpec,
     TensorDistAttr,
 )
 from paddle.distributed.fleet import auto
+from paddle.framework import core
 
 
 class TestReshapeSPMDRule(unittest.TestCase):
     def setUp(self):
-        self.rule = get_spmd_rule("reshape")
+        self.rule = core.get_phi_spmd_rule("reshape")
 
         x_shape = [6, 12, 48, 24]
         process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]])
@@ -34,14 +35,14 @@ def setUp(self):
         x_tensor_dist_attr.process_mesh = process_mesh
         self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr)
 
-        self.attrs = {"shape": [1, 72, 48, 4, 6]}
+        self.attrs = OrderedDict([('shape', [1, 72, 48, 4, 6])])
 
     def test_reshape_infer_forward(self):
         # shape: [6, 12, 48, 24] --> [1, 72, 48, 4, 6]
         # dims_mapping: [0, -1, 1, -1] --> [0, -1, 1, -1] [-1, 0, 1, -1, -1]
         self.x_dist_tensor_spec.set_dims_mapping([0, -1, 1, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -59,7 +60,7 @@ def test_reshape_infer_forward(self):
         # dims_mapping: [-1, 0, -1, 1] --> [-1, -1, -1, -1] [-1, -1, -1, -1, -1]
         self.x_dist_tensor_spec.set_dims_mapping([-1, 0, -1, 1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -75,7 +76,7 @@ def test_reshape_infer_forward(self):
         # dims_mapping: [1, -1, -1, 0] --> [1, -1, -1, 0] [-1, 1, -1, 0, -1]
         self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1, 0])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -93,7 +94,7 @@ def test_reshape_infer_forward(self):
         self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1])
 
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -110,7 +111,7 @@ def test_reshape_infer_forward(self):
         self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1, 0])
 
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -128,7 +129,7 @@ def test_reshape_infer_forward(self):
         self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1])
 
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -145,7 +146,7 @@ def test_reshape_infer_forward(self):
         self.attrs["shape"] = [1, 72, 0, 4, 6]
         self.x_dist_tensor_spec.set_dims_mapping([1, -1, -1, 0])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -162,7 +163,7 @@ def test_reshape_infer_forward(self):
         self.attrs["shape"] = [6, 12, 48, 24]
         self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -179,7 +180,7 @@ def test_reshape_infer_forward(self):
         self.attrs["shape"] = [72, 3, 16, 24]
         self.x_dist_tensor_spec.set_dims_mapping([0, -1, 1, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -196,7 +197,7 @@ def test_reshape_infer_forward(self):
         self.attrs["shape"] = [72, 3, 16, 24]
         self.x_dist_tensor_spec.set_dims_mapping([1, -1, 0, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -214,7 +215,7 @@ def test_reshape_infer_forward(self):
         self.attrs["shape"] = [6, 12, 48, 24]
         self.x_dist_tensor_spec.set_dims_mapping([-1, 1, -1, 0, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -232,7 +233,7 @@ def test_reshape_infer_forward(self):
         self.attrs["shape"] = [0, 0, -1, 192]
         self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1])
         result_dist_attrs = self.rule.infer_forward(
-            [self.x_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec, self.attrs['shape']
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -246,7 +247,9 @@ def test_reshape_infer_forward(self):
         # raise error
         self.attrs["shape"] = [3, 24, 6, -1, -1]
         with self.assertRaises(BaseException):
-            self.rule.infer_forward([self.x_dist_tensor_spec], self.attrs)
+            self.rule.infer_forward(
+                self.x_dist_tensor_spec, self.attrs['shape']
+            )
 
     def test_reshape_infer_backward(self):
         process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]])
@@ -262,9 +265,9 @@ def test_reshape_infer_backward(self):
         )
         self.output_dist_tensor_spec.set_dims_mapping([-1, 0, 1, -1, -1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec],
-            [self.output_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.output_dist_tensor_spec,
+            self.attrs['shape'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -283,9 +286,9 @@ def test_reshape_infer_backward(self):
         self.output_dist_tensor_spec.shape = [1, 72, 48, 4, 6]
         self.output_dist_tensor_spec.set_dims_mapping([-1, -1, -1, -1, -1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec],
-            [self.output_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.output_dist_tensor_spec,
+            self.attrs['shape'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -302,9 +305,9 @@ def test_reshape_infer_backward(self):
         self.output_dist_tensor_spec.shape = [1, 72, 48, 4, 6]
         self.output_dist_tensor_spec.set_dims_mapping([-1, 1, -1, 0, -1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec],
-            [self.output_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.output_dist_tensor_spec,
+            self.attrs['shape'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -322,9 +325,9 @@ def test_reshape_infer_backward(self):
         self.output_dist_tensor_spec.set_dims_mapping([1, -1, -1, -1, 0])
 
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec],
-            [self.output_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.output_dist_tensor_spec,
+            self.attrs['shape'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -342,9 +345,9 @@ def test_reshape_infer_backward(self):
         self.output_dist_tensor_spec.set_dims_mapping([-1, -1, 0, -1, 1])
 
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec],
-            [self.output_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.output_dist_tensor_spec,
+            self.attrs['shape'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -361,9 +364,9 @@ def test_reshape_infer_backward(self):
         self.output_dist_tensor_spec.shape = [6, 12, 48, 24]
         self.output_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec],
-            [self.output_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.output_dist_tensor_spec,
+            self.attrs['shape'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -380,9 +383,9 @@ def test_reshape_infer_backward(self):
         self.output_dist_tensor_spec.shape = [72, 3, 16, 24]
         self.output_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec],
-            [self.output_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.output_dist_tensor_spec,
+            self.attrs['shape'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -399,9 +402,9 @@ def test_reshape_infer_backward(self):
         self.output_dist_tensor_spec.shape = [72, 3, 16, 24]
         self.output_dist_tensor_spec.set_dims_mapping([1, -1, -1, -1])
         result_dist_attrs = self.rule.infer_backward(
-            [self.x_dist_tensor_spec],
-            [self.output_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.output_dist_tensor_spec,
+            self.attrs['shape'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -413,6 +416,44 @@ def test_reshape_infer_backward(self):
             infered_output_dist_attrs[0].dims_mapping, [1, -1, -1, -1]
         )
 
+        # shape: [6, 12, 48, 24] --> [1, 72, 48, 4, 6] (intput --> output)
+        # dims_mapping: [-1, 0, -1, -1, 1] --> [0, -1, -1, -1], [-1, 0, -1, -1, -1] (output --> input, output)
+        self.output_dist_tensor_spec.shape = [1, 72, 48, 4, 6]
+        self.output_dist_tensor_spec.set_dims_mapping([-1, 0, -1, -1, 1])
+        result_dist_attrs = self.rule.infer_backward(
+            self.x_dist_tensor_spec,
+            self.output_dist_tensor_spec,
+            self.attrs['shape'],
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+
+        self.assertEqual(
+            infered_input_dist_attrs[0].dims_mapping, [0, -1, -1, -1]
+        )
+        self.assertEqual(
+            infered_output_dist_attrs[0].dims_mapping, [-1, 0, -1, -1, -1]
+        )
+
+        # shape: [6, 12, 48, 24] --> [3, 24, 6, 8, 24] (intput --> output)
+        # dims_mapping: [-1, 1, -1, -1, 0] --> [-1, -1, -1, 0], [-1, -1, -1, -1, 0] (output --> input, output)
+        self.output_dist_tensor_spec.shape = [3, 24, 6, 8, 24]
+        self.output_dist_tensor_spec.set_dims_mapping([-1, 1, -1, -1, 0])
+        result_dist_attrs = self.rule.infer_backward(
+            self.x_dist_tensor_spec,
+            self.output_dist_tensor_spec,
+            self.attrs['shape'],
+        )
+        infered_input_dist_attrs = result_dist_attrs[0]
+        infered_output_dist_attrs = result_dist_attrs[1]
+
+        self.assertEqual(
+            infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, 0]
+        )
+        self.assertEqual(
+            infered_output_dist_attrs[0].dims_mapping, [-1, -1, -1, -1, 0]
+        )
+
 
 if __name__ == "__main__":
     unittest.main()

From cd9ccb003eacb2d7f7ec6ac03c023a686a17aa69 Mon Sep 17 00:00:00 2001
From: WangZhen <23097963+0x45f@users.noreply.github.com>
Date: Mon, 25 Sep 2023 15:17:58 +0800
Subject: [PATCH 103/115] [PIR]Open more dropout pir uts (#57549)

---
 test/legacy_test/op_test.py         | 31 ++++++++++++++++++++++-------
 test/legacy_test/test_dropout_op.py | 30 +++++++++++++++++-----------
 2 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/test/legacy_test/op_test.py b/test/legacy_test/op_test.py
index 961820ff00b29..2355598ad98b8 100644
--- a/test/legacy_test/op_test.py
+++ b/test/legacy_test/op_test.py
@@ -1289,6 +1289,17 @@ def get_ir_input_attr_dict_and_feed(self, stop_gradient):
                 input_dict.update({name: x})
         return static_inputs, attrs_outputs, input_dict, feed
 
+    def _need_fetch(self, sig_name):
+        if sig_name in self.outputs.keys():
+            return True
+        for _, value in self.outputs.items():
+            if not isinstance(value, (tuple, list)):
+                continue
+            for var_name, _ in value:
+                if sig_name == var_name:
+                    return True
+        return False
+
     def _calc_new_ir_output(
         self, place, no_check_set=None, inps=None, oups=None
     ):
@@ -1329,6 +1340,8 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
                     kernel_sig,
                 )
                 inputs_sig, attrs_sig, outputs_sig = kernel_sig
+                if hasattr(self, "python_out_sig"):
+                    outputs_sig = self.python_out_sig
                 args = OpTestUtils.assumption_assert_and_transform(
                     args, len(inputs_sig)
                 )
@@ -1339,9 +1352,12 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
 
                 if len(fetch_list) == 0:
                     if isinstance(ret_tuple, (tuple, list)):
-                        for var in ret_tuple:
+                        assert len(ret_tuple) == len(outputs_sig)
+                        for var, sig_name in zip(ret_tuple, outputs_sig):
                             if no_check_set is not None and var in no_check_set:
                                 continue
+                            if not self._need_fetch(sig_name):
+                                continue
                             if isinstance(var, list):
                                 for v in var:
                                     fetch_list.append(v)
@@ -1362,6 +1378,11 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
                     ir_program, feed=feed, fetch_list=[fetch_list]
                 )
 
+                outputs_sig = [
+                    sig_name
+                    for sig_name in outputs_sig
+                    if self._need_fetch(sig_name)
+                ]
                 result = construct_output_dict_by_kernel_sig(outs, outputs_sig)
                 if hasattr(self, "python_out_sig_sub_name"):
                     for key in self.python_out_sig_sub_name.keys():
@@ -2392,14 +2413,10 @@ def find_imperative_actual(target_name, new_ir_outs, place):
                         f"Found failed {new_ir_outs.keys()} {target_name}",
                     )
 
-            def find_imperative_expect(target_name, new_ir_outs, place):
+            def find_imperative_expect(self, target_name, new_ir_outs, place):
                 for name in new_ir_outs:
                     if name == target_name:
                         return new_ir_outs[name][0]
-                    var_list = new_ir_outs[name]
-                    for i, var in enumerate(var_list):
-                        if var.name == target_name:
-                            return new_ir_outs[name][i]
                 self.assertTrue(
                     False,
                     f"Found failed {new_ir_outs.keys()} {target_name}",
@@ -2419,7 +2436,7 @@ def find_expect_value(self, target_name):
                 with paddle.pir.core.program_guard(
                     paddle.pir.core.default_main_program()
                 ):
-                    expect = find_imperative_expect(
+                    expect = self.find_imperative_expect(
                         target_name, self.ref_outputs, place
                     )
                     expect_t = np.array(expect)
diff --git a/test/legacy_test/test_dropout_op.py b/test/legacy_test/test_dropout_op.py
index b4eb567d40f9e..589266cb42495 100644
--- a/test/legacy_test/test_dropout_op.py
+++ b/test/legacy_test/test_dropout_op.py
@@ -124,11 +124,11 @@ def setUp(self):
         self.enable_check_static_comp = False
 
     def test_check_output(self):
-        self.check_output(check_prim=True)
+        self.check_output(check_prim=True, check_new_ir=True)
 
     def test_check_grad_normal(self):
         # Now in dy2st mode x_grad = [], so set check_prim=False
-        self.check_grad(['X'], 'Out', check_prim=False)
+        self.check_grad(['X'], 'Out', check_prim=False, check_new_ir=True)
 
 
 class TestDropoutOp2(TestDropoutOp):
@@ -191,7 +191,7 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output(check_prim=True)
+        self.check_output(check_prim=True, check_new_ir=True)
 
 
 @skip_check_grad_ci(reason="For inference, check_grad is not required.")
@@ -208,7 +208,7 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output(check_prim=True)
+        self.check_output(check_prim=True, check_new_ir=True)
 
 
 class TestDropoutOp6(TestDropoutOp):
@@ -270,7 +270,7 @@ def setUp(self):
         self.outputs = {'Out': self.inputs['X']}
 
     def test_check_output(self):
-        self.check_output(check_prim=True)
+        self.check_output(check_prim=True, check_new_ir=True)
 
 
 @skip_check_grad_ci(reason="For inference, check_grad is not required.")
@@ -289,7 +289,7 @@ def setUp(self):
         self.outputs = {'Out': self.inputs['X']}
 
     def test_check_output(self):
-        self.check_output(check_prim=True)
+        self.check_output(check_prim=True, check_new_ir=True)
 
 
 class TestDropoutOpWithSeed(OpTest):
@@ -315,11 +315,17 @@ def setUp(self):
         self.enable_check_static_comp = False
 
     def test_check_output(self):
-        self.check_output(check_prim=True)
+        self.check_output(check_prim=True, check_new_ir=True)
 
     def test_check_grad_normal(self):
         # Now in dy2st mode x_grad = [], so set check_prim=False
-        self.check_grad(['X'], 'Out', max_relative_error=0.05, check_prim=False)
+        self.check_grad(
+            ['X'],
+            'Out',
+            max_relative_error=0.05,
+            check_prim=False,
+            check_new_ir=True,
+        )
 
 
 @unittest.skipIf(
@@ -352,11 +358,11 @@ def init_test_case(self):
 
     def test_check_output(self):
         self.check_output_with_place(
-            core.CUDAPlace(0), atol=1e-3, check_prim=True
+            core.CUDAPlace(0), atol=1e-3, check_prim=True, check_new_ir=True
         )
 
     def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_new_ir=True)
 
 
 @unittest.skipIf(
@@ -391,10 +397,10 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output(check_prim=True)
+        self.check_output(check_prim=True, check_new_ir=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+        self.check_grad(['X'], 'Out', check_prim=True, check_new_ir=True)
 
 
 class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):

From 79404e73e83d0799397b9becfc50959a7f3a5052 Mon Sep 17 00:00:00 2001
From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com>
Date: Mon, 25 Sep 2023 15:23:30 +0800
Subject: [PATCH 104/115] [PIR] Refine execution info data struct for
 interpreter (#57594)

* add

* fix

* add

* refine

* fix

* fix

* fix

* fix

* add

* fix

* fix
---
 paddle/fluid/framework/CMakeLists.txt         |   2 +-
 .../framework/new_executor/CMakeLists.txt     |   3 +-
 .../instruction/cond_instruction.cc           |  87 ++--
 .../instruction/cond_instruction.h            |   6 +-
 .../instruction/instruction_util.cc           |   2 +-
 .../instruction/legacy_kernel_instruction.cc  |   2 +-
 .../instruction/phi_kernel_instruction.cc     |   2 +-
 .../interpreter/interpreter_util.cc           |  18 +-
 .../interpreter/interpreter_util.h            |   4 +-
 .../new_executor/new_ir_interpreter.cc        | 210 ++++++----
 .../new_executor/new_ir_interpreter.h         |  23 +-
 .../new_executor/pir_adaptor}/CMakeLists.txt  |   6 +-
 .../pir_adaptor/pir_adaptor_util.cc}          | 388 ++++++++++--------
 .../pir_adaptor/pir_adaptor_util.h}           |  84 +++-
 paddle/fluid/pir/CMakeLists.txt               |   1 -
 .../phi_kernel_adaptor/phi_kernel_adaptor.h   | 145 -------
 paddle/fluid/pybind/CMakeLists.txt            |   2 +-
 test/cpp/new_executor/CMakeLists.txt          |   1 -
 test/cpp/pir/core/CMakeLists.txt              |  14 -
 test/cpp/pir/core/ir_exe_test.cc              | 206 ----------
 test/cpp/pir/kernel_dialect/CMakeLists.txt    |   2 +-
 .../ir_kernel_dialect_pass_test.cc            |  58 +--
 test/cpp/prim/CMakeLists.txt                  |   2 +-
 23 files changed, 481 insertions(+), 787 deletions(-)
 rename paddle/fluid/{pir/phi_kernel_adaptor => framework/new_executor/pir_adaptor}/CMakeLists.txt (64%)
 rename paddle/fluid/{pir/phi_kernel_adaptor/phi_kernel_util.cc => framework/new_executor/pir_adaptor/pir_adaptor_util.cc} (73%)
 rename paddle/fluid/{pir/phi_kernel_adaptor/phi_kernel_util.h => framework/new_executor/pir_adaptor/pir_adaptor_util.h} (88%)
 delete mode 100644 paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_adaptor.h
 delete mode 100644 test/cpp/pir/core/ir_exe_test.cc

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 6d518b348d7d9..82cb18e1110fb 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1049,7 +1049,7 @@ cc_library(
 cc_library(
   executor_cache
   SRCS executor_cache.cc
-  DEPS parallel_executor standalone_executor phi_kernel_adaptor pd_inplace_pass
+  DEPS parallel_executor standalone_executor pir_adaptor pd_inplace_pass
        pd_op_to_kernel_pass pir)
 if(WITH_PSCORE)
   get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt
index ae30121bc930b..d10e2258d00b6 100644
--- a/paddle/fluid/framework/new_executor/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -2,6 +2,7 @@ add_subdirectory(garbage_collector)
 add_subdirectory(instruction)
 add_subdirectory(interpreter)
 add_subdirectory(workqueue)
+add_subdirectory(pir_adaptor)
 
 set(STANDALONE_EXECUTOR_SRCS
     feed_fetch_utils.cc interpretercore.cc new_executor_defs.cc
@@ -13,7 +14,7 @@ set(STANDALONE_EXECUTOR_DEPS
     workqueue
     pd_op_dialect
     pd_op_to_kernel_pass
-    phi_kernel_adaptor
+    pir_adaptor
     program_translator
     instruction_base
     pd_inplace_pass
diff --git a/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc
index 00d6912f0a4ac..5d958d7266505 100644
--- a/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/cond_instruction.cc
@@ -17,12 +17,12 @@
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
 #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
 #include "paddle/fluid/framework/new_executor/new_ir_interpreter.h"
+#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/pir/dialect/operator/interface/infermeta.h"
 #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h"
-#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/infermeta_utils.h"
@@ -123,40 +123,10 @@ CondInstruction::CondInstruction(
     pir::Operation* op,
     Scope* scope,
     Scope* local_scope,
-    const std::unordered_map<::pir::Value, std::string>& value_2_var_name,
-    const std::map<std::string, int>& var_name_2_id,
-    const std::unordered_map<const paddle::framework::Variable*, std::string>&
-        variable_2_var_name,
+    ValueExecutionInfo* parent_exe_info,
     const std::map<pir::Block*, paddle::framework::Scope*>& sub_blocks)
     : InstructionBase(id, place) {
   op_ = op;
-  // Todo: support paddle::dialect::DistAttribute
-  //   if (op_attributes.count("dist_attr") != 0) {
-  //     if (op_attributes.count("execution_stream") != 0) {
-  //         SetExecutionStream(op_attributes.at("execution_stream")
-  //                             .dyn_cast<::ir::StrAttribute>()
-  //                             .data());
-  //     }
-  //     if (op_attributes.count("stream_priority") != 0) {
-  //         SetStreamPriority(op_attributes.at("stream_priority")
-  //                             .dyn_cast<::ir::Int32Attribute>()
-  //                             .data());
-  //     }
-  //     if (op_attributes.count("scheduling_priority") != 0) {
-  //         SetSchedulingPriority(op_attributes.at("scheduling_priority")
-  //                                 .dyn_cast<::ir::Int64Attribute>()
-  //                                 .data());
-  //     }
-  //   } else {
-  //     if (interpreter::IsCommunicationOp(op)) {
-  //       // NOTE(Ruibiao): Dispatching computation before communication
-  //       improves
-  //       // multi-stream overlap when the time cost of communication less than
-  //       // that of the calculation (e.g., ResNet50_bs128_pure_fp16 N4C32
-  //       // training).
-  //       op_func_node.scheduling_priority_ = 1;
-  //     }
-  //   }
   VLOG(6) << "finish process dist attributes";
 
   SetKernelType(AnalyseOpFuncType(op, place));
@@ -173,12 +143,12 @@ CondInstruction::CondInstruction(
   auto if_op = op->dyn_cast<paddle::dialect::IfOp>();
 
   for (size_t i = 0; i < if_op.num_results(); ++i) {
-    if_op_outputs_.push_back(
-        inner_scope->GetVar(value_2_var_name.at(if_op.result(i))));
+    if_op_outputs_.push_back(inner_scope->GetVar(
+        parent_exe_info->GetValue2VarName().at(if_op.result(i))));
   }
 
   auto cond_value = if_op.operand_source(0);
-  auto var_name = value_2_var_name.at(cond_value);
+  auto var_name = parent_exe_info->GetValue2VarName().at(cond_value);
   cond_var = inner_scope->FindVar(var_name);
 
   auto true_branch_block = if_op.true_block();
@@ -189,7 +159,12 @@ CondInstruction::CondInstruction(
 
   auto true_scope = sub_blocks.at(true_branch_block);
   true_branch_inter =
-      new NewIRInterpreter(place, {}, true_branch_block, true_scope, {});
+      new NewIRInterpreter(place,
+                           {},
+                           true_branch_block,
+                           true_scope,
+                           parent_exe_info->NewChild(true_scope),
+                           {});
 
   std::set<std::string> true_skip_gc_names_set;
   for (auto value : true_branch_yied_inputs) {
@@ -200,7 +175,12 @@ CondInstruction::CondInstruction(
 
   auto false_scope = sub_blocks.at(false_branch_block);
   false_branch_inter =
-      new NewIRInterpreter(place, {}, false_branch_block, false_scope, {});
+      new NewIRInterpreter(place,
+                           {},
+                           false_branch_block,
+                           false_scope,
+                           parent_exe_info->NewChild(false_scope),
+                           {});
 
   std::set<std::string> false_skip_gc_names_set;
   for (auto value : false_branch_yied_inputs) {
@@ -214,22 +194,22 @@ CondInstruction::CondInstruction(
   std::unordered_map<pir::Value, std::vector<int>> inputs;
   GetInputIds(op,
               inner_scope,
-              value_2_var_name,
-              var_name_2_id,
-              variable_2_var_name,
+              parent_exe_info->GetValue2VarName(),
+              parent_exe_info->GetVarName2Id(),
+              parent_exe_info->GetVar2VarName(),
               &inputs);
   GetOutsideOpInputs(true_branch_block,
                      inner_scope,
-                     value_2_var_name,
-                     var_name_2_id,
-                     variable_2_var_name,
+                     parent_exe_info->GetValue2VarName(),
+                     parent_exe_info->GetVarName2Id(),
+                     parent_exe_info->GetVar2VarName(),
                      &inputs);
 
   GetOutsideOpInputs(false_branch_block,
                      inner_scope,
-                     value_2_var_name,
-                     var_name_2_id,
-                     variable_2_var_name,
+                     parent_exe_info->GetValue2VarName(),
+                     parent_exe_info->GetVarName2Id(),
+                     parent_exe_info->GetVar2VarName(),
                      &inputs);
   SetInputs(inputs);
 
@@ -238,17 +218,18 @@ CondInstruction::CondInstruction(
     pir::Value value = op->result(i);
     if (value && value.type()) {
       PADDLE_ENFORCE_NE(
-          value_2_var_name.find(value),
-          value_2_var_name.end(),
+          parent_exe_info->GetValue2VarName().find(value),
+          parent_exe_info->GetValue2VarName().end(),
           phi::errors::PreconditionNotMet(
               "input should in name map, [%d] 'th input of [%s] op",
               i,
               "if op"));
-      std::vector<int> outputs_id = GetValueIds(value,
-                                                inner_scope,
-                                                value_2_var_name,
-                                                var_name_2_id,
-                                                variable_2_var_name);
+      std::vector<int> outputs_id =
+          GetValueIds(value,
+                      inner_scope,
+                      parent_exe_info->GetValue2VarName(),
+                      parent_exe_info->GetVarName2Id(),
+                      parent_exe_info->GetVar2VarName());
       outputs.emplace(value, outputs_id);
     }
   }
diff --git a/paddle/fluid/framework/new_executor/instruction/cond_instruction.h b/paddle/fluid/framework/new_executor/instruction/cond_instruction.h
index e848de136da72..75eb7d0ece04f 100644
--- a/paddle/fluid/framework/new_executor/instruction/cond_instruction.h
+++ b/paddle/fluid/framework/new_executor/instruction/cond_instruction.h
@@ -25,6 +25,7 @@ namespace framework {
 class Scope;
 class Value;
 class NewIRInterpreter;
+class ValueExecutionInfo;
 
 class CondInstruction : public InstructionBase {
  public:
@@ -34,10 +35,7 @@ class CondInstruction : public InstructionBase {
       ::pir::Operation* op,
       Scope* scope,
       Scope* local_scope,
-      const std::unordered_map<::pir::Value, std::string>& value_2_var_name,
-      const std::map<std::string, int>& var_name_2_id,
-      const std::unordered_map<const paddle::framework::Variable*, std::string>&
-          variable_2_var_name,
+      ValueExecutionInfo* parent_exe_info,
       const std::map<pir::Block*, paddle::framework::Scope*>& sub_blocks);
 
   void Run() override;
diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc
index dfa8e1ec85f9f..dfafd44281537 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc
@@ -28,7 +28,7 @@
 
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
 #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
-#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h"
+#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/phi/core/distributed/comm_context_manager.h"
diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
index 50623c6eb1118..748c7e603f7d7 100644
--- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
@@ -17,12 +17,12 @@
 #include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
 #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
+#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/pir/dialect/operator/interface/infermeta.h"
 #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h"
-#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h"
 
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
index 849a83fcf2ce9..e779fb52f26e4 100644
--- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
@@ -16,12 +16,12 @@
 
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
 #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
+#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/pir/dialect/operator/interface/infermeta.h"
 #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h"
-#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/infermeta_utils.h"
diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
index 8015a50545e69..7002e2e787680 100644
--- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
@@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h"
 #include "paddle/fluid/framework/new_executor/interpreter/execution_config.h"
 #include "paddle/fluid/framework/new_executor/interpreter/static_build.h"
+#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"
 #include "paddle/fluid/memory/stats.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
 #include "paddle/fluid/operators/controlflow/pylayer_op_helper.h"
@@ -32,7 +33,6 @@
 #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h"
-#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h"
 #include "paddle/fluid/platform/flags.h"
 #include "paddle/phi/core/distributed/comm_context_manager.h"
 #include "paddle/phi/core/kernel_context.h"
@@ -1258,8 +1258,8 @@ const paddle::framework::Variable* GetVariableByName(
 
 void PrintValuesAndVariables(
     const pir::Block& block,
-    const std::unordered_map<pir::Value, std::string>* value_2_var_name,
-    const std::unordered_map<const paddle::framework::Variable*, std::string>*
+    const std::unordered_map<pir::Value, std::string>& value_2_var_name,
+    const std::unordered_map<const paddle::framework::Variable*, std::string>&
         variable_2_var_name) {
   std::stringstream ss;
   for (const auto& op : block) {
@@ -1272,10 +1272,10 @@ void PrintValuesAndVariables(
     std::string ret_variable_str = "Variable: (";
     if (!op->results().empty()) {
       for (auto& out_value : op->results()) {
-        if ((*value_2_var_name).count(out_value)) {
-          auto& var_name = (*value_2_var_name).at(out_value);
+        if (value_2_var_name.count(out_value)) {
+          auto& var_name = value_2_var_name.at(out_value);
           const paddle::framework::Variable* out_variable =
-              GetVariableByName(var_name, *variable_2_var_name);
+              GetVariableByName(var_name, variable_2_var_name);
           ss.str("");
           ss << out_value.impl();
           ret_value_str +=
@@ -1319,10 +1319,10 @@ void PrintValuesAndVariables(
     if (!op->operands().empty()) {
       for (auto& input : op->operands()) {
         ::pir::Value in_value = input.source();
-        if ((*value_2_var_name).count(in_value)) {
-          auto& var_name = (*value_2_var_name).at(in_value);
+        if (value_2_var_name.count(in_value)) {
+          auto& var_name = value_2_var_name.at(in_value);
           const paddle::framework::Variable* in_variable =
-              GetVariableByName(var_name, *variable_2_var_name);
+              GetVariableByName(var_name, variable_2_var_name);
           ss.str("");
           ss << in_value.impl();
           ret_value_str +=
diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h
index a823cb3bc9cf0..213804ec980f6 100644
--- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h
+++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h
@@ -136,8 +136,8 @@ const paddle::framework::Variable* GetVariableByName(
 
 void PrintValuesAndVariables(
     const pir::Block& block,
-    const std::unordered_map<pir::Value, std::string>* value_2_var_name,
-    const std::unordered_map<const paddle::framework::Variable*, std::string>*
+    const std::unordered_map<pir::Value, std::string>& value_2_var_name,
+    const std::unordered_map<const paddle::framework::Variable*, std::string>&
         variable_2_var_name);
 
 const std::vector<std::string> GetInstructionCallStack(
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index 2dc6181180c9d..ee2c4c3ea62ed 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -42,12 +42,12 @@
 #include "paddle/fluid/framework/new_executor/instruction/cond_instruction.h"
 #include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h"
 #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h"
+#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"
 #include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h"
 #include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h"
 #include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h"
 #include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h"
 #include "paddle/fluid/pir/dialect/operator/utils/utils.h"
-#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h"
 #include "paddle/pir/core/builtin_attribute.h"
 
 PHI_DECLARE_bool(enable_new_ir_in_executor);
@@ -70,6 +70,7 @@ NewIRInterpreter::NewIRInterpreter(
       ir_stream_analyzer_(place),
       fetch_var_names_(fetch_var_names) {
   VLOG(4) << "NewIRInterpreter(): " << this << " on " << place_;
+
   static_build_ = FLAGS_new_executor_static_build &&
                   !FLAGS_new_executor_use_cuda_graph &&
                   !execution_config.used_for_control_flow_op;
@@ -109,18 +110,73 @@ NewIRInterpreter::NewIRInterpreter(
 
   PrepareForCUDAGraphCapture();
 
+  value_exe_info_ = std::make_shared<ValueExecutionInfo>(InnerScope());
+
   std::stringstream ss;
   ss << this;
-  ::pir::BuildScope(*ir_block_,
-                    InnerScope(),
-                    ss.str(),
-                    &value_2_var_name_,
-                    &variable_2_var_name_,
-                    &var_name_2_id_,
-                    &variable_list_,
-                    &sub_blocks_);
-
-  interpreter::BuildId2VarName(var_name_2_id_, &id_2_var_name_);
+  ::pir::BuildScope(*ir_block_, ss.str(), &sub_blocks_, value_exe_info_.get());
+}
+
+NewIRInterpreter::NewIRInterpreter(
+    const platform::Place& place,
+    const std::vector<std::string>& fetch_var_names,
+    const ::pir::Block* ir_block,
+    framework::Scope* scope,
+    std::shared_ptr<ValueExecutionInfo> value_exe_info,
+    const ExecutionConfig& execution_config)
+    : place_(place),
+      execution_config_(execution_config),
+      var_scope_(scope),
+      scope_(scope),
+      ir_block_(ir_block),
+      ir_stream_analyzer_(place),
+      fetch_var_names_(fetch_var_names) {
+  VLOG(4) << "NewIRInterpreter(): " << this << " on " << place_;
+
+  static_build_ = FLAGS_new_executor_static_build &&
+                  !FLAGS_new_executor_use_cuda_graph &&
+                  !execution_config.used_for_control_flow_op;
+  //    &&interpreter::BlockCanBeStaticBuilt(block);
+  static_build_ = true;
+
+  exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
+  completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);
+
+  dependecy_count_ = std::make_shared<std::vector<size_t>>();
+
+  if (!FLAGS_new_executor_use_local_scope) {
+    execution_config_.create_local_scope = false;
+  }
+  if (execution_config_.create_local_scope) {
+    auto local_scope = &scope_->NewScope();
+    local_scope_ = local_scope;
+    VLOG(6) << "new ir interpretercore scope: " << scope_ << "\t"
+            << "; local scope: " << local_scope_;
+  }
+  // TODO(zhangbo): delete var_scope
+  var_scope_.SetLocalScope(local_scope_);
+
+  execution_config_.AnalyzeThreadPoolConfig(place, 1);
+  execution_config_.Log(/*log_level=*/8);
+
+  ir_instruction_scheduling_priority_less = [this](size_t lhs, size_t rhs) {
+    SchedulingPriority lhs_scheduling_priority =
+        vec_instruction_base_[lhs]->GetSchedulingPriority();
+    SchedulingPriority rhs_scheduling_priority =
+        vec_instruction_base_[rhs]->GetSchedulingPriority();
+    if (lhs_scheduling_priority == rhs_scheduling_priority) {
+      return lhs < rhs;
+    }
+    return lhs_scheduling_priority > rhs_scheduling_priority;
+  };
+
+  PrepareForCUDAGraphCapture();
+
+  value_exe_info_ = value_exe_info;
+
+  std::stringstream ss;
+  ss << this;
+  ::pir::BuildScope(*ir_block_, ss.str(), &sub_blocks_, value_exe_info_.get());
 }
 
 NewIRInterpreter::~NewIRInterpreter() {
@@ -136,14 +192,6 @@ NewIRInterpreter::~NewIRInterpreter() {
 #endif
 }
 
-int NewIRInterpreter::GetIdByName(const std::string& name) const {
-  auto it = var_name_2_id_.find(name);
-  if (it != var_name_2_id_.end()) {
-    return it->second;
-  }
-  return -1;
-}
-
 void NewIRInterpreter::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
   PADDLE_THROW(platform::errors::Unimplemented(
       "SetCopyProgram is not implemented in NewIRInterpreter."));
@@ -184,35 +232,23 @@ const VariableScope* NewIRInterpreter::GetVariableScope() const {
 void NewIRInterpreter::reset_scope(Scope* new_scope) {
   var_scope_.SetScope(new_scope);
   scope_ = new_scope;
-  for (size_t i = 0; i < variable_list_.size(); i++) {
-    const auto& var_name = GetNameById(static_cast<int>(i));
-    variable_list_[i] = new_scope->FindVar(var_name);
+  for (size_t i = 0; i < value_exe_info_->GetVarList().size(); i++) {
+    const auto& var_name = value_exe_info_->GetNameById(static_cast<int>(i));
+    value_exe_info_->ResetVarList(i, new_scope->FindVar(var_name));
   }
   // The index should be assured valid, cause the InterpreterCore may not be
   // fully built, but was still cached and used. For example, see unit test
   // `test_assert.py`, it may exit before `NewIRInterpreter::Convert`,
   // but still was cached and used by later tests.
-  for (size_t i = 0; i < std::min(refs_.size(), variable_list_.size()); i++) {
-    refs_[i]->ResetVariable(variable_list_[i]);
+  for (size_t i = 0;
+       i < std::min(refs_.size(), value_exe_info_->GetVarList().size());
+       i++) {
+    refs_[i]->ResetVariable(value_exe_info_->GetVarList()[i]);
   }
 }
 
 const Scope* NewIRInterpreter::local_scope() const { return local_scope_; }
 
-std::string NewIRInterpreter::GetNameById(int id) const {
-  // NOTE(zhiqiu): do not use vec_meta_info_[id].vardesc_->Name() since
-  // vec_meta_info_[id] may be nullptr,
-  // typically when the target variable is not existed in the original program
-  // desc, but created by interpretercore.
-  // For example, created and used by d2h_copy or h2d_copy operator.
-
-  auto it = id_2_var_name_.find(id);
-  if (it != id_2_var_name_.end()) {
-    return it->second;
-  }
-  return "";
-}
-
 void NewIRInterpreter::ShareWorkQueueFrom(InterpreterBaseImpl* src) {
   async_work_queue_ = reinterpret_cast<NewIRInterpreter*>(src)->GetWorkQueue();
   VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << src
@@ -348,7 +384,7 @@ Scope* NewIRInterpreter::InnerScope() {
 }
 
 std::string NewIRInterpreter::GetNameByValue(::pir::Value value) const {
-  return value_2_var_name_.at(value);
+  return value_exe_info_->GetValue2VarName().at(value);
 }
 
 void NewIRInterpreter::UpdateSyncOpNum() {
@@ -534,9 +570,7 @@ void NewIRInterpreter::BuildInstruction() {
                                             op,
                                             scope_,
                                             local_scope_,
-                                            value_2_var_name_,
-                                            var_name_2_id_,
-                                            variable_2_var_name_,
+                                            value_exe_info_.get(),
                                             sub_blocks_));
     } else if (op->dialect()->name() == "pd_kernel") {
       auto op_name = op->attributes()
@@ -551,24 +585,26 @@ void NewIRInterpreter::BuildInstruction() {
 
       if (op->name().compare(paddle::dialect::LegacyKernelOp::name()) == 0) {
         vec_instruction_base_.emplace_back(
-            std::make_unique<LegacyKernelInstruction>(op_idx++,
-                                                      place_,
-                                                      op,
-                                                      scope_,
-                                                      local_scope_,
-                                                      value_2_var_name_,
-                                                      var_name_2_id_,
-                                                      variable_2_var_name_));
+            std::make_unique<LegacyKernelInstruction>(
+                op_idx++,
+                place_,
+                op,
+                scope_,
+                local_scope_,
+                value_exe_info_->GetValue2VarName(),
+                value_exe_info_->GetVarName2Id(),
+                value_exe_info_->GetVar2VarName()));
       } else {
         vec_instruction_base_.emplace_back(
-            std::make_unique<PhiKernelInstruction>(op_idx++,
-                                                   place_,
-                                                   op,
-                                                   scope_,
-                                                   local_scope_,
-                                                   value_2_var_name_,
-                                                   var_name_2_id_,
-                                                   variable_2_var_name_));
+            std::make_unique<PhiKernelInstruction>(
+                op_idx++,
+                place_,
+                op,
+                scope_,
+                local_scope_,
+                value_exe_info_->GetValue2VarName(),
+                value_exe_info_->GetVarName2Id(),
+                value_exe_info_->GetVar2VarName()));
       }
 #ifdef PADDLE_WITH_CINN
     } else if (op->dialect()->name() == "cinn_runtime") {
@@ -588,14 +624,15 @@ std::string NewIRInterpreter::DebugValueInfo() {
      << "value -> var_name -> id -> variable*"
      << "\n";
 
-  interpreter::PrintValuesAndVariables(
-      *ir_block_, &value_2_var_name_, &variable_2_var_name_);
+  interpreter::PrintValuesAndVariables(*ir_block_,
+                                       value_exe_info_->GetValue2VarName(),
+                                       value_exe_info_->GetVar2VarName());
 
-  for (auto kv : value_2_var_name_) {
+  for (auto kv : value_exe_info_->GetValue2VarName()) {
     PADDLE_ENFORCE((bool)kv.first,
                    platform::errors::PreconditionNotMet(
                        "vlaue(%s) should not be nullptr", kv.second));
-    PADDLE_ENFORCE(var_name_2_id_.count(kv.second) > 0,
+    PADDLE_ENFORCE(value_exe_info_->GetVarName2Id().count(kv.second) > 0,
                    platform::errors::PreconditionNotMet(
                        "var(%s) should exist in var_name_2_id_", kv.second));
     auto* var = InnerScope()->FindVar(kv.second);
@@ -604,7 +641,8 @@ std::string NewIRInterpreter::DebugValueInfo() {
         platform::errors::PreconditionNotMet(
             "var(%s) should exist in scope (%p)", kv.second, InnerScope()));
     os << kv.first.impl() << " -> " << kv.second << " -> "
-       << var_name_2_id_.at(kv.second) << " -> " << var << "\n";
+       << value_exe_info_->GetVarName2Id().at(kv.second) << " -> " << var
+       << "\n";
   }
   return os.str();
 }
@@ -743,15 +781,16 @@ void NewIRInterpreter::RecordStreamForGC(InstructionBase* instr) {
    * supported later.
    */
   for (int var_id : instr->GCCheckVars()) {
-    VLOG(4) << "GC sync " << GetNameById(var_id);
+    VLOG(4) << "GC sync " << value_exe_info_->GetNameById(var_id);
 
     // persistable var will be ignore while GC
-    if (parameter_var_names_.count(GetNameById(var_id))) {
-      VLOG(4) << GetNameById(var_id) << " is a parameter, skip gc";
+    if (parameter_var_names_.count(value_exe_info_->GetNameById(var_id))) {
+      VLOG(4) << value_exe_info_->GetNameById(var_id)
+              << " is a parameter, skip gc";
       continue;
     }
 
-    paddle::framework::Variable* var = variable_list_[var_id];
+    paddle::framework::Variable* var = value_exe_info_->GetVarList()[var_id];
     if (var == nullptr) {
       continue;
     }
@@ -791,19 +830,20 @@ void NewIRInterpreter::CheckGC(InstructionBase* instr) {
 #endif
 
   for (auto var_id : instr->GCCheckVars()) {
-    VLOG(4) << "GC:" << GetNameById(static_cast<int>(var_id))
+    VLOG(4) << "GC:" << value_exe_info_->GetNameById(static_cast<int>(var_id))
             << ", id:" << var_id << ", ref:" << refs_[var_id]->DynamicRef();
     bool is_ready = refs_[var_id]->CheckAndDecrease();
     // ignore all persistable var while GCphi
-    if (parameter_var_names_.count(GetNameById(static_cast<int>(var_id)))) {
-      VLOG(4) << GetNameById(static_cast<int>(var_id))
+    if (parameter_var_names_.count(
+            value_exe_info_->GetNameById(static_cast<int>(var_id)))) {
+      VLOG(4) << value_exe_info_->GetNameById(static_cast<int>(var_id))
               << " is a parameter, skip gc";
       continue;
     }
 
     if (is_ready) {
       VLOG(6) << "Async delete variable with name : "
-              << GetNameById(static_cast<int>(var_id));
+              << value_exe_info_->GetNameById(static_cast<int>(var_id));
       gc_->Add(refs_[var_id]->Var(), instr);
     }
   }
@@ -838,13 +878,14 @@ void NewIRInterpreter::CalculateLastLiveOps() {
 
     for (auto var_id : gc_check_vars) {
       Scope* inner_scope = InnerScope();
-      paddle::framework::Variable* var =
-          inner_scope->FindVar(GetNameById(static_cast<int>(var_id)));
+      paddle::framework::Variable* var = inner_scope->FindVar(
+          value_exe_info_->GetNameById(static_cast<int>(var_id)));
       if (var->IsType<phi::DenseTensor>() || var->IsType<phi::SelectedRows>() ||
           var->IsType<LoDTensorArray>()) {
         last_live_ops_[var_id].insert(op_idx);
       } else {
-        VLOG(4) << "not clear " << GetNameById(static_cast<int>(var_id))
+        VLOG(4) << "not clear "
+                << value_exe_info_->GetNameById(static_cast<int>(var_id))
                 << " after " << instr->Name() << " because its type is "
                 << framework::ToTypeName(var->Type());
       }
@@ -852,7 +893,7 @@ void NewIRInterpreter::CalculateLastLiveOps() {
   }
   // clear the last_live_ops list for all vars in skip_gc_vars
   for (const std::string& skip_gc_var : execution_config_.skip_gc_vars) {
-    int var_id = GetIdByName(skip_gc_var);
+    int var_id = value_exe_info_->GetIdByName(skip_gc_var);
     if (var_id != -1) {
       last_live_ops_[var_id].clear();
       VLOG(8) << "Skip gc for var: " << skip_gc_var;
@@ -867,7 +908,7 @@ void NewIRInterpreter::CalculateLastLiveOps() {
   // c = op2(a, b)
   // in this case, a is the input of op1 and op2, we only need to check
   // a after op2, because op2 always uses a after op1.
-  var_ref_count_.resize(variable_list_.size());
+  var_ref_count_.resize(value_exe_info_->GetVarList().size());
   VLOG(4) << "last_live_ops_.size() : " << last_live_ops_.size();
   for (auto kv : last_live_ops_) {
     for (auto val : kv.second) {
@@ -890,8 +931,8 @@ void NewIRInterpreter::CalculateLastLiveOps() {
       }
       if (not_before_any) {
         VLOG(6) << "last live op of var " << i << " "
-                << GetNameById(static_cast<int>(i)) << " : " << item << " "
-                << vec_instruction_base_[item]->Name();
+                << value_exe_info_->GetNameById(static_cast<int>(i)) << " : "
+                << item << " " << vec_instruction_base_[item]->Name();
         minumum_last_live_ops.insert(item);
         vec_instruction_base_[item]->AddGCCheckVar(i);
       }
@@ -903,9 +944,9 @@ void NewIRInterpreter::CalculateLastLiveOps() {
   for (auto& dep : *dependecy_count_) {
     deps_.emplace_back(std::make_shared<interpreter::OpDepInfo>(dep));
   }
-  for (size_t i = 0; i < variable_list_.size(); ++i) {
+  for (size_t i = 0; i < value_exe_info_->GetVarList().size(); ++i) {
     refs_.emplace_back(std::make_shared<interpreter::VarRefInfo>(
-        var_ref_count_[i], variable_list_[i]));
+        var_ref_count_[i], value_exe_info_->GetVarList()[i]));
   }
 }
 
@@ -917,7 +958,7 @@ void NewIRInterpreter::ConstructEventForJitInput() {
           platform::is_gpu_place(place_)) {
         for (auto& item : inst->Inputs()) {
           for (auto var_id : item.second) {
-            auto name = GetNameById(var_id);
+            auto name = value_exe_info_->GetNameById(var_id);
             if (JitInputVars().count(name)) {
               auto device_event = std::make_shared<platform::DeviceEvent>(
                   place_, platform::GenerateDeviceEventFlag());
@@ -1378,7 +1419,8 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) {
       interpreter::LogDeviceMemoryStats(place_);
     }
     VLOG(4) << place_ << " "
-            << instr_node->DebugStringEx(scope_, value_2_var_name_);
+            << instr_node->DebugStringEx(scope_,
+                                         value_exe_info_->GetValue2VarName());
     VLOG(5) << "after run kernel";
     instr_node->RecordEvent(place_);
   } catch (platform::EnforceNotMet& ex) {
@@ -1435,7 +1477,7 @@ void NewIRInterpreter::Build(
 }
 
 ::pir::Value NewIRInterpreter::GetValueByName(const std::string& var_name) {
-  for (auto kv : value_2_var_name_) {
+  for (auto kv : value_exe_info_->GetValue2VarName()) {
     if (kv.second == var_name) {
       return kv.first;
     }
@@ -1445,7 +1487,7 @@ ::pir::Value NewIRInterpreter::GetValueByName(const std::string& var_name) {
 
 void NewIRInterpreter::SolvePersisableVarNames() {
   VLOG(6) << "SolvePersisableVarNames";
-  for (auto kv : value_2_var_name_) {
+  for (auto kv : value_exe_info_->GetValue2VarName()) {
     ::pir::Value value = kv.first;
     const std::string& var_name = kv.second;
     ::pir::OpResult result = value.dyn_cast<::pir::OpResult>();
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
index c05eb6770b2ba..04a149bb6d692 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
@@ -24,7 +24,7 @@ class Block;
 
 namespace paddle {
 namespace framework {
-
+class ValueExecutionInfo;
 class NewIRInterpreter : public InterpreterBaseImpl {
   using ExecutionConfig = interpreter::ExecutionConfig;
   using InstructionSchedulingPriorityLess = std::function<bool(size_t, size_t)>;
@@ -40,6 +40,13 @@ class NewIRInterpreter : public InterpreterBaseImpl {
                    Scope* scope,
                    const ExecutionConfig& execution_config = ExecutionConfig());
 
+  NewIRInterpreter(const platform::Place& place,
+                   const std::vector<std::string>& fetch_var_names,
+                   const ::pir::Block* ir_block,
+                   Scope* scope,
+                   std::shared_ptr<ValueExecutionInfo> value_exe_info,
+                   const ExecutionConfig& execution_config = ExecutionConfig());
+
   ~NewIRInterpreter();
 
   paddle::framework::FetchList Run(
@@ -77,10 +84,6 @@ class NewIRInterpreter : public InterpreterBaseImpl {
     hookfuncs_ = hookfuncs;
   }
 
-  std::string GetNameById(int id) const;
-
-  int GetIdByName(const std::string& name) const;
-
   std::string GetNameByValue(::pir::Value value) const;
 
  private:
@@ -210,15 +213,9 @@ class NewIRInterpreter : public InterpreterBaseImpl {
 
   std::vector<std::unique_ptr<InstructionBase>> vec_instruction_base_;
 
-  std::unordered_map<::pir::Value, std::string> value_2_var_name_;
-
-  std::unordered_map<const paddle::framework::Variable*, std::string>
-      variable_2_var_name_;
-
-  std::map<std::string, int> var_name_2_id_;
-  std::unordered_map<int, std::string> id_2_var_name_;
+  // value execution info
+  std::shared_ptr<ValueExecutionInfo> value_exe_info_;
 
-  std::vector<Variable*> variable_list_;
   std::map<pir::Block*, paddle::framework::Scope*> sub_blocks_;
 
   std::vector<int> var_ref_count_;
diff --git a/paddle/fluid/pir/phi_kernel_adaptor/CMakeLists.txt b/paddle/fluid/framework/new_executor/pir_adaptor/CMakeLists.txt
similarity index 64%
rename from paddle/fluid/pir/phi_kernel_adaptor/CMakeLists.txt
rename to paddle/fluid/framework/new_executor/pir_adaptor/CMakeLists.txt
index e1f8db179be6b..f66f96b7409fe 100644
--- a/paddle/fluid/pir/phi_kernel_adaptor/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/pir_adaptor/CMakeLists.txt
@@ -1,7 +1,7 @@
 # All source files of pd_op_dialect, except for the source file of op, which is generated in the compilation directory.
-file(GLOB PHI_KERNEL_ADAPTOR_SRCS "*.cc")
+file(GLOB PIR_ADAPTOR_SRCS "*.cc")
 
 cc_library(
-  phi_kernel_adaptor
-  SRCS ${PHI_KERNEL_ADAPTOR_SRCS}
+  pir_adaptor
+  SRCS ${PIR_ADAPTOR_SRCS}
   DEPS program_translator pd_kernel_dialect)
diff --git a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
similarity index 73%
rename from paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc
rename to paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
index 437523e41bf3e..ef7a9c763e753 100644
--- a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc
+++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h"
+#include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h"
 #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
@@ -45,6 +45,89 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/pir/dialect/operator/ir/manual_op.h"
 
+namespace paddle {
+namespace framework {
+std::shared_ptr<ValueExecutionInfo> ValueExecutionInfo::NewChild(Scope* scope) {
+  std::shared_ptr<ValueExecutionInfo> info =
+      std::make_shared<ValueExecutionInfo>(scope);
+  info->parent_ = this;
+  return info;
+}
+
+void ValueExecutionInfo::Add(::pir::Value value, std::string var_name) {
+  auto* var = scope_->FindVar(var_name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Cannot find %s in scope.", var_name));
+
+  if (value_2_var_name_.count(value) == 0) {
+    value_2_var_name_.emplace(value, var_name);
+  }
+
+  var_2_var_name_.emplace(var, var_name);
+
+  if (var_name_2_id_.count(var_name) == 0) {
+    auto id = var_name_2_id_.size();
+    var_name_2_id_.emplace(var_name, id);
+    id_2_var_name_.emplace(id, var_name);
+    var_list_.push_back(var);
+  }
+
+  PADDLE_ENFORCE_EQ(
+      var_list_.size(),
+      var_name_2_id_.size(),
+      paddle::platform::errors::InvalidArgument(
+          "The size of variable_list and var_name_2_id map should be equal"));
+}
+
+void ValueExecutionInfo::Rename(pir::Value value,
+                                std::string new_name,
+                                std::string orig_name) {
+  value_2_var_name_[value] = new_name;
+
+  for (auto kv : value_2_var_name_) {
+    if (kv.second == orig_name) {
+      value_2_var_name_[kv.first] = new_name;
+    }
+  }
+
+  for (auto kv : var_2_var_name_) {
+    if (kv.second == orig_name) {
+      var_2_var_name_[kv.first] = new_name;
+    }
+  }
+
+  for (auto kv : var_name_2_id_) {
+    if (kv.first == orig_name) {
+      var_name_2_id_.emplace(new_name, kv.second);
+      id_2_var_name_[kv.second] = new_name;
+    }
+  }
+  var_name_2_id_.erase(orig_name);
+}
+
+int ValueExecutionInfo::GetIdByName(const std::string& name) const {
+  auto it = var_name_2_id_.find(name);
+  if (it != var_name_2_id_.end()) {
+    return it->second;
+  }
+  return -1;
+}
+
+std::string ValueExecutionInfo::GetNameById(int id) const {
+  // NOTE(zhiqiu): do not use vec_meta_info_[id].vardesc_->Name() since
+  // vec_meta_info_[id] may be nullptr,
+  // typically when the target variable is not existed in the original program
+  // desc, but created by interpretercore.
+  // For example, created and used by d2h_copy or h2d_copy operator.
+  auto it = id_2_var_name_.find(id);
+  if (it != id_2_var_name_.end()) {
+    return it->second;
+  }
+  return "";
+}
+}  // namespace framework
+}  // namespace paddle
+
 namespace pir {
 
 const std::unordered_set<std::string> SpecialOps = {"pd_op.feed",
@@ -117,14 +200,9 @@ using VariableNameMap =
 
 paddle::framework::Variable* CreateVar(
     pir::Value value,
-    paddle::framework::Scope* inner_scope,
     const std::string& var_name_prefix,
     bool force_persisable,
-    std::unordered_map<pir::Value, std::string>* value_2_var_name,
-    std::unordered_map<const paddle::framework::Variable*, std::string>*
-        variable_2_var_name,
-    std::map<std::string, int>* var_name_2_id,
-    std::vector<paddle::framework::Variable*>* variable_list) {
+    paddle::framework::ValueExecutionInfo* value_exe_info) {
   Operation* def_op = value.dyn_cast<OpResult>().owner();
   bool is_persisable = false;
   if (def_op->isa<::pir::SetParameterOp>()) {
@@ -134,22 +212,29 @@ paddle::framework::Variable* CreateVar(
   paddle::framework::Variable* var = nullptr;
 
   std::string name = var_name_prefix + "_inner_var_" +
-                     std::to_string(variable_2_var_name->size());
+                     std::to_string(value_exe_info->GetVar2VarName().size());
 
   if (force_persisable || is_persisable) {
-    VLOG(6) << "Create var: " << name << " in scope " << inner_scope->root();
-    var = const_cast<paddle::framework::Scope*>(inner_scope->root())->Var(name);
+    VLOG(6) << "Create var: " << name << " in scope "
+            << value_exe_info->GetScope()->root();
+    var = const_cast<paddle::framework::Scope*>(
+              value_exe_info->GetScope()->root())
+              ->Var(name);
   } else {
-    VLOG(6) << "Create var: " << name << " in scope " << inner_scope;
-    var = inner_scope->Var(name);
-  }
-  AddNewData(value,
-             name,
-             var,
-             value_2_var_name,
-             variable_2_var_name,
-             var_name_2_id,
-             variable_list);
+    VLOG(6) << "Create var: " << name << " in scope "
+            << value_exe_info->GetScope();
+    var = value_exe_info->GetScope()->Var(name);
+  }
+  // AddNewData(value,
+  //            name,
+  //            var,
+  //            value_2_var_name,
+  //            variable_2_var_name,
+  //            var_name_2_id,
+  //            variable_list);
+
+  value_exe_info->Add(value, name);
+
   return var;
 }
 
@@ -175,25 +260,14 @@ void CheckInputVars(
 }
 
 void BuildValue(pir::Value value,
-                paddle::framework::Scope* inner_scope,
                 const std::string& var_name_prefix,
-                std::unordered_map<pir::Value, std::string>* value_2_var_name,
-                std::unordered_map<const paddle::framework::Variable*,
-                                   std::string>* variable_2_var_name,
-                std::map<std::string, int>* var_name_2_id,
-                std::vector<paddle::framework::Variable*>* variable_list) {
+                paddle::framework::ValueExecutionInfo* value_exe_info) {
   paddle::framework::Variable* var = nullptr;
-  if (value_2_var_name->find(value) != value_2_var_name->end()) {
-    var = inner_scope->FindVar(value_2_var_name->at(value));
+  auto& value_2_var_name = value_exe_info->GetValue2VarName();
+  if (value_2_var_name.find(value) != value_2_var_name.end()) {
+    var = value_exe_info->GetScope()->FindVar(value_2_var_name.at(value));
   } else {
-    var = CreateVar(value,
-                    inner_scope,
-                    var_name_prefix,
-                    false,
-                    value_2_var_name,
-                    variable_2_var_name,
-                    var_name_2_id,
-                    variable_list);
+    var = CreateVar(value, var_name_prefix, false, value_exe_info);
   }
   // Only support DenseTensor or Vector<DenseTensor>
   if (!value.type() ||
@@ -211,14 +285,7 @@ void BuildValue(pir::Value value,
                      paddle::platform::errors::Fatal(
                          "Element of VectorType output only support "
                          "DenseTensorType"));
-      auto var_i = CreateVar(value,
-                             inner_scope,
-                             var_name_prefix,
-                             false,
-                             value_2_var_name,
-                             variable_2_var_name,
-                             var_name_2_id,
-                             variable_list);
+      auto var_i = CreateVar(value, var_name_prefix, false, value_exe_info);
 
       var_i->GetMutable<phi::DenseTensor>();
       tensor_array->emplace_back(var_i);
@@ -231,14 +298,9 @@ void BuildValue(pir::Value value,
 
 void HandleForSpecialOp(
     pir::Operation* op,
-    paddle::framework::Scope* inner_scope,
     const std::string& var_name_prefix,
-    std::unordered_map<pir::Value, std::string>* value_2_var_name,
-    std::unordered_map<const paddle::framework::Variable*, std::string>*
-        variable_2_var_name,
-    std::map<std::string, int>* var_name_2_id,
-    std::vector<paddle::framework::Variable*>* variable_list,
-    std::map<pir::Block*, paddle::framework::Scope*>* sub_blocks) {
+    std::map<pir::Block*, paddle::framework::Scope*>* sub_blocks,
+    paddle::framework::ValueExecutionInfo* value_exe_info) {
   std::string op_name = op->name();
   if (op->attributes().count("op_name")) {
     op_name =
@@ -251,56 +313,54 @@ void HandleForSpecialOp(
         op->attributes().at("name").dyn_cast<pir::StrAttribute>().AsString();
 
     auto fetch_var_name = fetch_src_name + "@fetch";
-    auto* var = const_cast<paddle::framework::Scope*>(inner_scope->root())
+    auto* var = const_cast<paddle::framework::Scope*>(
+                    value_exe_info->GetScope()->root())
                     ->Var(fetch_var_name);
     var->GetMutable<phi::DenseTensor>();
     auto value = op->result(0);
 
-    AddNewData(value,
-               fetch_var_name,
-               var,
-               value_2_var_name,
-               variable_2_var_name,
-               var_name_2_id,
-               variable_list);
+    // AddNewData(value,
+    //            fetch_var_name,
+    //            var,
+    //            value_2_var_name,
+    //            variable_2_var_name,
+    //            var_name_2_id,
+    //            variable_list);
+    value_exe_info->Add(value, fetch_var_name);
   }
 
   if (op_name == "pd_op.feed" || op_name == "pd_op.data") {
     VLOG(6) << "Handle for" << op_name;
     auto value = op->result(0);
-    VLOG(6) << "link feed output to feed in variable" << inner_scope;
+    VLOG(6) << "link feed output to feed in variable"
+            << value_exe_info->GetScope();
 
     std::string name =
         op->attributes().at("name").dyn_cast<pir::StrAttribute>().AsString();
-    paddle::framework::Variable* var = inner_scope->Var(name);
+    paddle::framework::Variable* var = value_exe_info->GetScope()->Var(name);
     PADDLE_ENFORCE(var,
                    paddle::platform::errors::InvalidArgument(
                        "The variable %s shoud exist", name));
 
-    AddNewData(value,
-               name,
-               var,
-               value_2_var_name,
-               variable_2_var_name,
-               var_name_2_id,
-               variable_list);
+    // AddNewData(value,
+    //            name,
+    //            var,
+    //            value_2_var_name,
+    //            variable_2_var_name,
+    //            var_name_2_id,
+    //            variable_list);
+    value_exe_info->Add(value, name);
   }
 
   if (op_name == "builtin.combine") {
     auto out_value = op->result(0);
 
     paddle::framework::Variable* var = nullptr;
-    if (value_2_var_name->find(out_value) != value_2_var_name->end()) {
-      var = inner_scope->FindVar(value_2_var_name->at(out_value));
+    auto& value_2_var_name = value_exe_info->GetValue2VarName();
+    if (value_2_var_name.find(out_value) != value_2_var_name.end()) {
+      var = value_exe_info->GetScope()->FindVar(value_2_var_name.at(out_value));
     } else {
-      var = CreateVar(out_value,
-                      inner_scope,
-                      var_name_prefix,
-                      false,
-                      value_2_var_name,
-                      variable_2_var_name,
-                      var_name_2_id,
-                      variable_list);
+      var = CreateVar(out_value, var_name_prefix, false, value_exe_info);
     }
 
     auto tensor_array = var->GetMutable<paddle::framework::VariableRefArray>();
@@ -310,11 +370,11 @@ void HandleForSpecialOp(
     for (size_t i = 0; i < input_num; ++i) {
       auto value = op->operand_source(i);
       PADDLE_ENFORCE_EQ(
-          value_2_var_name->count(value),
+          value_2_var_name.count(value),
           true,
           phi::errors::PreconditionNotMet("can not found input of combine op"));
       tensor_array->emplace_back(
-          inner_scope->FindVar(value_2_var_name->at(value)));
+          value_exe_info->GetScope()->FindVar(value_2_var_name.at(value)));
     }
   }
 
@@ -327,7 +387,7 @@ void HandleForSpecialOp(
 
     auto value = op->operand_source(0);
     // change opreand name to param_name
-    auto orig_name = value_2_var_name->at(value);
+    auto orig_name = value_exe_info->GetValue2VarName().at(value);
 
     PADDLE_ENFORCE_NE(
         param_name,
@@ -335,19 +395,20 @@ void HandleForSpecialOp(
         phi::errors::PreconditionNotMet(
             "SetParamer param name should not equal with var name"));
 
-    if (inner_scope->root()->FindVar(param_name) == nullptr) {
-      const_cast<paddle::framework::Scope*>(inner_scope->root())
+    if (value_exe_info->GetScope()->root()->FindVar(param_name) == nullptr) {
+      const_cast<paddle::framework::Scope*>(value_exe_info->GetScope()->root())
           ->Rename(orig_name, param_name);
       VLOG(6) << "set_parameter rename var: " << orig_name << " -> "
               << param_name;
     }
 
-    RenameData(value,
-               param_name,
-               orig_name,
-               value_2_var_name,
-               variable_2_var_name,
-               var_name_2_id);
+    // RenameData(value,
+    //            param_name,
+    //            orig_name,
+    //            value_2_var_name,
+    //            variable_2_var_name,
+    //            var_name_2_id);
+    value_exe_info->Rename(value, param_name, orig_name);
   }
 
   if (op_name == "pd_op.shadow_output") {
@@ -357,18 +418,19 @@ void HandleForSpecialOp(
 
     auto value = op->operand_source(0);
     // change opreand name to param_name
-    auto orig_name = value_2_var_name->at(value);
+    auto orig_name = value_exe_info->GetValue2VarName().at(value);
 
-    if (inner_scope->root()->FindVar(var_name) == nullptr) {
-      const_cast<paddle::framework::Scope*>(inner_scope->root())
+    if (value_exe_info->GetScope()->root()->FindVar(var_name) == nullptr) {
+      const_cast<paddle::framework::Scope*>(value_exe_info->GetScope()->root())
           ->Rename(orig_name, var_name);
     }
-    RenameData(value,
-               var_name,
-               orig_name,
-               value_2_var_name,
-               variable_2_var_name,
-               var_name_2_id);
+    // RenameData(value,
+    //            var_name,
+    //            orig_name,
+    //            value_2_var_name,
+    //            variable_2_var_name,
+    //            var_name_2_id);
+    value_exe_info->Rename(value, var_name, orig_name);
   }
 
   if (op_name == "builtin.get_parameter") {
@@ -379,62 +441,67 @@ void HandleForSpecialOp(
                           .AsString();
     auto value = op->result(0);
 
-    paddle::framework::Variable* var = inner_scope->FindVar(param_name);
-    AddNewData(value,
-               param_name,
-               var,
-               value_2_var_name,
-               variable_2_var_name,
-               var_name_2_id,
-               variable_list);
+    // paddle::framework::Variable* var =
+    // value_exe_info->GetScope()->FindVar(param_name); AddNewData(value,
+    //            param_name,
+    //            var,
+    //            value_2_var_name,
+    //            variable_2_var_name,
+    //            var_name_2_id,
+    //            variable_list);
+    value_exe_info->Add(value, param_name);
   }
 
   if (op_name == "builtin.slice") {
     VLOG(6) << "Handle for builtin.slice";
     auto out_value = op->result(0);
     auto in_value = op->operand_source(0);
-    PADDLE_ENFORCE_EQ(value_2_var_name->count(in_value),
+    PADDLE_ENFORCE_EQ(value_exe_info->GetValue2VarName().count(in_value),
                       true,
                       phi::errors::PreconditionNotMet(
                           "input of buildin slice not in name map"));
 
     int index =
         op->attributes().at("index").dyn_cast<pir::Int32Attribute>().data();
-    auto in_var = inner_scope->FindVar(value_2_var_name->at(in_value));
+    auto in_var = value_exe_info->GetScope()->FindVar(
+        value_exe_info->GetValue2VarName().at(in_value));
     auto variable_array = in_var->Get<paddle::framework::VariableRefArray>();
 
     PADDLE_ENFORCE_EQ(
-        variable_2_var_name->count(variable_array[index]),
+        value_exe_info->GetVar2VarName().count(variable_array[index]),
         true,
         phi::errors::PreconditionNotMet("[%d] the variable in build slice "
                                         "input MUST in variable name map",
                                         index));
 
-    std::string var_name = variable_2_var_name->at(variable_array[index]);
-    value_2_var_name->emplace(out_value, var_name);
+    std::string var_name =
+        value_exe_info->GetVar2VarName().at(variable_array[index]);
+    value_exe_info->AddValue2VarName(out_value, var_name);
   }
 
   if (op_name == "builtin.split") {
     VLOG(6) << "Handle for builtin.split";
     auto in_value = op->operand_source(0);
-    PADDLE_ENFORCE_EQ(value_2_var_name->count(in_value),
+    PADDLE_ENFORCE_EQ(value_exe_info->GetValue2VarName().count(in_value),
                       true,
                       phi::errors::PreconditionNotMet(
                           "input of buildin split not in name map"));
 
-    auto in_var = inner_scope->FindVar(value_2_var_name->at(in_value));
+    auto in_var = value_exe_info->GetScope()->FindVar(
+        value_exe_info->GetValue2VarName().at(in_value));
     auto variable_array = in_var->Get<paddle::framework::VariableRefArray>();
 
     for (uint64_t idx = 0; idx < variable_array.size(); ++idx) {
       auto out_value = op->result(idx);
       PADDLE_ENFORCE_EQ(
-          variable_2_var_name->count(variable_array[idx]),
+          value_exe_info->GetVar2VarName().count(variable_array[idx]),
           true,
           phi::errors::PreconditionNotMet("[%d] the variable in build split "
                                           "input MUST in variable name map",
                                           idx));
-      std::string var_name = variable_2_var_name->at(variable_array[idx]);
-      value_2_var_name->emplace(out_value, var_name);
+      std::string var_name =
+          value_exe_info->GetVar2VarName().at(variable_array[idx]);
+      value_exe_info->AddValue2VarName(out_value, var_name);
     }
   }
 
@@ -445,36 +512,24 @@ void HandleForSpecialOp(
 
     auto false_block = if_op.false_block();
 
-    auto& true_branch_scope = inner_scope->NewScope();
+    auto& true_branch_scope = value_exe_info->GetScope()->NewScope();
     sub_blocks->emplace(true_block, &true_branch_scope);
 
-    auto& false_branch_scope = inner_scope->NewScope();
+    auto& false_branch_scope = value_exe_info->GetScope()->NewScope();
     sub_blocks->emplace(false_block, &false_branch_scope);
 
     for (size_t i = 0; i < if_op->num_results(); ++i) {
       // auto true_value = true_yeid_op->operand_source(i);
 
       auto if_op_out_value = if_op->result(i);
-      BuildValue(if_op_out_value,
-                 inner_scope,
-                 var_name_prefix,
-                 value_2_var_name,
-                 variable_2_var_name,
-                 var_name_2_id,
-                 variable_list);
+      BuildValue(if_op_out_value, var_name_prefix, value_exe_info);
     }
   }
 }
 
-void HandleForInplaceOp(
-    pir::Operation* op,
-    paddle::framework::Scope* inner_scope,
-    const std::string& var_name_prefix,
-    std::unordered_map<pir::Value, std::string>* value_2_var_name,
-    std::unordered_map<const paddle::framework::Variable*, std::string>*
-        variable_2_var_name,
-    std::map<std::string, int>* var_name_2_id,
-    std::vector<paddle::framework::Variable*>* variable_list) {
+void HandleForInplaceOp(pir::Operation* op,
+                        const std::string& var_name_prefix,
+                        paddle::framework::ValueExecutionInfo* value_exe_info) {
   if (op->num_results() < 1) return;
   pir::IrContext* ctx = pir::IrContext::Instance();
   std::string op_name = op->name();
@@ -498,26 +553,23 @@ void HandleForInplaceOp(
       const std::string& inplace_name = yaml_parser.InplaceName(value_name);
       pir::Value inplace_value =
           op->operand_source(yaml_parser.InputName2Id().at(inplace_name));
-      std::string var_name = value_2_var_name->at(inplace_value);
+      std::string var_name =
+          value_exe_info->GetValue2VarName().at(inplace_value);
       VLOG(4) << "inplace: " << value_name << " -> " << inplace_name
               << " (var: " << var_name << ")";
-      value_2_var_name->emplace(value, var_name);
+      value_exe_info->AddValue2VarName(value, var_name);
     } else if (yaml_parser.HasView(value_name)) {
       const std::string& view_name = yaml_parser.ViewName(value_name);
       pir::Value view_value =
           op->operand_source(yaml_parser.InputName2Id().at(view_name));
-      const std::string& var_name = value_2_var_name->at(view_value);
+      // const std::string& var_name = value_2_var_name->at(view_value);
+      const std::string& var_name =
+          value_exe_info->GetValue2VarName().at(view_value);
       VLOG(4) << "view: " << value_name << " -> " << view_name
               << " (var: " << var_name << ")";
-      value_2_var_name->emplace(value, var_name);
+      value_exe_info->AddValue2VarName(value, var_name);
     } else {
-      BuildValue(value,
-                 inner_scope,
-                 var_name_prefix,
-                 value_2_var_name,
-                 variable_2_var_name,
-                 var_name_2_id,
-                 variable_list);
+      BuildValue(value, var_name_prefix, value_exe_info);
     }
   }
 }
@@ -525,18 +577,14 @@ void HandleForInplaceOp(
 // NOTE(zhiqiu): the persistable is created in inner_scope's root, and other
 // is created in inner_scope.
 void BuildScope(const pir::Block& block,
-                paddle::framework::Scope* inner_scope,
                 const std::string& var_name_prefix,
-                std::unordered_map<pir::Value, std::string>* value_2_var_name,
-                std::unordered_map<const paddle::framework::Variable*,
-                                   std::string>* variable_2_var_name,
-                std::map<std::string, int>* var_name_2_id,
-                std::vector<paddle::framework::Variable*>* variable_list,
-                std::map<pir::Block*, paddle::framework::Scope*>* sub_blocks) {
+                std::map<pir::Block*, paddle::framework::Scope*>* sub_blocks,
+                paddle::framework::ValueExecutionInfo* value_exe_info) {
   VLOG(4) << "***** [before build] scope"
-          << "(" << inner_scope << ") ******\n"
+          << "(" << value_exe_info->GetScope() << ") ******\n"
           << paddle::framework::GenScopeTreeDebugInfo(
-                 const_cast<paddle::framework::Scope*>(inner_scope->root()));
+                 const_cast<paddle::framework::Scope*>(
+                     value_exe_info->GetScope()->root()));
 
   for (auto op : block) {
     std::string op_name = op->name();
@@ -548,18 +596,11 @@ void BuildScope(const pir::Block& block,
     }
     VLOG(4) << "build op:" << op_name;
     if (SpecialOps.count(op_name)) {
-      HandleForSpecialOp(op,
-                         inner_scope,
-                         var_name_prefix,
-                         value_2_var_name,
-                         variable_2_var_name,
-                         var_name_2_id,
-                         variable_list,
-                         sub_blocks);
+      HandleForSpecialOp(op, var_name_prefix, sub_blocks, value_exe_info);
       continue;
     }
 
-    CheckInputVars(op, op_name, *value_2_var_name);
+    CheckInputVars(op, op_name, value_exe_info->GetValue2VarName());
 
     if (op->num_results() < 1) continue;
     if (op->attributes().count("is_inplace") != 0 &&
@@ -567,31 +608,20 @@ void BuildScope(const pir::Block& block,
             .at("is_inplace")
             .dyn_cast<pir::BoolAttribute>()
             .data()) {
-      HandleForInplaceOp(op,
-                         inner_scope,
-                         var_name_prefix,
-                         value_2_var_name,
-                         variable_2_var_name,
-                         var_name_2_id,
-                         variable_list);
+      HandleForInplaceOp(op, var_name_prefix, value_exe_info);
       continue;
     } else {
       for (size_t i = 0; i < op->num_results(); ++i) {
-        BuildValue(op->result(i),
-                   inner_scope,
-                   var_name_prefix,
-                   value_2_var_name,
-                   variable_2_var_name,
-                   var_name_2_id,
-                   variable_list);
+        BuildValue(op->result(i), var_name_prefix, value_exe_info);
       }
     }
   }
 
   VLOG(4) << "***** [after build] scope"
-          << "(" << inner_scope << ") ******\n"
+          << "(" << value_exe_info->GetScope() << ") ******\n"
           << paddle::framework::GenScopeTreeDebugInfo(
-                 const_cast<paddle::framework::Scope*>(inner_scope->root()));
+                 const_cast<paddle::framework::Scope*>(
+                     value_exe_info->GetScope()->root()));
 }
 
 void BuildRuntimeContext(
diff --git a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h
similarity index 88%
rename from paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h
rename to paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h
index c3a409b8027b6..45e48832d58e6 100644
--- a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h
+++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h
@@ -42,16 +42,82 @@
 
 #include "glog/logging.h"
 
+namespace paddle {
+namespace framework {
+
+class CondInstruction;
+class ValueExecutionInfo {
+ public:
+  explicit ValueExecutionInfo(Scope* scope) : scope_(scope) {}
+
+  const ValueExecutionInfo* Parent() const { return parent_; }
+
+  Scope* GetScope() { return scope_; }
+
+  void Add(::pir::Value value, std::string var_name);
+
+  void Rename(pir::Value value, std::string new_name, std::string orig_name);
+
+  int GetIdByName(const std::string& name) const;
+
+  std::string GetNameById(int id) const;
+
+  const std::unordered_map<::pir::Value, std::string>& GetValue2VarName()
+      const {
+    return value_2_var_name_;
+  }
+
+  void AddValue2VarName(::pir::Value value, const std::string& var_name) {
+    value_2_var_name_.emplace(value, var_name);
+  }
+
+  const std::unordered_map<const paddle::framework::Variable*, std::string>&
+  GetVar2VarName() const {
+    return var_2_var_name_;
+  }
+
+  const std::map<std::string, int>& GetVarName2Id() const {
+    return var_name_2_id_;
+  }
+
+  const std::unordered_map<int, std::string>& GetId2VarName() const {
+    return id_2_var_name_;
+  }
+
+  const std::vector<Variable*>& GetVarList() const { return var_list_; }
+
+  void ResetVarList(int id, Variable* var) { var_list_[id] = var; }
+
+  friend class CondInstruction;
+
+ private:
+  std::shared_ptr<ValueExecutionInfo> NewChild(Scope* scope);
+
+  ValueExecutionInfo* parent_{nullptr};  // not owned
+
+  Scope* scope_{nullptr};  // not owned
+
+  std::unordered_map<::pir::Value, std::string> value_2_var_name_;
+
+  std::unordered_map<const paddle::framework::Variable*, std::string>
+      var_2_var_name_;
+
+  std::map<std::string, int> var_name_2_id_;
+
+  std::unordered_map<int, std::string> id_2_var_name_;
+
+  std::vector<Variable*> var_list_;
+};
+
+}  // namespace framework
+}  // namespace paddle
+
 namespace pir {
-void BuildScope(const pir::Block& block,
-                paddle::framework::Scope* inner_scope,
-                const std::string& var_name_prefix,
-                std::unordered_map<pir::Value, std::string>* value_2_var_name,
-                std::unordered_map<const paddle::framework::Variable*,
-                                   std::string>* variable_2_var_name,
-                std::map<std::string, int>* var_name_2_id,
-                std::vector<paddle::framework::Variable*>* variable_list,
-                std::map<pir::Block*, paddle::framework::Scope*>* sub_blocks);
+void BuildScope(
+    const pir::Block& block,
+    const std::string& var_name_prefix,
+    std::map<pir::Block*, paddle::framework::Scope*>* sub_blocks,
+    paddle::framework::ValueExecutionInfo* value_exe_info = nullptr);
 
 void BuildRuntimeContext(
     pir::Operation* op,
diff --git a/paddle/fluid/pir/CMakeLists.txt b/paddle/fluid/pir/CMakeLists.txt
index 655c6c2c8d012..1ff77c6d7187e 100644
--- a/paddle/fluid/pir/CMakeLists.txt
+++ b/paddle/fluid/pir/CMakeLists.txt
@@ -1,3 +1,2 @@
 add_subdirectory(dialect)
 add_subdirectory(transforms)
-add_subdirectory(phi_kernel_adaptor)
diff --git a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_adaptor.h b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_adaptor.h
deleted file mode 100644
index c8a72797318c7..0000000000000
--- a/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_adaptor.h
+++ /dev/null
@@ -1,145 +0,0 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/pir/dialect/operator/interface/infermeta.h"
-#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
-#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
-#include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
-#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h"
-#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h"
-#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h"
-#include "paddle/fluid/pir/dialect/operator/utils/utils.h"
-#include "paddle/phi/core/meta_tensor.h"
-#include "paddle/phi/infermeta/binary.h"
-#include "paddle/phi/kernels/elementwise_add_kernel.h"
-#include "paddle/pir/core/builtin_attribute.h"
-#include "paddle/pir/core/builtin_dialect.h"
-#include "paddle/pir/core/builtin_op.h"
-#include "paddle/pir/core/ir_context.h"
-#include "paddle/pir/core/program.h"
-#include "paddle/pir/core/utils.h"
-
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/framework/variable_helper.h"
-
-#include "paddle/phi/common/place.h"
-#include "paddle/phi/core/kernel_context.h"
-#include "paddle/phi/core/kernel_factory.h"
-
-#include "paddle/fluid/platform/init.h"
-
-#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h"
-#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
-
-#include "glog/logging.h"
-#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h"
-
-class PhiKernelAdaptor {
- public:
-  explicit PhiKernelAdaptor(paddle::framework::Scope* scope) : scope_(scope) {}
-
-  void run_kernel_prog(pir::Program* program) {
-    auto block = program->block();
-    std::unordered_map<pir::Value, std::string> value_2_var_name;
-    std::unordered_map<const paddle::framework::Variable*, std::string>
-        variable_2_var_name;
-    std::map<std::string, int> var_name_2_id;
-    std::vector<paddle::framework::Variable*> variable_list;
-    std::map<pir::Block*, paddle::framework::Scope*> sub_blocks;
-    std::stringstream ss;
-    ss << this;
-
-    BuildScope(*block,
-               scope_,
-               ss.str(),
-               &value_2_var_name,
-               &variable_2_var_name,
-               &var_name_2_id,
-               &variable_list,
-               &sub_blocks);
-    pir::IrContext* ctx = pir::IrContext::Instance();
-
-    ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
-
-    auto* dev_ctx = phi::DeviceContextPool::Instance().Get(phi::CPUPlace());
-    phi::Place cpu_place(phi::AllocationType::CPU);
-    for (auto it = block->begin(); it != block->end(); ++it) {
-      auto attr_map = (*it)->attributes();
-
-      auto op_name =
-          attr_map.at("op_name").dyn_cast<pir::StrAttribute>().AsString();
-
-      pir::OpInfo op1_info = ctx->GetRegisteredOpInfo(op_name);
-
-      auto impl =
-          op1_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>();
-      auto yaml_info = impl->get_op_info_();
-
-      auto attr_info = std::get<1>(yaml_info);
-
-      auto infer_meta_impl =
-          op1_info.GetInterfaceImpl<paddle::dialect::InferMetaInterface>();
-
-      phi::InferMetaContext ctx;
-
-      paddle::dialect::OpYamlInfoParser op_yaml_info_parser(yaml_info);
-      pir::BuildPhiContext<
-          phi::InferMetaContext,
-          phi::MetaTensor,
-          phi::MetaTensor,
-          paddle::small_vector<phi::MetaTensor, phi::kInputSmallVectorSize>,
-          paddle::small_vector<phi::MetaTensor, phi::kInputSmallVectorSize>,
-          false>(
-          (*it), value_2_var_name, scope_, nullptr, op_yaml_info_parser, &ctx);
-
-      infer_meta_impl->infer_meta_(&ctx);
-
-      auto kernel_name =
-          attr_map.at("kernel_name").dyn_cast<pir::StrAttribute>().AsString();
-      auto kernel_key = attr_map.at("kernel_key")
-                            .dyn_cast<paddle::dialect::KernelAttribute>()
-                            .data();
-
-      auto kernel_fn =
-          phi::KernelFactory::Instance().SelectKernel(kernel_name, kernel_key);
-
-      phi::KernelContext kernel_ctx(dev_ctx);
-
-      pir::BuildPhiContext<phi::KernelContext,
-                           const phi::TensorBase*,
-                           phi::TensorBase*,
-                           paddle::small_vector<const phi::TensorBase*>,
-                           paddle::small_vector<phi::TensorBase*>,
-                           true>((*it),
-                                 value_2_var_name,
-                                 scope_,
-                                 nullptr,
-                                 op_yaml_info_parser,
-                                 &kernel_ctx);
-      kernel_fn(&kernel_ctx);
-
-      auto out_value = (*it)->result(0);
-      out_name = value_2_var_name[out_value];
-    }
-  }
-
-  std::string out_name;
-
- private:
-  paddle::framework::Scope* scope_;
-};
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 1a919956a2c30..46bfb0ee005a4 100755
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -38,7 +38,7 @@ set(PYBIND_DEPS
     global_utils
     phi_utils
     phi
-    phi_kernel_adaptor
+    pir_adaptor
     pd_op_dialect
     program_translator
     pd_inplace_pass
diff --git a/test/cpp/new_executor/CMakeLists.txt b/test/cpp/new_executor/CMakeLists.txt
index af09520b12a54..9fbd4a82feb9a 100644
--- a/test/cpp/new_executor/CMakeLists.txt
+++ b/test/cpp/new_executor/CMakeLists.txt
@@ -6,7 +6,6 @@ if(NOT WIN32)
     SRCS
     standalone_executor_new_ir_test.cc
     DEPS
-    phi_kernel_adaptor
     pd_op_dialect
     pd_kernel_dialect
     pir
diff --git a/test/cpp/pir/core/CMakeLists.txt b/test/cpp/pir/core/CMakeLists.txt
index 355738d3baef5..0d65bc5b454c3 100644
--- a/test/cpp/pir/core/CMakeLists.txt
+++ b/test/cpp/pir/core/CMakeLists.txt
@@ -38,20 +38,6 @@ cc_test_old(
   phi
   gtest)
 
-cc_test_old(
-  ir_exe_test
-  SRCS
-  ir_exe_test.cc
-  DEPS
-  pd_op_to_kernel_pass
-  program_translator
-  pd_op_dialect
-  pd_kernel_dialect
-  phi_kernel_adaptor
-  pir
-  phi
-  gtest)
-
 cc_test_old(
   scalar_attribute_test
   SRCS
diff --git a/test/cpp/pir/core/ir_exe_test.cc b/test/cpp/pir/core/ir_exe_test.cc
deleted file mode 100644
index 6ce9d39172a20..0000000000000
--- a/test/cpp/pir/core/ir_exe_test.cc
+++ /dev/null
@@ -1,206 +0,0 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-
-#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
-#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
-#include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
-#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h"
-#include "paddle/fluid/pir/dialect/operator/utils/utils.h"
-#include "paddle/phi/core/meta_tensor.h"
-#include "paddle/phi/infermeta/binary.h"
-#include "paddle/phi/kernels/elementwise_add_kernel.h"
-#include "paddle/pir/core/builtin_attribute.h"
-#include "paddle/pir/core/builtin_dialect.h"
-#include "paddle/pir/core/builtin_op.h"
-#include "paddle/pir/core/ir_context.h"
-#include "paddle/pir/core/program.h"
-#include "paddle/pir/core/utils.h"
-
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/framework/variable_helper.h"
-
-#include "paddle/phi/common/place.h"
-#include "paddle/phi/core/kernel_context.h"
-#include "paddle/phi/core/kernel_factory.h"
-
-#include "paddle/fluid/platform/init.h"
-
-#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
-
-#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_adaptor.h"
-#include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/pir/core/attribute.h"
-
-PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(full_int_array, CPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(uniform, CPU, ALL_LAYOUT);
-PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
-
-bool simple_cmp(float a, float b) { return std::abs((a - b) / a) < 1e-5; }
-
-TEST(program_test, program) {
-  // Prepare ir env
-  pir::IrContext* ctx = pir::IrContext::Instance();
-  ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
-  pir::Program program(ctx);
-  pir::Builder builder(ctx, program.block());
-  pir::Block* block = program.block();
-
-  // Def: A = paddle::dialect::UniformOp(std::vector<int64_t> shape,
-  // phi::DataType dtype, float min, float max, int seed, phi::Place place)
-  pir::AttributeMap uniform1_attributes;
-  uniform1_attributes.insert({"shape",
-                              paddle::dialect::IntArrayAttribute::get(
-                                  pir::IrContext::Instance(),
-                                  phi::IntArray(std::vector<int64_t>{2, 2}))});
-  uniform1_attributes.insert(
-      {"dtype",
-       paddle::dialect::DataTypeAttribute::get(pir::IrContext::Instance(),
-                                               phi::DataType::FLOAT32)});
-  uniform1_attributes.insert(
-      {"min", pir::FloatAttribute::get(pir::IrContext::Instance(), 0.0)});
-  uniform1_attributes.insert(
-      {"max", pir::FloatAttribute::get(pir::IrContext::Instance(), 1.0)});
-  uniform1_attributes.insert(
-      {"seed", pir::Int32Attribute::get(pir::IrContext::Instance(), 2)});
-  uniform1_attributes.insert(
-      {"place",
-       paddle::dialect::PlaceAttribute::get(pir::IrContext::Instance(),
-                                            phi::CPUPlace())});
-  paddle::dialect::UniformOp uniform1 =
-      builder.Build<paddle::dialect::UniformOp>(uniform1_attributes);
-
-  EXPECT_EQ(uniform1->result(0).type().isa<paddle::dialect::DenseTensorType>(),
-            true);
-  EXPECT_EQ(block->size(), 4u);
-
-  pir::Attribute seed_attr = uniform1.attribute("seed");
-  pir::Int32Attribute seed_attr1 =
-      uniform1.attribute<pir::Int32Attribute>("seed");
-  EXPECT_EQ(seed_attr.dyn_cast<pir::Int32Attribute>().data(),
-            seed_attr1.data());
-
-  // Def: B = paddle::dialect::UniformOp(...)
-  paddle::dialect::UniformOp uniform2 =
-      builder.Build<paddle::dialect::UniformOp>(std::vector<int64_t>{2, 2},
-                                                phi::DataType::FLOAT32,
-                                                0.0,
-                                                1.0,
-                                                2,
-                                                phi::CPUPlace());
-  EXPECT_EQ(uniform2->result(0).type().isa<paddle::dialect::DenseTensorType>(),
-            true);
-  EXPECT_EQ(block->size(), 8u);
-
-  // Def: C = paddle::dialect::AddOp(pir::OpResult x_, pir::OpResult y_)
-  paddle::dialect::AddOp add = builder.Build<paddle::dialect::AddOp>(
-      uniform1->result(0), uniform2->result(0));
-  EXPECT_EQ(add->result(0).type().isa<paddle::dialect::DenseTensorType>(),
-            true);
-  EXPECT_EQ(block->size(), 9u);
-
-  // Execute program
-  auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(&program);
-  paddle::framework::Scope scope;
-  PhiKernelAdaptor phi_kernel_adaptor(&scope);
-  phi_kernel_adaptor.run_kernel_prog(kernel_program.get());
-
-  auto out_tensor =
-      scope.Var(phi_kernel_adaptor.out_name)->Get<phi::DenseTensor>();
-
-  bool res0 = simple_cmp(out_tensor.data<float>()[0], 1.80721);
-  bool res1 = simple_cmp(out_tensor.data<float>()[1], 1.70047);
-  bool res2 = simple_cmp(out_tensor.data<float>()[2], 1.56764);
-  bool res3 = simple_cmp(out_tensor.data<float>()[3], 1.85063);
-
-  EXPECT_EQ(res0, true);
-  EXPECT_EQ(res1, true);
-  EXPECT_EQ(res2, true);
-  EXPECT_EQ(res3, true);
-}
-
-TEST(program_test, mutable_attribute) {
-  // Prepare ir env
-  pir::IrContext* ctx = pir::IrContext::Instance();
-  ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
-  pir::Program program(ctx);
-  pir::Builder builder = pir::Builder(ctx, program.block());
-  pir::Block* block = program.block();
-
-  // Def FullOp
-  paddle::dialect::FullIntArrayOp full_shape_op =
-      builder.Build<paddle::dialect::FullIntArrayOp>(
-          std::vector<int64_t>{2, 2}, phi::DataType::INT64, phi::CPUPlace());
-  pir::OpResult shape_ = full_shape_op->result(0);
-  // Generate scalar mutable attribute: min
-  paddle::dialect::FullOp full_min_op = builder.Build<paddle::dialect::FullOp>(
-      std::vector<int64_t>{1}, 0.0, phi::DataType::FLOAT32, phi::CPUPlace());
-  pir::OpResult min_ = full_min_op->result(0);
-  // Generate scalar mutable attribute: max
-  paddle::dialect::FullOp full_max_op = builder.Build<paddle::dialect::FullOp>(
-      std::vector<int64_t>{1}, 1.0, phi::DataType::FLOAT32, phi::CPUPlace());
-  pir::OpResult max_ = full_max_op->result(0);
-
-  // Def: static void Build(pir::Builder &builder, pir::OperationArgument
-  // &argument, pir::OpResult shape_, pir::OpResult min_, pir::OpResult max_,
-  // phi::DataType dtype, int seed, phi::Place place={});
-  paddle::dialect::UniformOp uniform1 =
-      builder.Build<paddle::dialect::UniformOp>(
-          shape_, min_, max_, phi::DataType::FLOAT32, 2, phi::CPUPlace());
-  EXPECT_EQ(uniform1->result(0).type().isa<paddle::dialect::DenseTensorType>(),
-            true);
-  EXPECT_EQ(block->size(), 4u);
-
-  // Def: B = paddle::dialect::UniformOp(...)
-  paddle::dialect::UniformOp uniform2 =
-      builder.Build<paddle::dialect::UniformOp>(
-          shape_, min_, max_, phi::DataType::FLOAT32, 2, phi::CPUPlace());
-  EXPECT_EQ(uniform2->result(0).type().isa<paddle::dialect::DenseTensorType>(),
-            true);
-  EXPECT_EQ(block->size(), 5u);
-
-  // Def: C = paddle::dialect::AddOp(pir::OpResult x_, pir::OpResult y_)
-  paddle::dialect::AddOp add = builder.Build<paddle::dialect::AddOp>(
-      uniform1->result(0), uniform2->result(0));
-  EXPECT_EQ(add->result(0).type().isa<paddle::dialect::DenseTensorType>(),
-            true);
-  EXPECT_EQ(block->size(), 6u);
-
-  // Execute program
-  auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(&program);
-  paddle::framework::Scope scope;
-  PhiKernelAdaptor phi_kernel_adaptor(&scope);
-  phi_kernel_adaptor.run_kernel_prog(kernel_program.get());
-
-  auto out_tensor =
-      scope.Var(phi_kernel_adaptor.out_name)->Get<phi::DenseTensor>();
-
-  bool res0 = simple_cmp(out_tensor.data<float>()[0], 1.80721);
-  bool res1 = simple_cmp(out_tensor.data<float>()[1], 1.70047);
-  bool res2 = simple_cmp(out_tensor.data<float>()[2], 1.56764);
-  bool res3 = simple_cmp(out_tensor.data<float>()[3], 1.85063);
-  std::cerr << out_tensor.data<float>()[0] << "\t"
-            << out_tensor.data<float>()[1] << "\t"
-            << out_tensor.data<float>()[2] << "\t"
-            << out_tensor.data<float>()[3] << std::endl;
-  EXPECT_EQ(res0, true);
-  EXPECT_EQ(res1, true);
-  EXPECT_EQ(res2, true);
-  EXPECT_EQ(res3, true);
-}
diff --git a/test/cpp/pir/kernel_dialect/CMakeLists.txt b/test/cpp/pir/kernel_dialect/CMakeLists.txt
index ea8477dbe3970..4f68299ee0771 100644
--- a/test/cpp/pir/kernel_dialect/CMakeLists.txt
+++ b/test/cpp/pir/kernel_dialect/CMakeLists.txt
@@ -6,8 +6,8 @@ cc_test_old(
   pd_op_to_kernel_pass
   program_translator
   pd_kernel_dialect
-  phi_kernel_adaptor
   pd_trait
   pir
   phi
+  standalone_executor
   gtest)
diff --git a/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc b/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc
index 1c0eaebe0a909..501cfb0897c1a 100644
--- a/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc
+++ b/test/cpp/pir/kernel_dialect/ir_kernel_dialect_pass_test.cc
@@ -15,10 +15,12 @@
 #include <gtest/gtest.h>
 #include <sstream>
 
+#include "build/paddle/fluid/pir/dialect/operator/ir/pd_op.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h"
 #include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h"
 #include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h"
 #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
@@ -26,7 +28,6 @@
 #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/fluid/pir/dialect/operator/utils/utils.h"
-#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_adaptor.h"
 #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/phi/common/data_type.h"
@@ -53,61 +54,6 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 
 bool simple_cmp(float a, float b) { return std::abs((a - b) / a) < 1e-5; }
 
-TEST(program_test, program) {
-  // (1) Init environment.
-  pir::IrContext* ctx = pir::IrContext::Instance();
-  pir::Program program((ctx));
-
-  ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
-
-  pir::Builder builder = pir::Builder(ctx, program.block());
-
-  paddle::dialect::FullOp op1 = builder.Build<paddle::dialect::FullOp>(
-      std::vector<int64_t>{2, 2}, 1.0, phi::DataType::FLOAT32, phi::CPUPlace());
-
-  paddle::dialect::FullOp op2 = builder.Build<paddle::dialect::FullOp>(
-      std::vector<int64_t>{2, 2}, 1.0, phi::DataType::FLOAT32, phi::CPUPlace());
-
-  builder.Build<paddle::dialect::AddOp>(op1->result(0), op2->result(0));
-
-  auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(&program);
-
-  paddle::framework::Scope scope;
-  PhiKernelAdaptor phi_kernel_adaptor(&scope);
-  phi_kernel_adaptor.run_kernel_prog(kernel_program.get());
-
-  auto out_tensor =
-      scope.Var(phi_kernel_adaptor.out_name)->Get<phi::DenseTensor>();
-
-  bool res0 = simple_cmp(out_tensor.data<float>()[0], 2.0);
-  bool res1 = simple_cmp(out_tensor.data<float>()[1], 2.0);
-  bool res2 = simple_cmp(out_tensor.data<float>()[2], 2.0);
-  bool res3 = simple_cmp(out_tensor.data<float>()[3], 2.0);
-
-  EXPECT_EQ(res0, true);
-  EXPECT_EQ(res1, true);
-  EXPECT_EQ(res2, true);
-  EXPECT_EQ(res3, true);
-
-  EXPECT_EQ(kernel_program->block()->size(), 3u);
-  EXPECT_EQ(kernel_program->block()
-                ->front()
-                ->dyn_cast<paddle::dialect::PhiKernelOp>()
-                .op_name(),
-            "pd_op.full");
-  EXPECT_EQ(kernel_program->block()
-                ->front()
-                ->dyn_cast<paddle::dialect::PhiKernelOp>()
-                .kernel_name(),
-            "full");
-  EXPECT_EQ(kernel_program->block()
-                ->front()
-                ->dyn_cast<paddle::dialect::PhiKernelOp>()
-                .kernel_key()
-                .dtype(),
-            phi::DataType::FLOAT32);
-}
-
 TEST(dialect_attr, attr) {
   // (1) Init environment.
   pir::IrContext* ctx = pir::IrContext::Instance();
diff --git a/test/cpp/prim/CMakeLists.txt b/test/cpp/prim/CMakeLists.txt
index efe5f0a635597..2c0b451437dad 100644
--- a/test/cpp/prim/CMakeLists.txt
+++ b/test/cpp/prim/CMakeLists.txt
@@ -68,5 +68,5 @@ if(NOT WIN32)
   cc_test(
     test_vjp_new_ir
     SRCS test_vjp.cc
-    DEPS phi_kernel_adaptor pd_op_dialect pir)
+    DEPS pir_adaptor pd_op_dialect pir)
 endif()

From 0a25b1ac5933338449cdca1cfd1502f17b7afa48 Mon Sep 17 00:00:00 2001
From: Wang Xin <xinwang614@gmail.com>
Date: Mon, 25 Sep 2023 16:21:33 +0800
Subject: [PATCH 105/115] [NewIR] No.9 Migrate rms_norm into pir (#57156)

* [NewIR] No.9 Migrate rms_norm into pir

* update test

* update

* update

* fix bug
---
 .../incubate/nn/functional/fused_rms_norm.py  |  4 +-
 test/legacy_test/test_rms_norm_op.py          | 43 +++++++++++++++++++
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/python/paddle/incubate/nn/functional/fused_rms_norm.py b/python/paddle/incubate/nn/functional/fused_rms_norm.py
index 54c6e1dfba021..3995cd4a4087d 100644
--- a/python/paddle/incubate/nn/functional/fused_rms_norm.py
+++ b/python/paddle/incubate/nn/functional/fused_rms_norm.py
@@ -15,7 +15,7 @@
 
 import paddle
 from paddle import _C_ops
-from paddle.framework import LayerHelper, in_dynamic_mode
+from paddle.framework import LayerHelper, in_dynamic_or_pir_mode
 
 
 def fused_rms_norm(
@@ -63,7 +63,7 @@ def fused_rms_norm(
             epsilon = 1e-6
             paddle_rmsnorm = paddle.incubate.nn.functional.fused_rms_norm(paddle_x, paddle_weight, paddle_bias, epsilon, 1)
     """
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.rms_norm(
             x,
             bias,
diff --git a/test/legacy_test/test_rms_norm_op.py b/test/legacy_test/test_rms_norm_op.py
index cd9fa001e8362..79e20e906d92c 100644
--- a/test/legacy_test/test_rms_norm_op.py
+++ b/test/legacy_test/test_rms_norm_op.py
@@ -342,6 +342,49 @@ def check_rmsnorm(self, x_np, gamma_np, beta_np, dtype):
             )
         return out_s[0], paddle_naive_rmsnorm_out
 
+    def test_rmsnorm_pir(self):
+        paddle.disable_static()
+        x = paddle.to_tensor(self.x_np.astype("float32"))
+        gamma = paddle.to_tensor(self.norm_weight_np.astype("float32"))
+        beta = paddle.to_tensor(self.norm_bias_np.astype("float32"))
+
+        paddle_naive_rmsnorm_out = naive_rms_norm(x, gamma, beta, self.epsilon)
+        paddle.enable_static()
+
+        with paddle.pir_utils.IrGuard():
+            x_static = paddle.static.data(
+                name="x_static", shape=[self.batch, self.cols], dtype="float32"
+            )
+            gamma_static = paddle.static.data(
+                name="gamma_static", shape=[self.cols], dtype="float32"
+            )
+            beta_static = paddle.static.data(
+                name="beta_static", shape=[self.cols], dtype="float32"
+            )
+            out, _ = paddle.incubate.nn.functional.fused_rms_norm(
+                x_static,
+                gamma_static,
+                beta_static,
+                self.epsilon,
+                begin_norm_axis=1,
+            )
+            exe = base.Executor(self.place)
+            out_s = exe.run(
+                feed={
+                    "x_static": self.x_np.astype("float32"),
+                    "gamma_static": self.norm_weight_np.astype("float32"),
+                    "beta_static": self.norm_bias_np.astype("float32"),
+                },
+                fetch_list=[out],
+            )
+
+        np.testing.assert_allclose(
+            out_s[0],
+            paddle_naive_rmsnorm_out.numpy(),
+            rtol=1e-3,
+            atol=1e-3,
+        )
+
     def check_rmsnorm_int8(self, x_np, gamma_np, beta_np, dtype):
         paddle.disable_static()
         x = paddle.to_tensor(x_np.astype(dtype))

From 7bc7fad073bde7f1bf4146ad3622c4a44ffa5251 Mon Sep 17 00:00:00 2001
From: Lu Qi <61354321+MarioLulab@users.noreply.github.com>
Date: Mon, 25 Sep 2023 16:32:21 +0800
Subject: [PATCH 106/115] [GLCC]Part-4: fix testcases for dy2st PyLayer
 (#57633)

* add testcase load_inference_model

* polish code for last pr

* delete annots

* fix docs of static_pylayer
---
 python/paddle/static/nn/static_pylayer.py     | 12 +--
 test/dygraph_to_static/test_pylayer.py        |  2 +-
 .../test_save_inference_model.py              | 91 +++++++++++++++++--
 test/legacy_test/test_prune.py                |  8 +-
 4 files changed, 96 insertions(+), 17 deletions(-)

diff --git a/python/paddle/static/nn/static_pylayer.py b/python/paddle/static/nn/static_pylayer.py
index 3dcf35e50e54b..1fd0332faa1d3 100644
--- a/python/paddle/static/nn/static_pylayer.py
+++ b/python/paddle/static/nn/static_pylayer.py
@@ -242,14 +242,14 @@ def static_pylayer(forward_fn, inputs, backward_fn=None, name=None):
     the logic of ``forward_fn`` and ``backward_fn``, with the operator ``pylayer``
     holding information about the two blocks.
 
-    ``forward_fn`` and ``backward_fn`` should return a nest structure of tensors.
-    A nest structure of tensors in PaddlePaddle is tensor(s), or tuple of tensors, or
-    list of tensors.
+    ``forward_fn`` and ``backward_fn`` should return a nest structure of Variables.
+    A nest structure of Variables in PaddlePaddle is Variable(s), or tuple of Variables, or
+    list of Variables.
 
     Note:
-        1. If ``backward_fn`` is not None, user needs to keep the number of inputs to ``forward_fn`` the same as the
-        number of outputs to ``backward_fn``, and the number of outputs to ``forward_fn``
-        the same as the number of inputs to ``backward_fn``.
+        1. If ``backward_fn`` is not None, user needs to keep the number of `Variable` inputs to ``forward_fn`` the same as the
+        number of `Variable` outputs to ``backward_fn``, and the number of `Variable` outputs to ``forward_fn``
+        the same as the number of `Variable` inputs to ``backward_fn``.
 
         2. If ``backward_fn`` is None, ``stop_gradient`` attr of all Variable in ``inputs`` is expected to be True.
         Otherwise it might get unexpected results in backward propagation.
diff --git a/test/dygraph_to_static/test_pylayer.py b/test/dygraph_to_static/test_pylayer.py
index fca9534c2cd30..c36bc1a14d5d1 100644
--- a/test/dygraph_to_static/test_pylayer.py
+++ b/test/dygraph_to_static/test_pylayer.py
@@ -243,7 +243,7 @@ def __init__(self, in_size, out_size):
     def forward(self, x1, x2):
         y1 = self.linear1(x1)
         y2 = self.linear1(x2)
-        out = cus_tanh_2.apply(y1, func1=paddle.tanh)
+        out = cus_tanh_2.apply(y1, paddle.tanh)
         out = out + y2
         out = paddle.mean(out)
         return out
diff --git a/test/dygraph_to_static/test_save_inference_model.py b/test/dygraph_to_static/test_save_inference_model.py
index 5561c4072e2c4..c6a01d38e7d86 100644
--- a/test/dygraph_to_static/test_save_inference_model.py
+++ b/test/dygraph_to_static/test_save_inference_model.py
@@ -21,6 +21,7 @@
 
 import paddle
 from paddle import base
+from paddle.autograd import PyLayer
 from paddle.jit.api import to_static
 from paddle.jit.dy2static.partial_program import partial_program_from
 from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
@@ -45,6 +46,33 @@ def forward(self, x):
         return out, y
 
 
+class cus_tanh(PyLayer):
+    @staticmethod
+    def forward(ctx, x):
+        y = paddle.tanh(x)
+        ctx.save_for_backward(y)
+        return y
+
+    @staticmethod
+    def backward(ctx, dy):
+        (y,) = ctx.saved_tensor()
+        grad = dy * (1 - paddle.square(y))
+        return grad
+
+
+class SimplePyLayerNet(paddle.nn.Layer):
+    def __init__(self, fc_size):
+        super().__init__()
+        self._linear = paddle.nn.Linear(fc_size, fc_size)
+
+    @to_static
+    def forward(self, x):
+        y = self._linear(x)
+        out = cus_tanh.apply(y)
+        loss = paddle.mean(out)
+        return loss, out
+
+
 class TestDyToStaticSaveInferenceModel(unittest.TestCase):
     def setUp(self):
         self.temp_dir = tempfile.TemporaryDirectory()
@@ -94,8 +122,52 @@ def test_save_inference_model(self):
             layer, [x_data], dygraph_out.numpy(), feed=[x]
         )
 
+    @ast_only_test
+    def test_save_pylayer_model(self):
+        fc_size = 20
+        x_data = np.random.random((fc_size, fc_size)).astype('float32')
+        paddle.base.framework._set_expected_place(place)
+
+        base.default_startup_program().random_seed = SEED
+        base.default_main_program().random_seed = SEED
+        paddle.disable_static()
+        x = base.dygraph.to_variable(x_data)
+        layer = SimplePyLayerNet(fc_size)
+        adam = paddle.optimizer.SGD(
+            learning_rate=0.1, parameters=layer.parameters()
+        )
+
+        for i in range(5):
+            loss, pred = layer(x)
+            loss.backward()
+            adam.minimize(loss)
+            layer.clear_gradients()
+
+        infer_model_prefix = os.path.join(
+            self.temp_dir.name, "test_dy2stat_inference_in_guard/model_pylayer"
+        )
+        paddle.jit.save(
+            layer=layer,
+            path=infer_model_prefix,
+            input_spec=[x],
+            output_spec=[pred],
+        )
+        # Check the correctness of the inference
+        loss_out, _ = layer(x)
+
+        loss_out_numpy = float(loss_out)
+        self.check_save_inference_model(
+            layer, [x_data], loss_out_numpy, enable_new_ir=False
+        )
+        self.check_save_inference_model(
+            layer, [x_data], loss_out_numpy, fetch=[loss], enable_new_ir=False
+        )
+        self.check_save_inference_model(
+            layer, [x_data], loss_out_numpy, feed=[x], enable_new_ir=False
+        )
+
     def check_save_inference_model(
-        self, model, inputs, gt_out, feed=None, fetch=None
+        self, model, inputs, gt_out, feed=None, fetch=None, enable_new_ir=True
     ):
         expected_persistable_vars = {p.name for p in model.parameters()}
 
@@ -113,13 +185,20 @@ def check_save_inference_model(
             input_spec=feed if feed else None,
             output_spec=fetch if fetch else None,
         )
-        # Check the correctness of the inference
-        infer_out = self.load_and_run_inference(
-            infer_model_dir, model_filename, params_filename, inputs
-        )
+        if enable_new_ir:
+            wrapped_load_and_run_inference = test_and_compare_with_new_ir(True)(
+                self.load_and_run_inference
+            )
+            infer_out = wrapped_load_and_run_inference(
+                infer_model_dir, model_filename, params_filename, inputs
+            )
+        else:
+            infer_out = self.load_and_run_inference(
+                infer_model_dir, model_filename, params_filename, inputs
+            )
+
         np.testing.assert_allclose(gt_out, infer_out, rtol=1e-05)
 
-    @test_and_compare_with_new_ir(True)
     def load_and_run_inference(
         self, model_path, model_filename, params_filename, inputs
     ):
diff --git a/test/legacy_test/test_prune.py b/test/legacy_test/test_prune.py
index 91314d3c86b80..f82a4d4331b09 100644
--- a/test/legacy_test/test_prune.py
+++ b/test/legacy_test/test_prune.py
@@ -211,7 +211,7 @@ def _mock_guard(mock):
     base.Executor._prune_program = original
 
 
-def net1():
+def create_net1():
     x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32')
     x.desc.set_need_check_feed(False)
     label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
@@ -238,7 +238,7 @@ def net1():
     return x, y, label, loss1, loss2, w_param_attrs
 
 
-def net2():
+def create_net2():
     x1 = paddle.static.data(name='x1', shape=[-1, 2], dtype='float32')
     x1.desc.set_need_check_feed(False)
     x2 = paddle.static.data(name='x2', shape=[-1, 2], dtype='float32')
@@ -286,8 +286,8 @@ def net2():
 
 class TestExecutorRunAutoPrune(unittest.TestCase):
     def setUp(self):
-        self.net1 = net1
-        self.net2 = net2
+        self.net1 = create_net1
+        self.net2 = create_net2
 
     def test_not_prune(self):
         """

From 76fb47cde8dcabb101c98821a9dfebae983997e3 Mon Sep 17 00:00:00 2001
From: danleifeng <52735331+danleifeng@users.noreply.github.com>
Date: Mon, 25 Sep 2023 16:38:15 +0800
Subject: [PATCH 107/115] fix brpc flag declare; test=develop (#57688)

---
 .../distributed/ps/service/simple_rpc/baidu_rpc_server.cc     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/distributed/ps/service/simple_rpc/baidu_rpc_server.cc b/paddle/fluid/distributed/ps/service/simple_rpc/baidu_rpc_server.cc
index a10e78fe94162..f3e501dd00ce1 100644
--- a/paddle/fluid/distributed/ps/service/simple_rpc/baidu_rpc_server.cc
+++ b/paddle/fluid/distributed/ps/service/simple_rpc/baidu_rpc_server.cc
@@ -19,8 +19,8 @@
 #include "paddle/phi/core/enforce.h"
 
 namespace brpc {
-PD_DECLARE_uint64(max_body_size);
-PD_DECLARE_int64(socket_max_unwritten_bytes);
+DECLARE_uint64(max_body_size);
+DECLARE_int64(socket_max_unwritten_bytes);
 }  // namespace brpc
 
 namespace paddle {

From 27476e78dc7fa285c89c90064757bd3748cec031 Mon Sep 17 00:00:00 2001
From: yuchen202 <103028470+yuchen202@users.noreply.github.com>
Date: Mon, 25 Sep 2023 18:05:09 +0800
Subject: [PATCH 108/115] [Docathon] Fix add API Docs (#57434)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 0918API

* Apply suggestions from code review

Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com>

* 0923修改

---------

Co-authored-by: zachary sun <70642955+sunzhongkai588@users.noreply.github.com>
---
 python/paddle/nn/layer/conv.py        | 5 ++++-
 python/paddle/static/input.py         | 1 +
 python/paddle/tensor/manipulation.py  | 6 +-----
 python/paddle/tensor/math.py          | 7 +------
 python/paddle/tensor/random.py        | 2 +-
 python/paddle/vision/models/resnet.py | 2 +-
 6 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py
index e0ab183f51bcb..5b6d295458cfa 100644
--- a/python/paddle/nn/layer/conv.py
+++ b/python/paddle/nn/layer/conv.py
@@ -759,7 +759,10 @@ class Conv2DTranspose(_ConvNd):
     * :math:`\ast`: Convolution operation.
     * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M].
     * :math:`\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+    * :math:`Out`: Output value, a 4-D ``Tensor`` with NCHW or NHWC format, the shape of :math:`Out` and :math:`X` may be different.
+
+    Note:
+     If output_size is None, :math:`H_{out}` = :math:`H^\prime_{out}` , :math:`W_{out}` = :math:`W^\prime_{out}`. Otherwise, the specified output_size_height (the height of the output feature layer) :math:`H_{out}` should be between :math:`H^\prime_{out}` and :math:`H^\prime_{out} + strides[0]` (excluding :math:`H^\prime_{out} + strides[0]` ).
 
     Parameters:
         in_channels(int): The number of channels in the input image.
diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py
index 412defb7d6ea2..518fc8d6519cd 100644
--- a/python/paddle/static/input.py
+++ b/python/paddle/static/input.py
@@ -180,6 +180,7 @@ class InputSpec:
             uint8. Default: float32.
         name (str): The name/alias of the variable, see :ref:`api_guide_Name`
             for more details.
+        stop_gradient (bool, optional): A boolean that mentions whether gradient should flow. Default is False, means don't stop calculate gradients.
 
     Examples:
         .. code-block:: python
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 46849cbde953d..db54587658137 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -2896,11 +2896,7 @@ def scatter(x, index, updates, overwrite=True, name=None):
         x (Tensor): The input N-D Tensor with ndim>=1. Data type can be float32, float64.
         index (Tensor): The index is a 1-D or 0-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length.
         updates (Tensor): Update input with updates parameter based on index. When the index is a 1-D tensor, the updates shape should be the same as input, and dim value with dim > 1 should be the same as input. When the index is a 0-D tensor, the updates should be a (N-1)-D tensor, the ith dim of the updates should be queal with the (i+1)th dim of the input.
-        overwrite (bool, optional): The mode that updating the output when there are same indices.
-
-            If True, use the overwrite mode to update the output of the same index,
-            if False, use the accumulate mode to update the output of the same index. Default value is True.
-
+        overwrite (bool, optional): The mode that updating the output when there are same indices.If True, use the overwrite mode to update the output of the same index,if False, use the accumulate mode to update the output of the same index. Default value is True.
         name(str, optional): The default value is None. Normally there is no need for user to set this property.  For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index fd11afc85935c..9a4bb3cc9a4d2 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -643,16 +643,11 @@ def add(x, y, name=None):
     $X$ the tensor of any dimension.
     $Y$ the tensor whose dimensions must be less than or equal to the dimensions of $X$.
 
-    There are two cases for this operator:
+    This operator is used in the following cases:
 
     1. The shape of $Y$ is the same with $X$.
     2. The shape of $Y$ is a continuous subsequence of $X$.
 
-    For case 2:
-
-    1. Broadcast $Y$ to match the shape of $X$, where axis is the start dimension index for broadcasting $Y$ onto $X$.
-    2. If $axis$ is -1 (default), $axis$=rank($X$)-rank($Y$).
-    3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of subsequence, such as shape($Y$) = (2, 1) => (2).
 
         For example:
 
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index 9333458844091..e03ab557f4253 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -1281,7 +1281,7 @@ def rand(shape, dtype=None, name=None):
             If ``shape`` is an Tensor, it should be an 1-D Tensor which represents a list.
         dtype (str|np.dtype, optional): The data type of the output Tensor.
             Supported data types: float32, float64.
-            Default is None, use global default dtype (see ``get_default_dtype``
+            Default is None, use global default dtype (see :ref:`get_default_dtype`
             for details).
         name (str, optional): The default value is None. Normally there is no
             need for user to set this property. For more information, please
diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py
index d91cd66f04f5d..cb270154dd849 100644
--- a/python/paddle/vision/models/resnet.py
+++ b/python/paddle/vision/models/resnet.py
@@ -438,7 +438,7 @@ def resnet50(pretrained=False, **kwargs):
     Args:
         pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
                             on ImageNet. Default: False.
-        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_ResNet>`.
+        **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet <api_paddle_vision_models_ResNet>`.
 
     Returns:
         :ref:`api_paddle_nn_Layer`. An instance of ResNet 50-layer model.

From 589f0f2f4a9ee865cb985ac6aef3d54b18690c12 Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Mon, 25 Sep 2023 19:02:58 +0800
Subject: [PATCH 109/115] Add paddle_build.sh Check;test=document_fix (#57702)

---
 paddle/scripts/paddle_build.sh     | 2 +-
 tools/check_file_diff_approvals.sh | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index c72f20a24ff53..0563fb7beacfa 100644
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -2740,7 +2740,7 @@ function enable_unused_var_check() {
     export FLAGS_enable_unused_var_check=1
 }
 function check_coverage_added_ut() {
-    # NOTE(risemeup1):The steps of checkinge added test can be placed on the cpu machine to save gpu resources
+    # NOTE(risemeup1):The step of checking added test can be placed on the cpu machine to save gpu resources
     bash $PADDLE_ROOT/tools/check_added_ut.sh
 }
 function gen_doc_lib() {
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index d309a50d921ed..37d75207cfb84 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -89,6 +89,7 @@ API_FILES=("CMakeLists.txt"
            "python/paddle/incubate/autograd/primitives.py"
            "python/paddle/autograd/ir_backward.py"
            "python/paddle/autograd/backward_utils.py"
+           "paddle/scripts/paddle_build.sh"
            )
 
 approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
@@ -223,6 +224,9 @@ for API_FILE in ${API_FILES[*]}; do
       elif [ "${API_FILE}" == "python/paddle/autograd/ir_backward.py" ] || [ "${API_FILE}" == "python/paddle/autograd/backward_utils.py" ]; then
             echo_line="You must be approved by Aurelius84(zhangliujie) or cxxly(chenxiaoxu) or xiaoguoguo626807(wangruting) or changeyoung98(chenzhiyang) for python/paddle/autograd/ir_backward.py or python/paddle/autograd/backward_utils.py changes.\n"
             check_approval 1 Aurelius84 cxxly xiaoguoguo626807 changeyoung98
+      elif [ "${API_FILE}" == "paddle/scripts/paddle_build.sh" ]; then 
+	      echo_line="You must have one RD (tianshuo78520a or risemeup1 or zhangbo9674 or XieYunshen) for ${API_FILE} changes, which manages the Paddle CI on Linux.\n " 
+            check_approval 1 tianshuo78520a risemeup1 zhangbo9674 XieYunshen 
       else
           echo_line="You must have one RD (XiaoguangHu01,chenwhql,zhiqiu,Xreki,luotao1,qili93,Aurelius84) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
           check_approval 1 XiaoguangHu01 chenwhql zhiqiu Xreki luotao1 qili93 Aurelius84

From 9c3bffb936e7b66e229aef5b018fe83c3c1851ba Mon Sep 17 00:00:00 2001
From: caozhou <48191911+Caozhou1995@users.noreply.github.com>
Date: Mon, 25 Sep 2023 20:20:51 +0800
Subject: [PATCH 110/115] [Auto Parallel]Update AutoTuner (#56939)

* add vpp search

* support cmd json or yaml

* add auto tuner log

* add OOM prune

* support sft and lora best cfg

* support sp

* fix sft/lora cfg

* fix json/yaml bug and update multi nodes status

* add pp and mp prune flag

* update log and csv path

* merge dev

* speed prune

* add search stage and run best stage cmd

* fix recompute_granularity prune bug

* fix get best cfg in sft and lora mode bug

* fix single card bug

* update read metric
---
 python/paddle/distributed/auto_tuner/prune.py |  86 ++-
 .../paddle/distributed/auto_tuner/recorder.py |  26 +-
 python/paddle/distributed/auto_tuner/utils.py | 533 +++++++++++++-----
 python/paddle/distributed/launch/main.py      | 120 +++-
 test/auto_parallel/test_auto_tuner.py         |   2 +-
 test/auto_parallel/test_auto_tuner_compare.py |   2 +-
 6 files changed, 598 insertions(+), 171 deletions(-)

diff --git a/python/paddle/distributed/auto_tuner/prune.py b/python/paddle/distributed/auto_tuner/prune.py
index 66f16ff67fb9d..abae3f606fee1 100644
--- a/python/paddle/distributed/auto_tuner/prune.py
+++ b/python/paddle/distributed/auto_tuner/prune.py
@@ -55,6 +55,11 @@ def prune_by_mp(tuner_cfg, cur_cfg, history_cfgs=None):
     mp_degree = cur_cfg.get("mp_degree", None)
     hidden_size = tuner_cfg["model_cfg"].get("hidden_size", None)
     vocab_size = tuner_cfg["model_cfg"].get("vocab_size", None)
+    num_attention_heads = tuner_cfg["model_cfg"].get(
+        "num_attention_heads", None
+    )
+    seq_length = tuner_cfg["model_cfg"].get("seq_length", None)
+    use_sequence_paralel = tuner_cfg.get("use_sequence_paralel", False)
 
     if mp_degree is None:
         return False
@@ -65,6 +70,12 @@ def prune_by_mp(tuner_cfg, cur_cfg, history_cfgs=None):
     if vocab_size and vocab_size % mp_degree != 0:
         return True
 
+    if num_attention_heads and num_attention_heads % mp_degree != 0:
+        return True
+
+    if seq_length and seq_length % mp_degree != 0 and use_sequence_paralel:
+        return True
+
     mp_degree_candidates = tuner_cfg.get("mp_degree", None)
 
     if mp_degree_candidates == "auto":
@@ -112,6 +123,50 @@ def prune_by_pp(tuner_cfg, cur_cfg, history_cfgs=None):
     return False
 
 
+@register_prune
+def prune_by_vpp(tuner_cfg, cur_cfg, history_cfgs=None):
+    """
+    Prune by vpp (virtual pipeline parallelism), the rules are:
+    1. VPP degree should be evenly divided by number of layers.
+    2. VPP degree should be in the candidates of user defined.
+    """
+    pp_degree = cur_cfg.get("pp_degree", None)
+    vpp_degree = cur_cfg.get("vpp_degree", None)
+    num_layers = tuner_cfg["model_cfg"].get("num_layers", None)
+
+    if pp_degree is None:
+        return False
+
+    if vpp_degree is None:
+        return False
+
+    if num_layers:
+        if num_layers % (pp_degree * vpp_degree) != 0:
+            return True
+        if pp_degree == 1 and vpp_degree != 1:
+            return True
+        if pp_degree <= 2 and vpp_degree != 1:
+            return True
+
+    vpp_degree_candidates = tuner_cfg.get("vpp_degree", None)
+    if vpp_degree_candidates == "auto":
+        vpp_degree_candidates = tuner_cfg["candidates"]["vpp_degree"]
+    if vpp_degree_candidates:
+        if vpp_degree not in vpp_degree_candidates:
+            return True
+
+    cfgs = same_cfgs_beside("vpp_degree", cur_cfg, history_cfgs)
+    if cfgs:
+        for cfg in cfgs:
+            # memory prune
+            if (
+                cfg["vpp_degree"] > vpp_degree
+                and cfg.get("max_mem_usage") == "OOM"
+            ):
+                return True
+    return False
+
+
 @register_prune
 def prune_by_mbs(tuner_cfg, cur_cfg, history_cfgs=None):
     """
@@ -144,6 +199,13 @@ def prune_by_mbs(tuner_cfg, cur_cfg, history_cfgs=None):
     if local_batch_size:
         if local_batch_size % micro_batch_size != 0:
             return True
+        acc_steps = local_batch_size // micro_batch_size
+        vpp_degree = cur_cfg.get("vpp_degree", None)
+        if vpp_degree is not None and vpp_degree > 1:
+            pp_degree = cur_cfg.get("pp_degree", None)
+            if pp_degree is not None:
+                if acc_steps % pp_degree != 0:
+                    return True
 
     if mbs_candidates:
         if micro_batch_size not in mbs_candidates:
@@ -158,6 +220,13 @@ def prune_by_mbs(tuner_cfg, cur_cfg, history_cfgs=None):
             ):
                 return True
 
+            # memory prune
+            if (
+                cfg["micro_batch_size"] < micro_batch_size
+                and cfg.get("max_mem_usage") == "OOM"
+            ):
+                return True
+
     return False
 
 
@@ -208,6 +277,13 @@ def prune_by_sharding(tuner_cfg, cur_cfg, history_cfgs):
             ):
                 return True
 
+            # memory prune
+            if (
+                cfg["sharding_stage"] > sharding_stage
+                and cfg.get("max_mem_usage") == "OOM"
+            ):
+                return True
+
     if sharding_degree == 1:
         cfgs = same_cfgs_beside("sharding_stage", cur_cfg, history_cfgs)
         if cfgs:
@@ -245,9 +321,6 @@ def prune_by_recompute(tuner_cfg, cur_cfg, history_cfgs):
         if recompute_granularity not in recompute_granularity_candidates:
             return True
 
-    if not use_recompute and recompute_granularity:
-        return True
-
     cfgs = same_cfgs_beside("use_recompute", cur_cfg, history_cfgs)
     if cfgs:
         for cfg in cfgs:
@@ -258,6 +331,13 @@ def prune_by_recompute(tuner_cfg, cur_cfg, history_cfgs):
             ):
                 return True
 
+            if (
+                cfg["use_recompute"]
+                and not use_recompute
+                and cfg.get("max_mem_usage") == "OOM"
+            ):
+                return True
+
     if not use_recompute:
         cfgs = same_cfgs_beside("recompute_granularity", cur_cfg, history_cfgs)
         if cfgs:
diff --git a/python/paddle/distributed/auto_tuner/recorder.py b/python/paddle/distributed/auto_tuner/recorder.py
index 296f29ec25ffb..71c1b08ff3ecd 100644
--- a/python/paddle/distributed/auto_tuner/recorder.py
+++ b/python/paddle/distributed/auto_tuner/recorder.py
@@ -47,10 +47,34 @@ def sort_metric(self, direction, metric_name) -> None:
                 reverse=False,
             )
 
-    def get_best(self, metric, direction) -> Tuple[dict, bool]:
+    def get_best(self, metric, direction, mode=None) -> Tuple[dict, bool]:
         self.sort_metric(direction=direction, metric_name=metric)
         if len(self.history) == 0:
             return (self.history[0], True)
+        if mode == "SFT" or mode == "LoRA":
+            best_cfg = self.history[0]
+            if (
+                isinstance(best_cfg["max_mem_usage"], str)
+                or best_cfg["time"] == -1
+            ):
+                return (best_cfg, True)
+            first_few = 1
+            for cfg in self.history:
+                if (
+                    not isinstance(cfg["max_mem_usage"], str)
+                    and cfg["max_mem_usage"] < best_cfg["max_mem_usage"]
+                    and cfg["time"] != -1
+                ):
+                    best_cfg = cfg
+                first_few += 1
+                if first_few >= 5:
+                    break
+            return (best_cfg, False)
+        if (
+            isinstance(self.history[0]["max_mem_usage"], str)
+            or self.history[0]["time"] == -1
+        ):
+            return (self.history[0], True)
         return (self.history[0], False)
 
     def store_history(self, path="./history.csv"):
diff --git a/python/paddle/distributed/auto_tuner/utils.py b/python/paddle/distributed/auto_tuner/utils.py
index 43ac4bddf48e6..3f2dcf45fcd85 100644
--- a/python/paddle/distributed/auto_tuner/utils.py
+++ b/python/paddle/distributed/auto_tuner/utils.py
@@ -22,6 +22,8 @@
 
 def divisor(num, reverse=False):
     """Return the divisor of the given number."""
+    if num == 1:
+        return [num]
     results = set()
     i = 1
     mid = num // 2 + 1
@@ -34,26 +36,76 @@ def divisor(num, reverse=False):
     return sorted(results, reverse=reverse)
 
 
-def dist_degree(mode, num_gpus, num_nodes):
+def dist_degree(mode, num_gpus, num_nodes, tuner_cfg=None):
     """Return the degree of different parallel modes by gpus and nodes num."""
-    assert mode in ["dp", "mp", "pp", "sharding"]
+    assert mode in ["dp", "mp", "pp", "sharding", "mbs", "vpp"]
     results = []
+    prune_results = []
     if mode == "dp":
         results = divisor(num_gpus, reverse=False)
 
     elif mode == "pp":
-        if num_nodes > 1:
-            results = list(range(1, num_nodes + 1))
+        if num_nodes > 1 and tuner_cfg.get("enable_pp_prune", True):
+            results = list(range(num_nodes + 1, 0, -1))
         else:
             results = divisor(num_gpus, reverse=True)
+        for pp_degree in results:
+            prune_flag = False
+            num_layers = tuner_cfg["model_cfg"].get("num_layers", None)
+
+            if num_layers:
+                if num_layers % pp_degree != 0:
+                    prune_flag = True
+
+            if not prune_flag:
+                prune_results.append(pp_degree)
+        results = prune_results
 
     elif mode == "mp":
-        gpus_per_node = num_gpus // num_nodes
-        results = divisor(gpus_per_node, reverse=True)
+        if tuner_cfg.get("enable_mp_prune", True):
+            gpus_per_node = num_gpus // num_nodes
+            results = divisor(gpus_per_node, reverse=True)
+        else:
+            results = divisor(num_gpus, reverse=True)
+        for mp_degree in results:
+            prune_flag = False
+            hidden_size = tuner_cfg["model_cfg"].get("hidden_size", None)
+            vocab_size = tuner_cfg["model_cfg"].get("vocab_size", None)
+            num_attention_heads = tuner_cfg["model_cfg"].get(
+                "num_attention_heads", None
+            )
+            seq_length = tuner_cfg["model_cfg"].get("seq_length", None)
+            use_sequence_paralel = tuner_cfg.get("use_sequence_paralel", False)
+
+            if hidden_size and hidden_size % mp_degree != 0:
+                prune_flag = True
+
+            if vocab_size and vocab_size % mp_degree != 0:
+                prune_flag = True
+
+            if num_attention_heads and num_attention_heads % mp_degree != 0:
+                prune_flag = True
+
+            if (
+                seq_length
+                and seq_length % mp_degree != 0
+                and use_sequence_paralel
+            ):
+                prune_flag = True
+
+            if not prune_flag:
+                prune_results.append(mp_degree)
+        results = prune_results
 
     elif mode == "sharding":
         results = divisor(num_gpus, reverse=True)
 
+    elif mode == "mbs":
+        results = divisor(tuner_cfg["model_cfg"]["global_batch_size"])
+
+    elif mode == "vpp":
+        results = divisor(tuner_cfg["model_cfg"]["num_layers"], reverse=True)
+
     return results
 
 
@@ -65,29 +117,44 @@ def default_candidates(tuner_cfg):
     assert num_gpus > 0
 
     if tuner_cfg.get("dp_degree", None) == "auto":
-        candidates["dp_degree"] = dist_degree("dp", num_gpus, num_nodes)
+        candidates["dp_degree"] = dist_degree(
+            "dp", num_gpus, num_nodes, tuner_cfg
+        )
     elif tuner_cfg.get("dp_degree", None):
         candidates["dp_degree"] = tuner_cfg.get("dp_degree")
     else:
         candidates["dp_degree"] = [1]
 
     if tuner_cfg.get("mp_degree", None) == "auto":
-        candidates["mp_degree"] = dist_degree("mp", num_gpus, num_nodes)
+        candidates["mp_degree"] = dist_degree(
+            "mp", num_gpus, num_nodes, tuner_cfg
+        )
     elif tuner_cfg.get("mp_degree", None):
         candidates["mp_degree"] = tuner_cfg.get("mp_degree")
     else:
         candidates["mp_degree"] = [1]
 
     if tuner_cfg.get("pp_degree", None) == "auto":
-        candidates["pp_degree"] = dist_degree("pp", num_gpus, num_nodes)
+        candidates["pp_degree"] = dist_degree(
+            "pp", num_gpus, num_nodes, tuner_cfg
+        )
     elif tuner_cfg.get("pp_degree", None):
         candidates["pp_degree"] = tuner_cfg.get("pp_degree")
     else:
         candidates["pp_degree"] = [1]
 
+    if tuner_cfg.get("vpp_degree", None) == "auto":
+        candidates["vpp_degree"] = dist_degree(
+            "vpp", num_gpus, num_nodes, tuner_cfg
+        )
+    elif tuner_cfg.get("vpp_degree", None):
+        candidates["vpp_degree"] = tuner_cfg.get("vpp_degree")
+    else:
+        candidates["vpp_degree"] = [1]
+
     if tuner_cfg.get("sharding_degree", None) == "auto":
         candidates["sharding_degree"] = dist_degree(
-            "sharding", num_gpus, num_nodes
+            "sharding", num_gpus, num_nodes, tuner_cfg
         )
     elif tuner_cfg.get("sharding_degree", None):
         candidates["sharding_degree"] = tuner_cfg.get("sharding_degree")
@@ -95,14 +162,14 @@ def default_candidates(tuner_cfg):
         candidates["sharding_degree"] = [1]
 
     if tuner_cfg.get("sharding_stage", None) == "auto":
-        candidates["sharding_stage"] = [1, 2, 3]
+        candidates["sharding_stage"] = [3, 2, 1]
     elif tuner_cfg.get("sharding_stage", None):
         candidates["sharding_stage"] = tuner_cfg.get("sharding_stage")
     else:
         candidates["sharding_stage"] = [None]
 
     if tuner_cfg.get("use_recompute", None) == "auto":
-        candidates["use_recompute"] = [False, True]
+        candidates["use_recompute"] = [True, False]
     elif tuner_cfg.get("use_recompute", None):
         candidates["use_recompute"] = tuner_cfg.get("use_recompute")
     else:
@@ -118,8 +185,8 @@ def default_candidates(tuner_cfg):
         candidates["recompute_granularity"] = [None]
 
     if tuner_cfg.get("micro_batch_size", None) == "auto":
-        candidates["micro_batch_size"] = list(
-            range(tuner_cfg["model_cfg"]["global_batch_size"], 0, -1)
+        candidates["micro_batch_size"] = dist_degree(
+            "mbs", num_gpus, num_nodes, tuner_cfg
         )
     elif tuner_cfg.get("micro_batch_size", None):
         candidates["micro_batch_size"] = tuner_cfg.get("micro_batch_size")
@@ -136,32 +203,87 @@ def search_all(tuner_cfg):
     dp_degree_candidates = candidates["dp_degree"]
     mp_degree_candidates = candidates["mp_degree"]
     pp_degree_candidates = candidates["pp_degree"]
+    vpp_degree_candidates = candidates["vpp_degree"]
     mbs_candidates = candidates["micro_batch_size"]
     sharding_stage_candidates = candidates["sharding_stage"]
     sharding_degree_candidates = candidates["sharding_degree"]
     use_recompute_candidates = candidates["use_recompute"]
     recompute_granularity_candidates = candidates["recompute_granularity"]
-    all_cfgs = list(
+
+    num_gpus = tuner_cfg["num_gpus"]
+    valid_degrees = []
+
+    for mp_degree in mp_degree_candidates:
+        degrees = []
+        if num_gpus % mp_degree != 0:
+            continue
+        degrees.append(mp_degree)
+        sharding_res = num_gpus // mp_degree
+
+        for sharding_degree in sharding_degree_candidates:
+            if sharding_res % sharding_degree != 0:
+                continue
+            degrees.append(sharding_degree)
+            pp_res = sharding_res // sharding_degree
+
+            for pp_degree in pp_degree_candidates:
+                if pp_res % pp_degree != 0:
+                    continue
+                degrees.append(pp_degree)
+                dp_res = pp_res // pp_degree
+
+                for dp_degree in dp_degree_candidates:
+                    if dp_res != dp_degree:
+                        continue
+                    degrees.append(dp_degree)
+                    assert len(degrees) == 4
+                    valid_degrees.append(copy.deepcopy(degrees))
+                    degrees.pop()
+                degrees.pop()
+            degrees.pop()
+
+    other_dim_cfgs = list(
         itertools.product(
-            dp_degree_candidates,
-            sharding_degree_candidates,
             sharding_stage_candidates,
             mbs_candidates,
-            pp_degree_candidates,
-            mp_degree_candidates,
+            vpp_degree_candidates,
             use_recompute_candidates,
             recompute_granularity_candidates,
         )
     )
+
+    all_cfgs = []
+    for valid_degree in valid_degrees:
+        for other_dim_cfg in other_dim_cfgs:
+            mp_degree, sharding_degree, pp_degree, dp_degree = valid_degree
+            (
+                sharding_stage,
+                mbs,
+                vpp,
+                use_recompute,
+                recompute_granularity,
+            ) = list(other_dim_cfg)
+            if (
+                tuner_cfg["model_cfg"]["global_batch_size"]
+                % (mbs * sharding_degree * dp_degree)
+                != 0
+            ):
+                continue
+            if tuner_cfg["model_cfg"]["num_layers"] % (pp_degree * vpp) != 0:
+                continue
+            cfg = list(valid_degree) + list(other_dim_cfg)
+            all_cfgs.append(cfg)
+
     mapping = {
-        0: "dp_degree",
+        0: "mp_degree",
         1: "sharding_degree",
-        2: "sharding_stage",
-        3: "micro_batch_size",
-        4: "pp_degree",
-        5: "mp_degree",
-        6: "use_recompute",
-        7: "recompute_granularity",
+        2: "pp_degree",
+        3: "dp_degree",
+        4: "sharding_stage",
+        5: "micro_batch_size",
+        6: "vpp_degree",
+        7: "use_recompute",
+        8: "recompute_granularity",
     }
     new_all_cfgs = []
     for cfg in all_cfgs:
@@ -172,122 +294,99 @@ def search_all(tuner_cfg):
     return new_all_cfgs
 
 
-def gen_new_args(raw_args, cfg, tuner_cfg):
+def gen_new_args(raw_args, cfg, tuner_cfg, run_best=False):
     """Generate new script args."""
-    assert "run_cmd" in tuner_cfg
-    cmd = copy.deepcopy(tuner_cfg["run_cmd"])
-    res_args = copy.deepcopy(raw_args)
-    if "dp_degree" in cmd and "dp_degree" in cfg:
-        if "--" in cmd["dp_degree"][0]:
-            cmd["dp_degree"][1] = cmd["dp_degree"][1] + str(cfg["dp_degree"])
-            res_args.extend(cmd["dp_degree"])
-        else:
-            cmd["dp_degree"][1] = (
-                cmd["dp_degree"][1] + "=" + str(cfg["dp_degree"])
-            )
-            res_args.extend(cmd["dp_degree"])
-
-    if "mp_degree" in cmd and "mp_degree" in cfg:
-        if "--" in cmd["mp_degree"][0]:
-            cmd["mp_degree"][1] = cmd["mp_degree"][1] + str(cfg["mp_degree"])
-            res_args.extend(cmd["mp_degree"])
-        else:
-            cmd["mp_degree"][1] = (
-                cmd["mp_degree"][1] + "=" + str(cfg["mp_degree"])
-            )
-            res_args.extend(cmd["mp_degree"])
 
-    if "pp_degree" in cmd and "pp_degree" in cfg:
-        if "--" in cmd["pp_degree"][0]:
-            cmd["pp_degree"][1] = cmd["pp_degree"][1] + str(cfg["pp_degree"])
-            res_args.extend(cmd["pp_degree"])
-        else:
-            cmd["pp_degree"][1] = (
-                cmd["pp_degree"][1] + "=" + str(cfg["pp_degree"])
-            )
-            res_args.extend(cmd["pp_degree"])
-
-    if "micro_batch_size" in cmd and "micro_batch_size" in cfg:
-        if "--" in cmd["micro_batch_size"][0]:
-            cmd["micro_batch_size"][1] = cmd["micro_batch_size"][1] + str(
-                cfg["micro_batch_size"]
-            )
-            res_args.extend(cmd["micro_batch_size"])
-        else:
-            cmd["micro_batch_size"][1] = (
-                cmd["micro_batch_size"][1] + "=" + str(cfg["micro_batch_size"])
-            )
-            res_args.extend(cmd["micro_batch_size"])
-
-    if "sharding_degree" in cmd and "sharding_degree" in cfg:
-        if "--" in cmd["sharding_degree"][0]:
-            cmd["sharding_degree"][1] = cmd["sharding_degree"][1] + str(
-                cfg["sharding_degree"]
-            )
-            res_args.extend(cmd["sharding_degree"])
-        else:
-            cmd["sharding_degree"][1] = (
-                cmd["sharding_degree"][1] + "=" + str(cfg["sharding_degree"])
+    def _gen_new_arg(arg, cmd, cfg, res_args, tuner_cfg):
+        if arg in cmd and arg in cfg:
+            if "--" in cmd[arg][0]:
+                cmd[arg][1] = cmd[arg][1] + str(cfg[arg])
+                res_args.extend(cmd[arg])
+            elif "-o" in cmd[arg][0]:
+                cmd[arg][1] = cmd[arg][1] + "=" + str(cfg[arg])
+                res_args.extend(cmd[arg])
+            elif ".json" in cmd[arg][0]:
+                import json
+
+                file_path = cmd[arg][0]
+                try:
+                    with open(file_path, "r") as f:
+                        cmd_cfg = json.load(f)
+                except:
+                    raise ValueError(
+                        "Please check your auto tuner json whether valid."
+                    )
+                keys = cmd[arg][1].split(".")
+                for key in keys[: len(keys) - 1]:
+                    cmd_cfg = cmd_cfg[key]
+                cmd_cfg[keys[-1]] = cfg[arg]
+                json.dump(cmd_cfg, open(cmd[arg][0], "w"))
+            elif ".yaml" in cmd[arg][0]:
+                import yaml
+
+                file_path = cmd[arg][0]
+                try:
+                    with open(file_path, "r") as f:
+                        cmd_cfg = yaml.safe_load(f)
+                except:
+                    raise ValueError(
+                        "Please check your auto tuner json whether valid."
+                    )
+                keys = cmd[arg][1].split(".")
+                for key in keys[: len(keys) - 1]:
+                    cmd_cfg = cmd_cfg[key]
+                cmd_cfg[keys[-1]] = cfg[arg]
+                yaml.dump(cmd_cfg, open(cmd[arg][0], "w"))
+        elif arg == "local_batch_size" and arg in cmd:
+            local_batch_size = (
+                tuner_cfg["model_cfg"]["global_batch_size"]
+                // cfg["sharding_degree"]
+                // cfg["dp_degree"]
             )
-            res_args.extend(cmd["sharding_degree"])
-
-    if "sharding_stage" in cmd and "sharding_stage" in cfg:
-        if "--" in cmd["sharding_stage"][0]:
-            cmd["sharding_stage"][1] = cmd["sharding_stage"][1] + str(
-                cfg["sharding_stage"]
-            )
-            res_args.extend(cmd["sharding_stage"])
-        else:
-            cmd["sharding_stage"][1] = (
-                cmd["sharding_stage"][1] + "=" + str(cfg["sharding_stage"])
-            )
-            res_args.extend(cmd["sharding_stage"])
-
-    if "use_recompute" in cmd and "use_recompute" in cfg:
-        if "--" in cmd["use_recompute"][0]:
-            cmd["use_recompute"][1] = cmd["use_recompute"][1] + str(
-                cfg["use_recompute"]
-            )
-            res_args.extend(cmd["use_recompute"])
-        else:
-            cmd["use_recompute"][1] = (
-                cmd["use_recompute"][1] + "=" + str(cfg["use_recompute"])
-            )
-            res_args.extend(cmd["use_recompute"])
-
-    if "recompute_granularity" in cmd and "recompute_granularity" in cfg:
-        if "--" in cmd["recompute_granularity"][0]:
-            cmd["recompute_granularity"][1] = cmd["recompute_granularity"][
-                1
-            ] + str(cfg["recompute_granularity"])
-            res_args.extend(cmd["recompute_granularity"])
-        else:
-            cmd["recompute_granularity"][1] = (
-                cmd["recompute_granularity"][1]
-                + "="
-                + str(cfg["recompute_granularity"])
-            )
-            res_args.extend(cmd["recompute_granularity"])
-
-    if "local_batch_size" in cmd:
-        local_batch_size = (
-            tuner_cfg["model_cfg"]["global_batch_size"]
-            // cfg["sharding_degree"]
-            // cfg["dp_degree"]
-        )
-        if "--" in cmd["local_batch_size"][0]:
-            cmd["local_batch_size"][1] = cmd["local_batch_size"][1] + str(
-                local_batch_size
-            )
-            res_args.extend(cmd["local_batch_size"])
-        else:
-            cmd["local_batch_size"][1] = (
-                cmd["local_batch_size"][1] + "=" + str(local_batch_size)
-            )
-            res_args.extend(cmd["local_batch_size"])
-
-    if "gradient_accumulation_steps" in cmd:
-        if "--" in cmd["gradient_accumulation_steps"][0]:
+            if "--" in cmd["local_batch_size"][0]:
+                cmd["local_batch_size"][1] = cmd["local_batch_size"][1] + str(
+                    local_batch_size
+                )
+                res_args.extend(cmd["local_batch_size"])
+            elif "-o" in cmd["local_batch_size"][0]:
+                cmd["local_batch_size"][1] = (
+                    cmd["local_batch_size"][1] + "=" + str(local_batch_size)
+                )
+                res_args.extend(cmd["local_batch_size"])
+            elif ".json" in cmd[arg][0]:
+                import json
+
+                file_path = cmd[arg][0]
+                try:
+                    with open(file_path, "r") as f:
+                        cmd_cfg = json.load(f)
+                except:
+                    raise ValueError(
+                        "Please check your auto tuner json whether valid."
+                    )
+                keys = cmd[arg][1].split(".")
+                for key in keys[: len(keys) - 1]:
+                    cmd_cfg = cmd_cfg[key]
+                cmd_cfg[keys[-1]] = local_batch_size
+                json.dump(cmd_cfg, open(cmd[arg][0], "w"))
+            elif ".yaml" in cmd[arg][0]:
+                import yaml
+
+                file_path = cmd[arg][0]
+                try:
+                    with open(file_path, "r") as f:
+                        cmd_cfg = yaml.safe_load(f)
+                except:
+                    raise ValueError(
+                        "Please check your auto tuner json whether valid."
+                    )
+                keys = cmd[arg][1].split(".")
+                for key in keys[: len(keys) - 1]:
+                    cmd_cfg = cmd_cfg[key]
+                cmd_cfg[keys[-1]] = local_batch_size
+                yaml.dump(cmd_cfg, open(cmd[arg][0], "w"))
+
+        elif arg == "gradient_accumulation_steps" and arg in cmd:
             try:
                 gradient_accumulation_steps = (
                     tuner_cfg["model_cfg"]["global_batch_size"]
@@ -295,28 +394,149 @@ def gen_new_args(raw_args, cfg, tuner_cfg):
                     // cfg["dp_degree"]
                     // cfg["micro_batch_size"]
                 )
+            except:
+                return
+            if "--" in cmd["gradient_accumulation_steps"][0]:
                 cmd["gradient_accumulation_steps"][1] = cmd[
                     "gradient_accumulation_steps"
                 ][1] + str(gradient_accumulation_steps)
                 res_args.extend(cmd["gradient_accumulation_steps"])
-            except:
-                pass
-        else:
-            try:
-                gradient_accumulation_steps = (
-                    tuner_cfg["model_cfg"]["global_batch_size"]
-                    // cfg["sharding_degree"]
-                    // cfg["dp_degree"]
-                    // cfg["micro_batch_size"]
-                )
+
+            elif "-o" in cmd["gradient_accumulation_steps"][0]:
                 cmd["gradient_accumulation_steps"][1] = (
                     cmd["gradient_accumulation_steps"][1]
                     + "="
                     + str(gradient_accumulation_steps)
                 )
                 res_args.extend(cmd["gradient_accumulation_steps"])
-            except:
-                pass
+            elif ".json" in cmd[arg][0]:
+                import json
+
+                file_path = cmd[arg][0]
+                try:
+                    with open(file_path, "r") as f:
+                        cmd_cfg = json.load(f)
+                except:
+                    raise ValueError(
+                        "Please check your auto tuner json whether valid."
+                    )
+                keys = cmd[arg][1].split(".")
+                for key in keys[: len(keys) - 1]:
+                    cmd_cfg = cmd_cfg[key]
+                cmd_cfg[keys[-1]] = gradient_accumulation_steps
+                json.dump(cmd_cfg, open(cmd[arg][0], "w"))
+            elif ".yaml" in cmd[arg][0]:
+                import yaml
+
+                file_path = cmd[arg][0]
+                try:
+                    with open(file_path, "r") as f:
+                        cmd_cfg = yaml.safe_load(f)
+                except:
+                    raise ValueError(
+                        "Please check your auto tuner json whether valid."
+                    )
+                keys = cmd[arg][1].split(".")
+                for key in keys[: len(keys) - 1]:
+                    cmd_cfg = cmd_cfg[key]
+                cmd_cfg[keys[-1]] = gradient_accumulation_steps
+                yaml.dump(cmd_cfg, open(cmd[arg][0], "w"))
+
+    assert "run_cmd" in tuner_cfg
+    cmd = copy.deepcopy(tuner_cfg["run_cmd"])
+    res_args = copy.deepcopy(raw_args)
+
+    _gen_new_arg("dp_degree", cmd, cfg, res_args, tuner_cfg)
+    _gen_new_arg("mp_degree", cmd, cfg, res_args, tuner_cfg)
+    _gen_new_arg("pp_degree", cmd, cfg, res_args, tuner_cfg)
+    _gen_new_arg("vpp_degree", cmd, cfg, res_args, tuner_cfg)
+    _gen_new_arg("micro_batch_size", cmd, cfg, res_args, tuner_cfg)
+    _gen_new_arg("sharding_degree", cmd, cfg, res_args, tuner_cfg)
+    _gen_new_arg("sharding_stage", cmd, cfg, res_args, tuner_cfg)
+    _gen_new_arg("use_recompute", cmd, cfg, res_args, tuner_cfg)
+    _gen_new_arg("recompute_granularity", cmd, cfg, res_args, tuner_cfg)
+    _gen_new_arg("local_batch_size", cmd, cfg, res_args, tuner_cfg)
+    _gen_new_arg("gradient_accumulation_steps", cmd, cfg, res_args, tuner_cfg)
+
+    if tuner_cfg["run_cmd"].get("search_stage", None) and not run_best:
+        cmd = copy.deepcopy(tuner_cfg["run_cmd"]["search_stage"])
+        for arg in cmd:
+            if "--" in cmd[arg][0]:
+                res_args.extend(cmd[arg])
+            elif "-o" in cmd[arg][0]:
+                res_args.extend(cmd[arg])
+            elif ".json" in cmd[arg][0]:
+                import json
+
+                file_path = cmd[arg][0]
+                try:
+                    with open(file_path, "r") as f:
+                        cmd_cfg = json.load(f)
+                except:
+                    raise ValueError(
+                        "Please check your auto tuner json whether valid."
+                    )
+                keys = cmd[arg][1].split(".")
+                for key in keys[: len(keys) - 1]:
+                    cmd_cfg = cmd_cfg[key]
+                cmd_cfg[keys[-1]] = cmd[arg][2]
+                json.dump(cmd_cfg, open(cmd[arg][0], "w"))
+            elif ".yaml" in cmd[arg][0]:
+                import yaml
+
+                file_path = cmd[arg][0]
+                try:
+                    with open(file_path, "r") as f:
+                        cmd_cfg = yaml.safe_load(f)
+                except:
+                    raise ValueError(
+                        "Please check your auto tuner json whether valid."
+                    )
+                keys = cmd[arg][1].split(".")
+                for key in keys[: len(keys) - 1]:
+                    cmd_cfg = cmd_cfg[key]
+                cmd_cfg[keys[-1]] = cmd[arg][2]
+                yaml.dump(cmd_cfg, open(cmd[arg][0], "w"))
+
+    if tuner_cfg["run_cmd"].get("run_best_stage", None) and run_best:
+        cmd = copy.deepcopy(tuner_cfg["run_cmd"]["run_best_stage"])
+        for arg in cmd:
+            if "--" in cmd[arg][0]:
+                res_args.extend(cmd[arg])
+            elif "-o" in cmd[arg][0]:
+                res_args.extend(cmd[arg])
+            elif ".json" in cmd[arg][0]:
+                import json
+
+                file_path = cmd[arg][0]
+                try:
+                    with open(file_path, "r") as f:
+                        cmd_cfg = json.load(f)
+                except:
+                    raise ValueError(
+                        "Please check your auto tuner json whether valid."
+                    )
+                keys = cmd[arg][1].split(".")
+                for key in keys[: len(keys) - 1]:
+                    cmd_cfg = cmd_cfg[key]
+                cmd_cfg[keys[-1]] = cmd[arg][2]
+                json.dump(cmd_cfg, open(cmd[arg][0], "w"))
+            elif ".yaml" in cmd[arg][0]:
+                import yaml
+
+                file_path = cmd[arg][0]
+                try:
+                    with open(file_path, "r") as f:
+                        cmd_cfg = yaml.safe_load(f)
+                except:
+                    raise ValueError(
+                        "Please check your auto tuner json whether valid."
+                    )
+                keys = cmd[arg][1].split(".")
+                for key in keys[: len(keys) - 1]:
+                    cmd_cfg = cmd_cfg[key]
+                cmd_cfg[keys[-1]] = cmd[arg][2]
+                yaml.dump(cmd_cfg, open(cmd[arg][0], "w"))
 
     return res_args
 
@@ -352,7 +572,16 @@ def read_metric_log(
                 re_out_of_memory_pattern, line, re.IGNORECASE
             )
             if metric:
-                metric_list.append(float(metric[0][0]))
+                value = None
+                for item in metric[0]:
+                    try:
+                        value = float(item)
+                        metric_list.append(value)
+                        break
+                    except:
+                        continue
+                assert value is not None
+
             if out_of_memory:
                 out_of_memory_flag = 1
 
diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py
index 3cee9b9b4e143..0a7dff06dc227 100644
--- a/python/paddle/distributed/launch/main.py
+++ b/python/paddle/distributed/launch/main.py
@@ -295,6 +295,7 @@ def launch():
     elif ctx.is_auto_tuner_mode():
         import copy
         import json
+        import logging
         import os
         import sys
         import time
@@ -306,12 +307,28 @@ def launch():
 
         start_time = time.time()
         # read user defined tuner config json
+        if not ctx.args.auto_tuner_json.endswith(".json"):
+            raise ValueError("Please use '.json' as the file name suffix.")
         try:
             with open(ctx.args.auto_tuner_json, "r") as f:
                 tuner_cfg = json.load(f)
         except:
             raise ValueError("Please check your auto tuner json whether valid.")
 
+        logger = logging.getLogger('auto_tuner')
+        logger.setLevel(logging.INFO)
+        auto_tuner_log_path = os.path.join(
+            os.path.dirname(ctx.args.auto_tuner_json),
+            f'{os.path.basename(ctx.args.auto_tuner_json).split(".")[0]}_auto_tuner.log',
+        )
+        handler = logging.FileHandler(auto_tuner_log_path, mode="w")
+        handler.setLevel(logging.INFO)
+        formatter = logging.Formatter(
+            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+        )
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+
         # copy training script args
         if ctx.args.training_script.endswith('.py'):
             entrypoint = [sys.executable, "-u", ctx.args.training_script]
@@ -332,7 +349,12 @@ def launch():
             nnodes = int(nnodes)
         tuner_cfg["nodes"] = nnodes
         tuner_cfg["num_gpus"] = gpus_per_node * tuner_cfg["nodes"]
+        mode = tuner_cfg.get("mode", None)
 
+        history_file_path = os.path.join(
+            os.path.dirname(ctx.args.auto_tuner_json),
+            f'{os.path.basename(ctx.args.auto_tuner_json).split(".")[0]}_history.csv',
+        )
         if nnodes > 1:
             from .utils.etcd_client import ETCDClient
 
@@ -353,6 +375,9 @@ def launch():
             else tuner_cfg.get("warmup_time")
         )
 
+        # max_search_time
+        max_search_time = tuner_cfg.get("max_search_time", None)
+
         is_first_task = True
         # build history recorder
         recorder = HistoryRecorder()
@@ -405,6 +430,11 @@ def launch():
                         task_job_id, log_dir, gbs_cur_cfg
                     )
                 )
+                logger.info(
+                    "Launch task from auto tuner: job_id {}, log_dir {}, config {}".format(
+                        task_job_id, log_dir, gbs_cur_cfg
+                    )
+                )
                 c = controllers.init(ctx)
                 c.run()
 
@@ -422,6 +452,9 @@ def launch():
                     ctx.logger.warning(
                         f"Read metric failed for parameters: {log_dir}"
                     )
+                    logger.warning(
+                        f"Read metric failed for parameters: {log_dir}"
+                    )
                     # for pruner use
                     gbs_cur_cfg['time'] = -1
                     gbs_cur_cfg[tuner_cfg['metric_cfg']['name']] = None
@@ -431,6 +464,7 @@ def launch():
                     ctx.logger.warning(
                         f"Out of memory for parameters: {log_dir}"
                     )
+                    logger.warning(f"Out of memory for parameters: {log_dir}")
                     # for pruner use
                     gbs_cur_cfg['time'] = -1
                     gbs_cur_cfg[tuner_cfg['metric_cfg']['name']] = None
@@ -441,6 +475,9 @@ def launch():
                     ctx.logger.warning(
                         f"Read memory usage failed for parameters: {log_dir}"
                     )
+                    logger.warning(
+                        f"Read memory usage failed for parameters: {log_dir}"
+                    )
                     gbs_cur_cfg["max_mem_usage"] = None
 
                 if not err:
@@ -484,26 +521,40 @@ def launch():
             ctx.logger.info(
                 f"AtuoTuner for GBS search ends in {end_time-start_time}s."
             )
+            logger.info(
+                f"AtuoTuner for GBS search ends in {end_time-start_time}s."
+            )
+
         # build AutoTuner to get new config
         auto_tuner = AutoTuner(tuner_cfg)
         cur_cfg = auto_tuner.search_once()
         auto_tuner.add_cfg(cur_cfg)
-
+        assert cur_cfg is not None, "No config can run."
         while cur_cfg:
+            task_start_time = time.time()
             ctx = copy.deepcopy(raw_ctx)
             if is_first_task:
                 ctx.max_time_per_task = warmup_time
             is_first_task = False
             # auto tuner supports dp, mp, pp, micro batch size, sharding, recompute by default and every task has own log dir
-            log_dir = "DP{}_MP{}_PP{}_Sharding_degree_{}_stage_{}_MBS_{}_Recompute_{}_granularity_{}".format(
+            acc_steps = (
+                tuner_cfg["model_cfg"]["global_batch_size"]
+                // cur_cfg["dp_degree"]
+                // cur_cfg["sharding_degree"]
+                // cur_cfg["micro_batch_size"]
+            )
+            cur_cfg["acc_steps"] = acc_steps
+            log_dir = "DP{}_MP{}_PP{}_VPP_{}_Sharding_degree_{}_stage_{}_MBS_{}_Recompute_{}_granularity_{}_AccStep_{}".format(
                 cur_cfg["dp_degree"],
                 cur_cfg["mp_degree"],
                 cur_cfg["pp_degree"],
+                cur_cfg["vpp_degree"],
                 cur_cfg["sharding_degree"],
                 cur_cfg["sharding_stage"],
                 cur_cfg["micro_batch_size"],
                 cur_cfg["use_recompute"],
                 cur_cfg["recompute_granularity"],
+                cur_cfg["acc_steps"],
             )
 
             ctx.args.log_dir = log_dir
@@ -523,9 +574,26 @@ def launch():
                     task_job_id, log_dir, cur_cfg
                 )
             )
+            logger.info(
+                "Launch task from auto tuner: job_id {}, log_dir {}, config {}".format(
+                    task_job_id, log_dir, cur_cfg
+                )
+            )
             c = controllers.init(ctx)
             c.run()
 
+            task_end_time = time.time()
+            cur_cfg["exec_time"] = round(task_end_time - task_start_time, 2)
+            ctx.logger.info(
+                "Task: job_id {}, log_dir {}, config {} ends in {}s".format(
+                    task_job_id, log_dir, cur_cfg, cur_cfg["exec_time"]
+                )
+            )
+            logger.info(
+                "Task: job_id {}, log_dir {}, config {} ends in {}s".format(
+                    task_job_id, log_dir, cur_cfg, cur_cfg["exec_time"]
+                )
+            )
             # process generated result
 
             metric, mem, err = read_log(
@@ -536,6 +604,7 @@ def launch():
             )
             # sync sigint
             timeout_flag = True
+            OOM_flag = err & (1 << 1)
 
             if nnodes > 1:
                 import socket
@@ -548,16 +617,22 @@ def launch():
                     ip = '127.0.0.1'
                 assert ip != '127.0.0.1'
                 path = f"auto_tuner/{job_id}/{ip}"
-                OOM_flag = err & (1 << 1)
                 if OOM_flag:
                     client.put(path, "OOM".encode('latin-1'))
                     ctx.logger.info(f"Put OOM to {path}")
+                    logger.info(f"Put OOM to {path}")
                 elif hasattr(c, 'sigint') and c.sigint == 14:
                     client.put(path, "OK".encode('latin-1'))
                     ctx.logger.info(f"Put OK to {path}")
+                    logger.info(f"Put OK to {path}")
+                elif not hasattr(c, 'sigint') and c.pod.exit_code == 0:
+                    client.put(path, "OK".encode('latin-1'))
+                    ctx.logger.info(f"Put OK to {path}")
+                    logger.info(f"Put OK to {path}")
                 else:
                     client.put(path, "Error".encode('latin-1'))
                     ctx.logger.info(f"Put Error to {path}")
+                    logger.info(f"Put Error to {path}")
 
                 result = list(client.get_prefix(f"auto_tuner/{job_id}/"))
                 size = len(result)
@@ -568,9 +643,11 @@ def launch():
 
                 status = [i[0].decode() for i in result]
                 ctx.logger.info(f"Status of auto_tuner/{job_id}/: {status}")
+                logger.info(f"Status of auto_tuner/{job_id}/: {status}")
 
                 if "OOM" in status:
                     timeout_flag = False
+                    OOM_flag = True
                 elif "OK" not in status:
                     timeout_flag = False
 
@@ -578,13 +655,15 @@ def launch():
                 ctx.logger.warning(
                     f"Read metric failed for parameters: {log_dir}"
                 )
+                logger.warning(f"Read metric failed for parameters: {log_dir}")
                 # for pruner use
                 cur_cfg['time'] = -1
                 cur_cfg[tuner_cfg['metric_cfg']['name']] = None
-                cur_cfg["max_mem_usage"] = mem
+                cur_cfg["max_mem_usage"] = mem if not OOM_flag else "OOM"
 
             if err & (1 << 1):
                 ctx.logger.warning(f"Out of memory for parameters: {log_dir}")
+                logger.warning(f"Out of memory for parameters: {log_dir}")
                 # for pruner use
                 cur_cfg['time'] = -1
                 cur_cfg[tuner_cfg['metric_cfg']['name']] = None
@@ -595,18 +674,21 @@ def launch():
                 ctx.logger.warning(
                     f"Read memory usage failed for parameters: {log_dir}"
                 )
-                cur_cfg["max_mem_usage"] = None
+                logger.warning(
+                    f"Read memory usage failed for parameters: {log_dir}"
+                )
+                cur_cfg["max_mem_usage"] = None if not OOM_flag else "OOM"
 
             if not err and timeout_flag:
                 # for pruner use
                 cur_cfg['time'] = metric
                 cur_cfg[tuner_cfg['metric_cfg']['name']] = metric
-                cur_cfg["max_mem_usage"] = mem
+                cur_cfg["max_mem_usage"] = mem if not OOM_flag else "OOM"
 
             if not err and not timeout_flag:
                 cur_cfg['time'] = -1
                 cur_cfg[tuner_cfg['metric_cfg']['name']] = None
-                cur_cfg["max_mem_usage"] = None
+                cur_cfg["max_mem_usage"] = None if not OOM_flag else "OOM"
 
             # record history
             cur_cfg['job_id'] = job_id
@@ -617,13 +699,15 @@ def launch():
             )
             if not err:
                 ctx.logger.info(f"Current best config: {cur_best_cfgs}")
-                recorder.store_history(
-                    ctx.args.auto_tuner_json.split(".")[0] + "_history.csv"
-                )
+                logger.info(f"Current best config: {cur_best_cfgs}")
+                recorder.store_history(history_file_path)
             else:
                 ctx.logger.info(
                     "Get best config failed. Currently there are no appropriate configs."
                 )
+                logger.info(
+                    "Get best config failed. Currently there are no appropriate configs."
+                )
             c.finalize(exit=False)
 
             # generate a new config
@@ -641,7 +725,12 @@ def launch():
                 if pid != self_pid:
                     os.system("kill -9 " + pid)
             time.sleep(3)
-        recorder.store_history()
+            end_time = time.time()
+            if max_search_time and (end_time - start_time) > int(
+                max_search_time
+            ):
+                break
+        recorder.store_history(history_file_path)
 
         # get best config to run
         best_cfg = None
@@ -662,6 +751,7 @@ def launch():
                 best_cfg, err = recorder.get_best(
                     metric=tuner_cfg['metric_cfg']['name'],
                     direction=tuner_cfg['metric_cfg']['OptimizationDirection'],
+                    mode=mode,
                 )
                 if err:
                     raise ValueError(
@@ -678,6 +768,7 @@ def launch():
                         best_cfg = json.loads(data)
                     except Exception as e:
                         ctx.logger.warning(e)
+                        logger.warning(e)
                         time.sleep(2)
                     if best_cfg:
                         break
@@ -686,21 +777,24 @@ def launch():
             best_cfg, err = recorder.get_best(
                 metric=tuner_cfg['metric_cfg']['name'],
                 direction=tuner_cfg['metric_cfg']['OptimizationDirection'],
+                mode=mode,
             )
             if err:
                 raise ValueError(
                     "Get best config failed. Currently there are no appropriate configs."
                 )
-        assert best_cfg
+        assert best_cfg and best_cfg["time"] != -1
 
         end_time = time.time()
         ctx.logger.info(f"AutoTuner ends in {end_time-start_time}s.")
+        logger.info(f"AutoTuner ends in {end_time-start_time}s.")
         # launch best cfg
-        new_args = gen_new_args(raw_args, best_cfg, tuner_cfg)
+        new_args = gen_new_args(raw_args, best_cfg, tuner_cfg, run_best=True)
         ctx.run_best = True
         ctx.args.training_script_args = new_args
         ctx.args.job_id = "best_cfg"
         ctx.logger.info(f"Launch best cfg from auto tuner: {best_cfg}")
+        logger.info(f"Launch best cfg from auto tuner: {best_cfg}")
         ctx.args.log_dir = "best_cfg"
         # run best cfg
         c = controllers.init(ctx)
diff --git a/test/auto_parallel/test_auto_tuner.py b/test/auto_parallel/test_auto_tuner.py
index ad0eb94d48c78..77d8773ddbeec 100644
--- a/test/auto_parallel/test_auto_tuner.py
+++ b/test/auto_parallel/test_auto_tuner.py
@@ -62,7 +62,7 @@ def test_auto_tuner(self):
                 "recompute_granularity": ["-o", "Model.recompute_granularity"],
             },
             "metric_cfg": {
-                "name": "step/s",
+                "name": "ms/step",
                 "OptimizationDirection": "Maximize",
             },
         }
diff --git a/test/auto_parallel/test_auto_tuner_compare.py b/test/auto_parallel/test_auto_tuner_compare.py
index b693d72493fda..6b979ba3343f8 100644
--- a/test/auto_parallel/test_auto_tuner_compare.py
+++ b/test/auto_parallel/test_auto_tuner_compare.py
@@ -62,7 +62,7 @@ def test_auto_tuner_compare(self):
                 "recompute_granularity": ["-o", "Model.recompute_granularity"],
             },
             "metric_cfg": {
-                "name": "step/s",
+                "name": "ms/step",
                 "OptimizationDirection": "Maximize",
             },
         }

From 571ff2a15331f962c112d7b9684450ef7fd34d3a Mon Sep 17 00:00:00 2001
From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com>
Date: Mon, 25 Sep 2023 21:03:47 +0800
Subject: [PATCH 111/115] [Semi-Auto] Adapt embedding spmd rule to phi (#57545)

* adapt_phi/embedding

* fix code style
---
 .../auto_parallel/spmd_rules/rules.h          |   5 -
 .../auto_parallel/inferspmd_utils.cc          |   1 +
 .../auto_parallel/inferspmd_utils.h           |   1 +
 .../infermeta/spmd_rules/embedding.cc}        | 156 ++++++++----------
 .../infermeta/spmd_rules/embedding.h}         |  31 ++--
 paddle/phi/infermeta/spmd_rules/rules.h       |  11 ++
 .../spmd_rules/test_embedding_rule.py         |  60 ++++---
 7 files changed, 135 insertions(+), 130 deletions(-)
 rename paddle/{fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.cc => phi/infermeta/spmd_rules/embedding.cc} (62%)
 rename paddle/{fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h => phi/infermeta/spmd_rules/embedding.h} (58%)

diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
index 46806ce4daab7..b1606fceb41dc 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
@@ -16,7 +16,6 @@
 
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/cross_entropy_with_softmax_spmd_rule.h"
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h"
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h"
@@ -30,10 +29,6 @@ namespace auto_parallel {
 // replicated rule
 REGISTER_SPMD_RULE(replicated, ReplicatedSPMDRule);
 
-// embedding rule
-REGISTER_SPMD_RULE(embedding, EmbeddingSPMDRule);
-REGISTER_SPMD_RULE(lookup_table_v2, EmbeddingSPMDRule);
-
 // softmax rule
 REGISTER_SPMD_RULE(softmax, SoftmaxSPMDRule);
 REGISTER_SPMD_RULE(log_softmax, SoftmaxSPMDRule);
diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
index 485e2f09a42e9..56a9d618fec3e 100644
--- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
+++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
@@ -55,6 +55,7 @@ AttrType InferSpmdContext::AttrAt(size_t idx) const {
 
 template float InferSpmdContext::AttrAt(size_t idx) const;
 template int InferSpmdContext::AttrAt(size_t idx) const;
+template int64_t InferSpmdContext::AttrAt(size_t idx) const;
 
 template <>
 bool InferSpmdContext::AttrAt(size_t idx) const {
diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
index 499c2340983a7..4781b5d872001 100644
--- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
@@ -155,6 +155,7 @@ struct InferSpmdFnImpl<Return (*)(Args...), infer_spmd_fn> {
   PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(bool);
   PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(int);
   PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(float);
+  PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(int64_t);
   PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(std::vector<int>);
   PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_CONST_ATTRIBUTE_REF(
       std::vector<int64_t>);
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.cc b/paddle/phi/infermeta/spmd_rules/embedding.cc
similarity index 62%
rename from paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.cc
rename to paddle/phi/infermeta/spmd_rules/embedding.cc
index 77a60abe090d5..b9d3ee7904ba7 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.cc
+++ b/paddle/phi/infermeta/spmd_rules/embedding.cc
@@ -12,31 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h"
+#include "paddle/phi/infermeta/spmd_rules/embedding.h"
 
-namespace paddle {
+#include "glog/logging.h"
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
+#include "paddle/phi/core/distributed/auto_parallel/utils.h"
+#include "paddle/phi/infermeta/spmd_rules/utils.h"
+
+namespace phi {
 namespace distributed {
-namespace auto_parallel {
 
 using phi::distributed::auto_parallel::str_join;
 
-// step0: verify input args based on embedding logic
-std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-EmbeddingSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
-                                const paddle::framework::AttributeMap& attrs) {
-  auto input_specs_size = input_specs.size();
-  PADDLE_ENFORCE_EQ(
-      input_specs_size,
-      2,
-      phi::errors::InvalidArgument(
-          "The size of InputSpec of embedding should be 2, but got [%d].",
-          input_specs_size));
-  auto x_shape = input_specs[0].shape();
-  auto weight_shape = input_specs[1].shape();
+SpmdInfo EmbeddingInferSpmd(const DistMetaTensor& x,
+                            const DistMetaTensor& weight,
+                            int padding_idx,
+                            bool sparse) {
+  // Step0: Verify input args based on embedding logic
+  auto x_shape = phi::vectorize(x.dims());
+  auto weight_shape = phi::vectorize(weight.dims());
   int x_ndim = static_cast<int>(x_shape.size());
   int weight_ndim = static_cast<int>(weight_shape.size());
-  auto x_dist_attr_src = input_specs[0].dist_attr();
-  auto weight_dist_attr_src = input_specs[1].dist_attr();
+  auto x_dist_attr_src = x.dist_attr();
+  auto weight_dist_attr_src = weight.dist_attr();
   std::vector<int64_t> x_dims_mapping = x_dist_attr_src.dims_mapping();
   std::vector<int64_t> weight_dims_mapping =
       weight_dist_attr_src.dims_mapping();
@@ -44,17 +44,17 @@ EmbeddingSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
   PADDLE_ENFORCE_EQ(
       x_ndim,
       x_dims_mapping.size(),
-      phi::errors::InvalidArgument(
-          "Mismatch of X's tensor size: [%d] and X's dims_mapping size [%d].",
-          x_ndim,
-          x_dims_mapping.size()));
+      phi::errors::InvalidArgument("The Tensor X's rank [%d] and X's "
+                                   "dims_mapping size [%d] are not matched.",
+                                   x_ndim,
+                                   x_dims_mapping.size()));
   PADDLE_ENFORCE_EQ(
       weight_ndim,
       weight_dims_mapping.size(),
-      phi::errors::InvalidArgument(
-          "Mismatch of W's tensor size: [%d] and W's dims_mapping size [%d].",
-          weight_ndim,
-          weight_dims_mapping.size()));
+      phi::errors::InvalidArgument("Tensor W's tensor rank [%d] and W's "
+                                   "dims_mapping size [%d] are not matched.",
+                                   weight_ndim,
+                                   weight_dims_mapping.size()));
   PADDLE_ENFORCE_EQ(
       weight_ndim,
       2,
@@ -62,9 +62,6 @@ EmbeddingSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
                                    "but got a tensor with [%d] dimension.",
                                    weight_ndim));
 
-  int64_t padding_idx = ExtractAttr<int64_t>("padding_idx", attrs);
-  bool sparse = ExtractAttr<bool>("sparse", attrs);
-
   // determine parallel mode
   int64_t weight_row_axis_mapping = weight_dims_mapping[0];
 
@@ -103,35 +100,25 @@ EmbeddingSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
           << "sparse: "
           << "[" << (sparse ? "true" : "false") << "]; ";
 
-  // step1: build Einsum Notation
+  // Step1: Build Einsum Notation
   std::string alphabet = "abcdefghilmnopqrstuvwxyz";
   std::string x_axes = GetBroadcastAxes(x_ndim, x_ndim, alphabet);
   std::string weight_axes = "jk";
   std::string out_axes = x_axes + "k";
 
-  // step2: Sharding Propogation
+  // Step2: Sharding Propogation
+  // Step2.1: merge input shardings
   auto axis_to_dim_map = ShardingMergeForTensors(
       {{x_axes, x_dims_mapping}, {weight_axes, weight_dims_mapping}}, false);
 
-  // step3: Infer Output's Dims Mapping.
-  TensorDistAttr output_dist_attr_dst =
-      CopyTensorDistAttrForOutput(x_dist_attr_src);
-  std::vector<int64_t> out_dims_mapping;
-  out_dims_mapping.reserve(out_axes.size());
-  for (size_t i = 0; i < out_axes.size(); ++i) {
-    out_dims_mapping.push_back(axis_to_dim_map[out_axes.substr(i, 1)]);
-  }
-  output_dist_attr_dst.set_dims_mapping(out_dims_mapping);
-
-  // step3.1: Handle Partial
-  // (TODO) support case where embedding table is partial at very beginning.
-  std::vector<int64_t> partial_on_dims;
-  if (weight_row_axis_mapping > -1) {
-    partial_on_dims.push_back(weight_row_axis_mapping);
-  }
-  output_dist_attr_dst.set_partial_status(partial_on_dims);
+  // Step2.2: infer output's dims mapping.
+  TensorDistAttr out_dist_attr = CopyTensorDistAttrForOutput(x_dist_attr_src);
+  std::vector<int64_t> out_dims_mapping =
+      GetDimsMappingForAxes(out_axes, axis_to_dim_map);
+  out_dist_attr.set_dims_mapping(out_dims_mapping);
 
-  // step4: merge potential conflict in inputs
+  // Step2.3: merge potential conflict in inputs,
+  // update input dims mapping with merged shardings.
   TensorDistAttr x_dist_attr_dst = CopyTensorDistAttrForOutput(x_dist_attr_src);
   x_dist_attr_dst.set_dims_mapping(
       GetDimsMappingForAxes(x_axes, axis_to_dim_map));
@@ -140,38 +127,39 @@ EmbeddingSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
   weight_dist_attr_dst.set_dims_mapping(
       GetDimsMappingForAxes(weight_axes, axis_to_dim_map));
 
-  VLOG(4) << "EmbeddingSPMDRule InferForward: "
+  // Step3: Handle Partial
+  // (TODO) support case where embedding table is partial at very beginning.
+  std::vector<int64_t> partial_on_dims;
+  if (weight_row_axis_mapping > -1) {
+    partial_on_dims.push_back(weight_row_axis_mapping);
+  }
+  out_dist_attr.set_partial_status(partial_on_dims);
+
+  VLOG(4) << "EmbeddingInferSpmd:\n"
           << "Einsum notation: [" << x_axes << "," << weight_axes << " --> "
           << out_axes << "]. " << std::endl
           << "X shape: [" << str_join(x_shape) << "], src_dims_mapping: ["
           << str_join(x_dims_mapping) << "], dst_dims_mapping: ["
-          << str_join(x_dist_attr_dst.dims_mapping()) << "]; Y shape: ["
+          << str_join(x_dist_attr_dst.dims_mapping()) << "]\n W shape: ["
           << str_join(weight_shape) << "], src_dims_mapping: ["
           << str_join(weight_dims_mapping) << "], dst_dims_mapping: ["
           << str_join(weight_dist_attr_dst.dims_mapping())
-          << "]; Output dims_mapping: [" << str_join(out_dims_mapping)
-          << "], partial_on_dims: [" << str_join(partial_on_dims) << "]";
+          << "]\n Out dims_mapping: [" << str_join(out_dims_mapping)
+          << "], partial_on_dims: [" << str_join(partial_on_dims) << "]\n\n";
 
-  return {{x_dist_attr_dst, weight_dist_attr_dst}, {output_dist_attr_dst}};
+  return {{x_dist_attr_dst, weight_dist_attr_dst}, {out_dist_attr}};
 }
 
-std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-EmbeddingSPMDRule::InferBackward(
-    const std::vector<DistTensorSpec>& input_specs,
-    const std::vector<DistTensorSpec>& output_specs,
-    const paddle::framework::AttributeMap& attrs) {
+SpmdInfo EmbeddingInferSpmdReverse(const DistMetaTensor& x,
+                                   const DistMetaTensor& weight,
+                                   const DistMetaTensor& out,
+                                   int padding_idx,
+                                   bool sparse) {
+  // Step0: Verify input args based on embedding logic
   // InferBackward is called after InferForward, so we skip some checks.
-  auto output_specs_size = output_specs.size();
-  PADDLE_ENFORCE_EQ(
-      output_specs_size,
-      1,
-      phi::errors::InvalidArgument(
-          "The size of OutputSpec of embedding should be 1, but got [%d].",
-          output_specs_size));
-
-  auto x_shape = input_specs[0].shape();
+  auto x_shape = phi::vectorize(x.dims());
   int x_ndim = static_cast<int>(x_shape.size());
-  auto out_shape = output_specs[0].shape();
+  auto out_shape = phi::vectorize(out.dims());
   int out_ndim = static_cast<int>(out_shape.size());
 
   PADDLE_ENFORCE_EQ(x_ndim,
@@ -182,10 +170,10 @@ EmbeddingSPMDRule::InferBackward(
                         x_ndim,
                         out_ndim));
 
-  auto out_dist_attr_src = output_specs[0].dist_attr();
+  auto out_dist_attr_src = out.dist_attr();
   std::vector<int64_t> out_dims_mapping = out_dist_attr_src.dims_mapping();
 
-  // step1: build Einsum Notation
+  // Step1: Build Einsum Notation
   std::string alphabet = "abcdefghilmnopqrstuvwxyz";
   std::string x_axes = GetBroadcastAxes(out_ndim - 1, out_ndim - 1, alphabet);
   std::string weight_axes = "jk";
@@ -195,32 +183,30 @@ EmbeddingSPMDRule::InferBackward(
   // should not use input dims mapping for backward sharding merge
   auto axis_to_dim_map =
       ShardingMergeForTensors({{out_axes, out_dims_mapping}}, false);
-  TensorDistAttr x_dist_attr_dst =
-      CopyTensorDistAttrForOutput(input_specs[0].dist_attr());
-  x_dist_attr_dst.set_dims_mapping(GetDimsMappingForAxes(
+  TensorDistAttr x_dist_attr = CopyTensorDistAttrForOutput(x.dist_attr());
+  x_dist_attr.set_dims_mapping(GetDimsMappingForAxes(
       x_axes, axis_to_dim_map, /*unsharded_miss_axis=*/true));
-  TensorDistAttr weight_dist_attr_dst =
-      CopyTensorDistAttrForOutput(input_specs[1].dist_attr());
-  weight_dist_attr_dst.set_dims_mapping(GetDimsMappingForAxes(
+  TensorDistAttr weight_dist_attr =
+      CopyTensorDistAttrForOutput(weight.dist_attr());
+  weight_dist_attr.set_dims_mapping(GetDimsMappingForAxes(
       weight_axes, axis_to_dim_map, /*unsharded_miss_axis=*/true));
 
   // step3: Handle Partial
   // NOTE we skip the partial backward inference in Partial Stage-I.
   // output partial --> weight sharded on first axis.
 
-  VLOG(4) << "EmbeddingSPMDRule InferBackward: "
+  VLOG(4) << "EmbeddingInferSpmdReverse:\n"
           << "Einsum notation: [" << x_axes << "," << weight_axes << " --> "
           << out_axes << "]. " << std::endl
           << "Out shape: [" << str_join(out_shape) << "], src_dims_mapping: ["
           << str_join(out_dims_mapping) << "], dst_dims_mapping: ["
-          << str_join(out_dims_mapping) << "]; Input X dims_mapping: ["
-          << str_join(x_dist_attr_dst.dims_mapping())
-          << "], Input Weight dims_mapping:["
-          << str_join(weight_dist_attr_dst.dims_mapping()) << "].";
+          << str_join(out_dims_mapping) << "]\n Input X dims_mapping: ["
+          << str_join(x_dist_attr.dims_mapping())
+          << "]\n Input Weight dims_mapping:["
+          << str_join(weight_dist_attr.dims_mapping()) << "]\n\n";
 
-  return {{x_dist_attr_dst, weight_dist_attr_dst}, {out_dist_attr_src}};
+  return {{x_dist_attr, weight_dist_attr}, {out_dist_attr_src}};
 }
 
-}  // namespace auto_parallel
 }  // namespace distributed
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h b/paddle/phi/infermeta/spmd_rules/embedding.h
similarity index 58%
rename from paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h
rename to paddle/phi/infermeta/spmd_rules/embedding.h
index cf90a9de0e0d8..6b1d3614442bd 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h
+++ b/paddle/phi/infermeta/spmd_rules/embedding.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+#include "paddle/phi/core/distributed/type_defs.h"
 
-namespace paddle {
+namespace phi {
 namespace distributed {
-namespace auto_parallel {
 
 // (TODO) Support 3 parallel cases for embedding:
 // 1. Batch dimensions of input ids is sharded on mesh.
@@ -26,17 +26,16 @@ namespace auto_parallel {
 // change the embedding kernel for miss ids.)
 // 3. Column-wise Parallel of embedding table.
 // 4. Hybrid Parallelism of above 3 cases.
-class EmbeddingSPMDRule : public SPMDRuleBase {
- public:
-  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-  InferForward(const std::vector<DistTensorSpec>& input_specs,
-               const paddle::framework::AttributeMap& attrs) override;
-
-  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-  InferBackward(const std::vector<DistTensorSpec>& input_specs,
-                const std::vector<DistTensorSpec>& output_specs,
-                const paddle::framework::AttributeMap& attrs) override;
-};
-}  // namespace auto_parallel
+SpmdInfo EmbeddingInferSpmd(const DistMetaTensor& x,
+                            const DistMetaTensor& weight,
+                            int padding_idx,
+                            bool sparse);
+
+SpmdInfo EmbeddingInferSpmdReverse(const DistMetaTensor& x,
+                                   const DistMetaTensor& weight,
+                                   const DistMetaTensor& out,
+                                   int padding_idx,
+                                   bool sparse);
+
 }  // namespace distributed
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h
index 0d14f7da7abe9..32a45d4dd8b3c 100644
--- a/paddle/phi/infermeta/spmd_rules/rules.h
+++ b/paddle/phi/infermeta/spmd_rules/rules.h
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/phi/infermeta/spmd_rules/default_data_parallel.h"
 #include "paddle/phi/infermeta/spmd_rules/elementwise.h"
+#include "paddle/phi/infermeta/spmd_rules/embedding.h"
 #include "paddle/phi/infermeta/spmd_rules/layer_norm.h"
 #include "paddle/phi/infermeta/spmd_rules/matmul.h"
 #include "paddle/phi/infermeta/spmd_rules/reduction.h"
@@ -470,5 +471,15 @@ PD_REGISTER_SPMD_RULE(reshape,
                       PD_INFER_SPMD(phi::distributed::ReshapeInferSpmd),
                       PD_INFER_SPMD(phi::distributed::ReshapeInferSpmdReverse));
 
+// embedding rule
+PD_REGISTER_SPMD_RULE(
+    embedding,
+    PD_INFER_SPMD(phi::distributed::EmbeddingInferSpmd),
+    PD_INFER_SPMD(phi::distributed::EmbeddingInferSpmdReverse));
+PD_REGISTER_SPMD_RULE(
+    lookup_table_v2,
+    PD_INFER_SPMD(phi::distributed::EmbeddingInferSpmd),
+    PD_INFER_SPMD(phi::distributed::EmbeddingInferSpmdReverse));
+
 }  // namespace distributed
 }  // namespace phi
diff --git a/test/auto_parallel/spmd_rules/test_embedding_rule.py b/test/auto_parallel/spmd_rules/test_embedding_rule.py
index 747b0b7c19e8b..d4e4ca551487a 100644
--- a/test/auto_parallel/spmd_rules/test_embedding_rule.py
+++ b/test/auto_parallel/spmd_rules/test_embedding_rule.py
@@ -13,18 +13,19 @@
 # limitations under the License.
 
 import unittest
+from collections import OrderedDict
 
-from paddle.distributed.auto_parallel.static.completion import get_spmd_rule
 from paddle.distributed.auto_parallel.static.dist_attribute import (
     DistTensorSpec,
     TensorDistAttr,
 )
 from paddle.distributed.fleet import auto
+from paddle.framework import core
 
 
 class TestEmbeddingSPMDRule(unittest.TestCase):
     def setUp(self):
-        self.rule1 = get_spmd_rule("lookup_table_v2")
+        self.rule1 = core.get_phi_spmd_rule("lookup_table_v2")
 
     def test_embedding_infer_forward(self):
         # forward setup
@@ -42,16 +43,16 @@ def test_embedding_infer_forward(self):
             table_shape, table_tensor_dist_attr
         )
 
-        self.attrs = {
-            'padding_idx': -1,
-            'sparse': False,
-        }
+        self.attrs = OrderedDict([('padding_idx', -1), ('sparse', False)])
 
         # data parallel
         self.x_dist_tensor_spec.set_dims_mapping([1, -1])
         self.table_dist_tensor_spec.set_dims_mapping([-1, -1])
         result_dist_attrs = self.rule1.infer_forward(
-            [self.x_dist_tensor_spec, self.table_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.table_dist_tensor_spec,
+            self.attrs['padding_idx'],
+            self.attrs['sparse'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -68,7 +69,10 @@ def test_embedding_infer_forward(self):
         self.x_dist_tensor_spec.set_dims_mapping([1, -1])
         self.table_dist_tensor_spec.set_dims_mapping([-1, 0])
         result_dist_attrs = self.rule1.infer_forward(
-            [self.x_dist_tensor_spec, self.table_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.table_dist_tensor_spec,
+            self.attrs['padding_idx'],
+            self.attrs['sparse'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -81,7 +85,10 @@ def test_embedding_infer_forward(self):
         self.x_dist_tensor_spec.set_dims_mapping([1, -1])
         self.table_dist_tensor_spec.set_dims_mapping([0, -1])
         result_dist_attrs = self.rule1.infer_forward(
-            [self.x_dist_tensor_spec, self.table_dist_tensor_spec], self.attrs
+            self.x_dist_tensor_spec,
+            self.table_dist_tensor_spec,
+            self.attrs['padding_idx'],
+            self.attrs['sparse'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -109,8 +116,10 @@ def test_embedding_infer_forward(self):
         self.attrs['sparse'] = True
         with self.assertRaises(ValueError):
             result_dist_attrs = self.rule1.infer_forward(
-                [self.x_dist_tensor_spec, self.table_dist_tensor_spec],
-                self.attrs,
+                self.x_dist_tensor_spec,
+                self.table_dist_tensor_spec,
+                self.attrs['padding_idx'],
+                self.attrs['sparse'],
             )
 
     def test_embedding_infer_backward(self):
@@ -141,17 +150,16 @@ def test_embedding_infer_backward(self):
             out_shape, out_tensor_dist_attr
         )
 
-        self.attrs = {
-            'padding_idx': -1,
-            'sparse': False,
-        }
+        self.attrs = OrderedDict([('padding_idx', -1), ('sparse', False)])
 
         # data parallel
         self.out_dist_tensor_spec.set_dims_mapping([1, -1, -1])
         result_dist_attrs = self.rule1.infer_backward(
-            [self.x_dist_tensor_spec, self.table_dist_tensor_spec],
-            [self.out_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.table_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['padding_idx'],
+            self.attrs['sparse'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -167,9 +175,11 @@ def test_embedding_infer_backward(self):
         # table col-wise parallel & dp
         self.out_dist_tensor_spec.set_dims_mapping([-1, 0, 1])
         result_dist_attrs = self.rule1.infer_backward(
-            [self.x_dist_tensor_spec, self.table_dist_tensor_spec],
-            [self.out_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.table_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['padding_idx'],
+            self.attrs['sparse'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]
@@ -182,9 +192,11 @@ def test_embedding_infer_backward(self):
         self.out_dist_tensor_spec.set_dims_mapping([1, 0, -1])
 
         result_dist_attrs = self.rule1.infer_backward(
-            [self.x_dist_tensor_spec, self.table_dist_tensor_spec],
-            [self.out_dist_tensor_spec],
-            self.attrs,
+            self.x_dist_tensor_spec,
+            self.table_dist_tensor_spec,
+            self.out_dist_tensor_spec,
+            self.attrs['padding_idx'],
+            self.attrs['sparse'],
         )
         infered_input_dist_attrs = result_dist_attrs[0]
         infered_output_dist_attrs = result_dist_attrs[1]

From 16a45d79585c97821b3ffe4d00c56b0dd5bb7dcf Mon Sep 17 00:00:00 2001
From: niuliling123 <51102941+niuliling123@users.noreply.github.com>
Date: Mon, 25 Sep 2023 21:08:08 +0800
Subject: [PATCH 112/115] Add IsMean template parameter for compile (#57558)

---
 .../elementwise/elementwise_op_function.h     |  2 +-
 .../fluid/operators/fused/attn_bias_add.cu.h  |  2 +-
 .../fluid/operators/reduce_ops/reduce_op.cu.h | 53 --------------
 paddle/phi/kernels/funcs/reduce_function.h    | 73 +++++++++++++------
 paddle/phi/kernels/fusion/gpu/attn_gemm.h     | 22 ++----
 paddle/phi/kernels/gpu/mean_all_kernel.cu     | 13 ++--
 paddle/phi/kernels/gpu/reduce.h               | 30 +++-----
 .../phi/kernels/gpu/reduce_amin_amax_common.h |  1 -
 .../phi/kernels/gpu/squared_l2_norm_kernel.cu |  2 +-
 paddle/phi/kernels/kps/reduce_kernel.cu       |  4 +-
 10 files changed, 76 insertions(+), 126 deletions(-)
 delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_op.cu.h

diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index c69acb89750c9..4894dff4b971c 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -41,9 +41,9 @@ limitations under the License. */
 #include <thrust/iterator/iterator_adaptor.h>
 
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/gpu/elementwise_grad.h"
 
 #endif
diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h
index 53001b2493084..8ea1e11cd29f4 100644
--- a/paddle/fluid/operators/fused/attn_bias_add.cu.h
+++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h
@@ -33,8 +33,8 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/elementwise/elementwise_functor.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/phi/kernels/funcs/fast_divmod.h"
+#include "paddle/phi/kernels/funcs/reduce_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
deleted file mode 100644
index 21646d08db396..0000000000000
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <cmath>
-#include <numeric>
-#include <set>
-#include <vector>
-
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/funcs/reduce_function.h"
-namespace paddle {
-namespace operators {
-
-template <typename Tx,
-          typename Ty,
-          template <typename>
-          class ReduceBaseOp,
-          typename TransformOp>
-void TensorReduceImpl(const phi::GPUContext& dev_ctx,
-                      const phi::DenseTensor& x,
-                      phi::DenseTensor* y,
-                      const TransformOp& transform,
-                      const std::vector<int>& origin_reduce_dims,
-                      gpuStream_t stream,
-                      bool is_mean = false) {
-  y->mutable_data<Ty>(x.place());
-
-  phi::funcs::ReduceKernel<Tx, Ty, ReduceBaseOp, TransformOp>(
-      static_cast<const phi::GPUContext&>(dev_ctx),
-      x,
-      y,
-      transform,
-      origin_reduce_dims,
-      is_mean);
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h
index 16e46fc201ea8..a755cc83bba65 100644
--- a/paddle/phi/kernels/funcs/reduce_function.h
+++ b/paddle/phi/kernels/funcs/reduce_function.h
@@ -925,6 +925,7 @@ static void LaunchReduceKernel(const Tx* x_data,
 }
 
 #if !defined(PADDLE_WITH_XPU_KP)
+
 template <typename Tx,
           typename Ty,
           template <typename>
@@ -983,7 +984,6 @@ CubTensorReduceImpl(const Tx* x_data,
   PADDLE_THROW(phi::errors::InvalidArgument(
       "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
 }
-
 template <typename Tx,
           typename Ty,
           template <typename>
@@ -1002,17 +1002,53 @@ CubTensorReduceImpl(const Tx* x_data,
 }
 #endif  // PADDLE_WITH_XPU_KP
 
+template <typename Tx,
+          typename Ty,
+          template <typename>
+          class ReduceOp,
+          typename TransformOp,
+          bool IsMean = false>
+struct CubTensorReduce {
+  static void apply(const Tx* x_data,
+                    Ty* y_data,
+                    const TransformOp& transform,
+                    int reduce_num,
+                    const KPDevice& dev_ctx,
+                    KPStream stream) {
+    CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
+        x_data, y_data, transform, reduce_num, dev_ctx, stream);
+  }
+};
+
 template <typename Tx,
           typename Ty,
           template <typename>
           class ReduceOp,
           typename TransformOp>
+struct CubTensorReduce<Tx, Ty, ReduceOp, TransformOp, true> {
+  static void apply(const Tx* x_data,
+                    Ty* y_data,
+                    const TransformOp& transform,
+                    int reduce_num,
+                    const KPDevice& dev_ctx,
+                    KPStream stream) {
+    using Div = kps::DivideFunctor<Tx>;
+    CubTensorReduceImpl<Tx, Ty, ReduceOp, Div>(
+        x_data, y_data, Div(reduce_num), reduce_num, dev_ctx, stream);
+  }
+};
+
+template <typename Tx,
+          typename Ty,
+          template <typename>
+          class ReduceOp,
+          typename TransformOp,
+          bool IsMean = false>
 void ReduceKernel(const KPDevice& dev_ctx,
                   const phi::DenseTensor& x,
                   phi::DenseTensor* y,
                   const TransformOp& transform,
-                  const std::vector<int>& origin_reduce_dims,
-                  bool is_mean = false) {
+                  const std::vector<int>& origin_reduce_dims) {
   PADDLE_ENFORCE_GT(
       x.numel(),
       0,
@@ -1061,18 +1097,8 @@ void ReduceKernel(const KPDevice& dev_ctx,
   bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16;
 #ifndef PADDLE_WITH_XPU_KP
   if (use_cub_reduce) {
-    if (is_mean) {
-      using Div = kps::DivideFunctor<Tx>;
-      CubTensorReduceImpl<Tx, Ty, ReduceOp, Div>(x_data,
-                                                 y_data,
-                                                 Div(config.reduce_num),
-                                                 config.reduce_num,
-                                                 dev_ctx,
-                                                 stream);
-    } else {
-      CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
-          x_data, y_data, transform, config.reduce_num, dev_ctx, stream);
-    }
+    CubTensorReduce<Tx, Ty, ReduceOp, TransformOp, IsMean>::apply(
+        x_data, y_data, transform, config.reduce_num, dev_ctx, stream);
     return;
   }
 #endif
@@ -1115,7 +1141,7 @@ void ReduceKernel(const KPDevice& dev_ctx,
             config.blocking_size,
             dim,
             config.reduce_num,
-            is_mean && (!config.should_reduce_again),
+            IsMean && (!config.should_reduce_again),
             config.tmp_data,
             config.should_reduce_again);
 
@@ -1149,7 +1175,7 @@ void ReduceKernel(const KPDevice& dev_ctx,
               config.grid.y,
               dim2,
               config.reduce_num,
-              is_mean,
+              IsMean,
               config.tmp_data,
               false);
     }
@@ -1167,29 +1193,28 @@ void ReduceKernel(const KPDevice& dev_ctx,
       reducer.initial(),
       stream,
       config,
-      is_mean);
+      IsMean);
 }
 
 template <typename Tx,
           typename Ty,
           template <typename>
           class ReduceOp,
-          typename TransformOp>
+          typename TransformOp,
+          bool IsMean = false>
 void TensorReduceImpl(const phi::GPUContext& dev_ctx,
                       const phi::DenseTensor& x,
                       phi::DenseTensor* y,
                       const TransformOp& transform,
                       const std::vector<int>& origin_reduce_dims,
-                      gpuStream_t stream,
-                      bool is_mean = false) {
+                      gpuStream_t stream) {
   dev_ctx.template Alloc<Ty>(y);
-  ReduceKernel<Tx, Ty, ReduceOp, TransformOp>(
+  ReduceKernel<Tx, Ty, ReduceOp, TransformOp, IsMean>(
       static_cast<const phi::GPUContext&>(dev_ctx),
       x,
       y,
       transform,
-      origin_reduce_dims,
-      is_mean);
+      origin_reduce_dims);
 }
 
 #endif
diff --git a/paddle/phi/kernels/fusion/gpu/attn_gemm.h b/paddle/phi/kernels/fusion/gpu/attn_gemm.h
index 8b83ddab93b9b..27d972bc1d740 100644
--- a/paddle/phi/kernels/fusion/gpu/attn_gemm.h
+++ b/paddle/phi/kernels/fusion/gpu/attn_gemm.h
@@ -28,6 +28,7 @@
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/primitive/functor_primitives.h"
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
+#include "paddle/phi/kernels/reduce_sum_kernel.h"
 
 namespace phi {
 namespace fusion {
@@ -259,23 +260,12 @@ class AttnMatMul {
 
       gpuStream_t stream = dev_ctx_.stream();
       if (support_case_1 || support_case_2) {
-        phi::funcs::
-            TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-                dev_ctx_,
-                *d_output,
-                d_bias,
-                kps::IdentityFunctor<T>(),
-                {0, 1},
-                stream);
+        phi::SumKernel<T, phi::GPUContext>(
+            dev_ctx_, *d_output, {0, 1}, d_output->dtype(), false, d_bias);
+
       } else if (support_case_3 || support_case_4) {
-        phi::funcs::
-            TensorReduceImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-                dev_ctx_,
-                *d_output,
-                d_bias,
-                kps::IdentityFunctor<T>(),
-                {0, 1, 2},
-                stream);
+        phi::SumKernel<T, phi::GPUContext>(
+            dev_ctx_, *d_output, {0, 1, 2}, d_output->dtype(), false, d_bias);
       } else {
         PADDLE_THROW(phi::errors::InvalidArgument(
             "Only support reduce when the input dims are [0,1,2,3,4] and "
diff --git a/paddle/phi/kernels/gpu/mean_all_kernel.cu b/paddle/phi/kernels/gpu/mean_all_kernel.cu
index 4f85a89047aed..82405d964737c 100644
--- a/paddle/phi/kernels/gpu/mean_all_kernel.cu
+++ b/paddle/phi/kernels/gpu/mean_all_kernel.cu
@@ -43,13 +43,12 @@ void MeanAllKernel(const Context& dev_ctx,
   for (decltype(rank) i = 0; i < rank; ++i) {
     reduce_dims.push_back(i);
   }
-  funcs::ReduceKernel<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-      dev_ctx,
-      x,
-      out,
-      kps::IdentityFunctor<T>(),
-      reduce_dims,
-      /*is_mean=*/true);
+  funcs::ReduceKernel<T,
+                      T,
+                      kps::AddFunctor,
+                      kps::IdentityFunctor<T>,
+                      /*is_mean*/ true>(
+      dev_ctx, x, out, kps::IdentityFunctor<T>(), reduce_dims);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h
index cc3cad38f46fb..79c7381edab19 100644
--- a/paddle/phi/kernels/gpu/reduce.h
+++ b/paddle/phi/kernels/gpu/reduce.h
@@ -27,15 +27,15 @@ template <typename T,
           template <typename>
           class ReduceOp,
           template <typename, typename>
-          class TransformOp>
+          class TransformOp,
+          bool IsMean = false>
 void Reduce(const KPDevice& dev_ctx,
             const DenseTensor& x,
             bool reduce_all,
             const std::vector<int64_t>& dims,
             bool keep_dim,
             DataType out_dtype,
-            DenseTensor* out,
-            bool is_mean = false) {
+            DenseTensor* out) {
   reduce_all = recompute_reduce_all(x, dims, reduce_all);
   std::vector<int> reduce_dims =
       phi::funcs::details::GetReduceDim(dims, x.dims().size(), reduce_all);
@@ -59,33 +59,23 @@ void Reduce(const KPDevice& dev_ctx,
           phi::funcs::ReduceKernel<data_t,
                                    data_t,
                                    ReduceOp,
-                                   TransformOp<data_t, MPType>>(
+                                   TransformOp<data_t, MPType>,
+                                   IsMean>(
               dev_ctx,
               tmp_tensor,
               out,
               TransformOp<data_t, MPType>(reduce_num),
-              reduce_dims,
-              is_mean);
+              reduce_dims);
         }));
   } else {
     using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-    phi::funcs::ReduceKernel<T, T, ReduceOp, TransformOp<T, MPType>>(
-        dev_ctx,
-        x,
-        out,
-        TransformOp<T, MPType>(reduce_num),
-        reduce_dims,
-        is_mean);
+    phi::funcs::ReduceKernel<T, T, ReduceOp, TransformOp<T, MPType>, IsMean>(
+        dev_ctx, x, out, TransformOp<T, MPType>(reduce_num), reduce_dims);
   }
 #else
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
-  phi::funcs::ReduceKernel<T, T, ReduceOp, TransformOp<T, MPType>>(
-      dev_ctx,
-      x,
-      out,
-      TransformOp<T, MPType>(reduce_num),
-      reduce_dims,
-      is_mean);
+  phi::funcs::ReduceKernel<T, T, ReduceOp, TransformOp<T, MPType>, IsMean>(
+      dev_ctx, x, out, TransformOp<T, MPType>(reduce_num), reduce_dims);
 #endif
 }
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h
index 8964c2547886b..b04267030b284 100644
--- a/paddle/phi/kernels/gpu/reduce_amin_amax_common.h
+++ b/paddle/phi/kernels/gpu/reduce_amin_amax_common.h
@@ -90,7 +90,6 @@ void ReduceCudaAMaxAMinGrad(const Context& dev_ctx,
                              equal_out_tensor.dtype(),
                              false,
                              equal_count);
-
   // 3. dx = dout * 1
   phi::MultiplyKernel<T, Context>(
       dev_ctx, new_dout, equal_out_tensor, &equal_out_tensor);
diff --git a/paddle/phi/kernels/gpu/squared_l2_norm_kernel.cu b/paddle/phi/kernels/gpu/squared_l2_norm_kernel.cu
index 38f84bd5d0d9d..7f8e985695818 100644
--- a/paddle/phi/kernels/gpu/squared_l2_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/squared_l2_norm_kernel.cu
@@ -30,7 +30,7 @@ void SquaredL2NormKernel(const Context& dev_ctx,
     origin_reduce_dims.push_back(i);
   }
   phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::SquareFunctor<T, T>>(
-      dev_ctx, x, out, kps::SquareFunctor<T, T>(), origin_reduce_dims, false);
+      dev_ctx, x, out, kps::SquareFunctor<T, T>(), origin_reduce_dims);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/kps/reduce_kernel.cu b/paddle/phi/kernels/kps/reduce_kernel.cu
index d5d0fd9de3b28..3440b53c68b48 100644
--- a/paddle/phi/kernels/kps/reduce_kernel.cu
+++ b/paddle/phi/kernels/kps/reduce_kernel.cu
@@ -116,8 +116,8 @@ void MeanRawKernel(const Context& dev_ctx,
                    DenseTensor* out) {
   reduce_all = recompute_reduce_all(x, dims, reduce_all);
   auto out_dtype = x.dtype();
-  phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
-      dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out, true);
+  phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor, true>(
+      dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out);
 }
 
 template <typename T, typename Context>

From 09321420ce60c13f4e1e05b35b23b0f6dfddfcb5 Mon Sep 17 00:00:00 2001
From: Charles-hit <56987902+Charles-hit@users.noreply.github.com>
Date: Mon, 25 Sep 2023 21:22:41 +0800
Subject: [PATCH 113/115] [PRIM][PIR]Support optional and more vjp gen (#57693)

* Support optional input and output for pir api

* Polish gen code

* Fix py3 ci compile error

* Fix code

* Fix error

* Fix windows ci error

* support optional in vjp and add more ops vjp gen

---------

Co-authored-by: 0x45f <wangzhen45@baidu.com>
---
 paddle/fluid/operators/generator/filters.py   |   5 +-
 .../fluid/operators/generator/type_mapping.py |   5 +
 paddle/fluid/primitive/codegen/gen.py         | 108 +++++++++++++++++-
 .../backend/generated/generated_backend.h.j2  |  10 +-
 .../generated/generated_eager_backend.cc.j2   |  12 +-
 .../generated/generated_static_backend.cc.j2  |  64 +++++++++--
 .../primitive/codegen/templates/common.j2     |   8 +-
 .../templates/primitive/primitive.h.j2        |   8 +-
 8 files changed, 195 insertions(+), 25 deletions(-)

diff --git a/paddle/fluid/operators/generator/filters.py b/paddle/fluid/operators/generator/filters.py
index 23f6540a41269..9f2bab388a558 100644
--- a/paddle/fluid/operators/generator/filters.py
+++ b/paddle/fluid/operators/generator/filters.py
@@ -24,6 +24,7 @@
     input_types_map,
     opmaker_attr_types_map,
     optional_input_types_map,
+    optional_output_type_map,
     output_type_map,
     phi_attr_types_map,
     sr_output_types_map,
@@ -154,7 +155,9 @@ def delete_last_underline(op_name):
 
 
 # ------------------------------ output  ----------------------------------
-def to_paddle_output_type(s):
+def to_paddle_output_type(s, optional=False):
+    if optional:
+        return optional_output_type_map[s]
     return output_type_map[s]
 
 
diff --git a/paddle/fluid/operators/generator/type_mapping.py b/paddle/fluid/operators/generator/type_mapping.py
index a2e1b8efb2565..8d3a4933c3bd0 100644
--- a/paddle/fluid/operators/generator/type_mapping.py
+++ b/paddle/fluid/operators/generator/type_mapping.py
@@ -85,6 +85,11 @@
     'SelectedRows': 'SelectedRows',
 }
 
+optional_output_type_map = {
+    'Tensor': 'const paddle::optional<Tensor>&',
+    'Tensor[]': 'const paddle::optional<std::vector<Tensor>>&',
+}
+
 # ------------------------------ phi attr ------------------------------
 phi_attr_types_map = attr_types_map.copy()
 phi_attr_types_map.update(
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index 0d1f444df2bef..0865e01cdca17 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -78,8 +78,60 @@
     'unsqueeze_grad',
     'poisson_grad',
     'gumbel_softmax_grad',
-    'squeeze_grad',
-    'unsqueeze_grad',
+    'conv2d_grad',
+    'depthwise_conv2d_grad',
+    'sqrt_grad',
+    'flatten_grad',
+    'relu_grad',
+    'abs_grad',
+    'log_grad',
+    'clip_grad',
+    'ceil_grad',
+    'frobenius_norm_grad',
+    'p_norm_grad',
+    'maximum_grad',
+    'argsort_grad',
+    'min_grad',
+    'batch_norm_grad',
+    'max_pool2d_with_index_grad',
+    'pool2d_grad',
+    'minimum_grad',
+    'prod_grad',
+    'round_grad',
+    'sin_grad',
+    'cos_grad',
+    'dot_grad',
+    'floor_grad',
+    'topk_grad',
+    'square_grad',
+    'gather_grad',
+    'label_smooth_grad',
+    'cross_entropy_with_softmax_grad',
+    'mean_all_grad',
+    'cumsum_grad',
+    'linear_interp_grad',
+    'bilinear_interp_grad',
+    'trilinear_interp_grad',
+    'nearest_interp_grad',
+    'bicubic_interp_grad',
+    'assign_grad',
+    'assign_out__grad',
+    'real_grad',
+    'flip_grad',
+    'softmax_grad',
+    'expand_grad',
+    'conv2d_transpose_grad',
+    'depthwise_conv2d_transpose_grad',
+    'sigmoid_grad',
+    'pad_grad',
+    'pad3d_grad',
+    'einsum_grad',
+    'leaky_relu_grad',
+    'log10_grad',
+    'conv3d_grad',
+    'solve_grad',
+    'diag_grad',
+    'trace_grad',
 ]
 
 
@@ -183,6 +235,58 @@
     'stack_grad',
     'squeeze_grad',
     'unsqueeze_grad',
+    'conv2d_grad',
+    'depthwise_conv2d_grad',
+    'sqrt_grad',
+    'flatten_grad',
+    'relu_grad',
+    'abs_grad',
+    'log_grad',
+    'clip_grad',
+    'ceil_grad',
+    'frobenius_norm_grad',
+    'p_norm_grad',
+    'maximum_grad',
+    'argsort_grad',
+    'min_grad',
+    'batch_norm_grad',
+    'max_pool2d_with_index_grad',
+    'pool2d_grad',
+    'minimum_grad',
+    'prod_grad',
+    'round_grad',
+    'sin_grad',
+    'cos_grad',
+    'dot_grad',
+    'floor_grad',
+    'topk_grad',
+    'square_grad',
+    'gather_grad',
+    'label_smooth_grad',
+    'cross_entropy_with_softmax_grad',
+    'mean_all_grad',
+    'cumsum_grad',
+    'linear_interp_grad',
+    'bilinear_interp_grad',
+    'trilinear_interp_grad',
+    'nearest_interp_grad',
+    'bicubic_interp_grad',
+    'assign_out__grad',
+    'real_grad',
+    'softmax_grad',
+    'conv2d_transpose_grad',
+    'depthwise_conv2d_transpose_grad',
+    'sigmoid_grad',
+    'pad_grad',
+    'pad3d_grad',
+    'einsum_grad',
+    'leaky_relu_grad',
+    'log10_grad',
+    'conv3d_grad',
+    'solve_grad',
+    'diag_grad',
+    'trace_grad',
+    'flip',
 ]
 
 
diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2
index 663467af25a97..25443f52fe8af 100644
--- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2
+++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2
@@ -21,11 +21,17 @@ using DataType = phi::DataType;
 
 {% for api in apis %}
   {%- if api.name in backend_white_list -%}
+  {% set inplace_map = {} %}
+  {% if 'inplace' in api and api.inplace != None %}
+    {% for source, target in api.inplace.items() %}
+      {% do inplace_map.update({source: target}) %}
+    {% endfor %}
+  {% endif %}
   {% if api.attrs is exist_mutable_attribute %}
-{{common.sig(api.name, api.inputs, api.outputs|trip_intermediate , api.attrs, True, True)}};
+{{common.sig(api.name, api.inputs, api.outputs|trip_intermediate , api.attrs, inplace_map, True, True)}};
 
   {% endif %}
-{{common.sig(api.name, api.inputs, api.outputs|trip_intermediate , api.attrs, False, True)}};
+{{common.sig(api.name, api.inputs, api.outputs|trip_intermediate , api.attrs, inplace_map, False, True)}};
 
   {% endif %}
 {% endfor %}
diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2
index 4c2f02224e2f7..34e427f0c2e03 100644
--- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2
+++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_eager_backend.cc.j2
@@ -16,9 +16,9 @@ namespace backend {
   {{common.sequence('', '', ', ', attrs)}}
 {%- endmacro -%}
 
-{%- macro sig(name, inputs, attrs, outputs) -%}
+{%- macro sig(name, inputs, attrs, outputs, inplace_map) -%}
 template <>
-{{common.ret(outputs)}} {{name}}<Tensor>({{common.params(inputs, attrs, False)}})
+{{common.ret(outputs, inplace_map)}} {{name}}<Tensor>({{common.params(inputs, attrs, False)}})
 {%- endmacro -%}
 
 {% macro body(name, inputs, attrs, outputs) %}
@@ -35,7 +35,13 @@ return ::{{name}}_ad_func({{common.args(input_names, attr_names)}});
 
 {% for api in apis %}
   {%- if api.is_prim and api.name in backend_white_list -%}
-{{sig(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} {
+  {% set inplace_map = {} %}
+  {% if 'inplace' in api and api.inplace != None %}
+    {% for source, target in api.inplace.items() %}
+      {% do inplace_map.update({source: target}) %}
+    {% endfor %}
+  {% endif %}
+{{sig(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate, inplace_map)}} {
 {{body(api.name, api.inputs, api.attrs, api.outputs | trip_intermediate)}} 
 }
 
diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2
index e02dea46feda7..152cd241ad833 100644
--- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2
+++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2
@@ -12,9 +12,9 @@ namespace backend {
 
 using LazyTensor = paddle::primitive::LazyTensor;
 
-{%- macro sig(name, inputs, outputs, attrs, mutable_attribute_as_inputs=False) -%}
+{%- macro sig(name, inputs, outputs, attrs, inplace_map, mutable_attribute_as_inputs=False) -%}
 template <>
-{{common.ret(outputs)}} {{name}}<LazyTensor>({{common.params(inputs, attrs, mutable_attribute_as_inputs, False)}})
+{{common.ret(outputs, inplace_map)}} {{name}}<LazyTensor>({{common.params(inputs, attrs, mutable_attribute_as_inputs, False)}})
 {%- endmacro -%}
 
 {%- macro prepare_ir_api_inputs(inputs)-%}
@@ -25,18 +25,22 @@ std::transform({{input.name}}.begin(), {{input.name}}.end(), {{input.name}}_res.
   return std::static_pointer_cast<LazyTensor>(t.impl())->value();
 });
     {% elif input.typename=='Tensor[]' and input.optional %}
-std::vector<pir::Value> {{input.name}}_res({{input.name}}.size());
+paddle::optional<std::vector<pir::Value>> {{input.name}}_res;
 if({{input.name}}) {
-  std::transform({{input.name}}.get().begin(), {{input.name}}.get().end(), {{input.name}}_res.begin(), [](const Tensor& t) {
+  std::vector<pir::Value> {{input.name}}_res_inner({{input.name}}.get().size());
+  std::transform({{input.name}}.get().begin(), {{input.name}}.get().end(), {{input.name}}_res_inner.begin(), [](const Tensor& t) {
     return std::static_pointer_cast<LazyTensor>(t.impl())->value();
   });
+  {{input.name}}_res = paddle::make_optional<std::vector<pir::Value>>({{input.name}}_res_inner);
 }
     {% elif input.typename=='Tensor' and not input.optional %}
 pir::Value {{input.name}}_res = std::static_pointer_cast<LazyTensor>({{input.name}}.impl())->value();
     {% else %}
-pir::Value {{input.name}}_res;
+paddle::optional<pir::Value> {{input.name}}_res;
 if({{input.name}}) {
-  {{input.name}}_res = std::static_pointer_cast<LazyTensor>({{input.name}}.get().impl())->value();
+  pir::Value {{input.name}}_res_inner;
+  {{input.name}}_res_inner = std::static_pointer_cast<LazyTensor>({{input.name}}.get().impl())->value();
+  {{input.name}}_res = paddle::make_optional<pir::Value>({{input.name}}_res_inner);
 }
     {% endif %}
   {% endfor %}
@@ -44,27 +48,57 @@ if({{input.name}}) {
 
 {%- macro get_static_backend_outputs(outputs)-%}
   {%- if outputs|length == 1 -%}
-    {%- if outputs[0].typename == 'Tensor' -%}
+    {%- if outputs[0].typename == 'Tensor' and not outputs[0].optional-%}
 Tensor {{outputs[0].name}}(std::make_shared<LazyTensor>(op_res));
 return {{outputs[0].name}};
-    {%- elif outputs[0].typename == 'Tensor[]' -%}
+    {%- elif outputs[0].typename == 'Tensor' and outputs[0].optional -%}
+paddle::optional<Tensor> {{outputs[0].name}};
+if(op_res){
+  {{outputs[0].name}} = paddle::make_optional<Tensor>(Tensor(std::make_shared<LazyTensor>(op_res.get()));
+}
+return {{outputs[0].name}};
+    {%- elif outputs[0].typename == 'Tensor[]' and not outputs[0].optional -%}
 std::vector<Tensor> {{outputs[0].name}}(op_res.size());
 std::transform(op_res.begin(), op_res.end(), {{outputs[0].name}}.begin(), [](const pir::OpResult& res) {
 return Tensor(std::make_shared<LazyTensor>(res));
   });
+return {{outputs[0].name}};
+    {%- elif outputs[0].typename == 'Tensor[]' and outputs[0].optional -%}
+paddle::optional<std::vector<Tensor>> {{outputs[0].name}};
+if({{op_res}}) {
+  std::vector<pir::Value> {{outputs[0].name}}_inner(op_res.get().size());
+  std::transform(op_res.get().begin(), op_res.get().end(), {{outputs[0].name}}_inner.begin(), [](const pir::OpResult& res) {
+    return Tensor(std::make_shared<LazyTensor>(res));
+  });
+  {{outputs[0].name}} = paddle::make_optional<std::vector<Tensor>>({{outputs[0].name}}_inner);
+}
 return {{outputs[0].name}};
     {%- else -%} {#- render nothing -#}
     {%- endif -%}
   {%- elif outputs|length > 1 -%}
     {%- for i in range(outputs|length) %}
 auto op_res_{{i}} = std::get<{{i}}>(op_res);
-      {% if outputs[i].typename == 'Tensor' %}
+      {% if outputs[i].typename == 'Tensor' and not outputs[i].optional %}
 Tensor {{outputs[i].name}}(std::make_shared<LazyTensor>(op_res_{{i}}));
-      {% elif outputs[i].typename == 'Tensor[]' %}
+      {% elif outputs[i].typename == 'Tensor' and  outputs[i].optional %}
+paddle::optional<Tensor> {{outputs[i].name}};
+if(op_res_{{i}}){
+  {{outputs[i].name}} = paddle::make_optional<Tensor>(Tensor(std::make_shared<LazyTensor>(op_res_{{i}}.get())));
+}
+      {% elif outputs[i].typename == 'Tensor[]' and not outputs[i].optional %}
 std::vector<Tensor> {{outputs[i].name}}(op_res_{{i}}.size());
 std::transform(op_res_{{i}}.begin(), op_res_{{i}}.end(), {{outputs[i].name}}.begin(), [](const pir::OpResult& res) {
 return Tensor(std::make_shared<LazyTensor>(res));
   });
+      {% elif outputs[i].typename == 'Tensor[]' and outputs[i].optional %}
+paddle::optional<std::vector<Tensor>> {{outputs[i].name}};
+if(op_res_{{i}}){
+  std::vector<Tensor> {{outputs[i].name}}_inner(op_res_{{i}}.get().size());
+  std::transform(op_res_{{i}}.get().begin(), op_res_{{i}}.get().end(), {{outputs[i].name}}_inner.begin(), [](const pir::OpResult& res) {
+    return Tensor(std::make_shared<LazyTensor>(res));
+  });
+  {{outputs[i].name}} = paddle::make_optional<std::vector<Tensor>>({{outputs[i].name}}_inner);
+}
       {% else %} {#- render nothing -#}
       {% endif %}
     {% endfor -%}
@@ -107,14 +141,20 @@ auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}}
 {% for api in apis %}
 {% if api.name in backend_white_list %}
   {% set api_outputs = api.outputs | trip_intermediate %}
-{{sig(api.name, api.inputs, api_outputs, api.attrs)}} {
+  {% set inplace_map = {} %}
+  {% if 'inplace' in api and api.inplace != None %}
+    {% for source, target in api.inplace.items() %}
+      {% do inplace_map.update({source: target}) %}
+    {% endfor %}
+  {% endif %}
+{{sig(api.name, api.inputs, api_outputs, api.attrs, inplace_map)}} {
   {% filter indent(2, True) %}
 {{body(api.name, api.inputs, api_outputs, api.attrs)}} 
   {% endfilter %}
 }
 
   {% if api.attrs is exist_mutable_attribute %}
-{{sig(api.name, api.inputs, api_outputs, api.attrs, True)}} {
+{{sig(api.name, api.inputs, api_outputs, api.attrs, inplace_map, True)}} {
   {% filter indent(2, True) %}
 {{body(api.name, api.inputs, api_outputs, api.attrs, True)}} 
   {% endfilter %}
diff --git a/paddle/fluid/primitive/codegen/templates/common.j2 b/paddle/fluid/primitive/codegen/templates/common.j2
index fd49c76d88db0..6ac639e8ceeae 100644
--- a/paddle/fluid/primitive/codegen/templates/common.j2
+++ b/paddle/fluid/primitive/codegen/templates/common.j2
@@ -1,6 +1,6 @@
-{%- macro sig(name, inputs, outputs, attrs, mutable_attribute_as_inputs=False, default=False) -%}
+{%- macro sig(name, inputs, outputs, attrs, inplace_map, mutable_attribute_as_inputs=False, default=False) -%}
 template <typename T>
-{{ret(outputs)}} {{name}}({{params(inputs, attrs, mutable_attribute_as_inputs, default)}})
+{{ret(outputs, inplace_map)}} {{name}}({{params(inputs, attrs, mutable_attribute_as_inputs, default)}})
 {%- endmacro %}
 
 
@@ -40,9 +40,9 @@ template <typename T>
 {%- endmacro -%}
 
 
-{%- macro ret(outputs) -%}
+{%- macro ret(outputs, inplace_map) -%}
   {%- set names = [] -%}
-  {%- for i in outputs -%} {%- do names.append(i.typename|to_paddle_output_type) -%} {%- endfor -%}
+  {%- for i in outputs -%} {%- do names.append(i.typename|to_paddle_output_type(i.name in inplace_map and i.optional)) -%} {%- endfor -%}
   {%- if names|length > 1 -%} 
 std::tuple<{{sequence('', '', ', ', names)}}>
   {%- else -%}
diff --git a/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2 b/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2
index 34ee37c5898e6..5cf6807470f2b 100644
--- a/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2
+++ b/paddle/fluid/primitive/codegen/templates/primitive/primitive.h.j2
@@ -18,7 +18,13 @@ using IntArray = paddle::experimental::IntArray;
   {%- for i in api.inputs -%} {%- do input_names.append(i.name) -%} {%- endfor -%}
   {%- set attr_names = [] -%}
   {%- for i in api.attrs -%} {%- do attr_names.append(i.name) -%} {% endfor %}
-{{common.sig(api.name, api.inputs, api.outputs | trip_intermediate, api.attrs, False, True)}} {
+  {% set inplace_map = {} %}
+  {% if 'inplace' in api and api.inplace != None %}
+    {% for source, target in api.inplace.items() %}
+      {% do inplace_map.update({source: target}) %}
+    {% endfor %}
+  {% endif %}
+{{common.sig(api.name, api.inputs, api.outputs | trip_intermediate, api.attrs, inplace_map, False, True)}} {
     return backend::{{api.name}}<T>({{common.args(input_names, attr_names)}});
 }
 

From a162d7beffcecfa4d3dc0ba7699ff843affe69db Mon Sep 17 00:00:00 2001
From: LoneRanger <836253168@qq.com>
Date: Mon, 25 Sep 2023 22:07:37 +0800
Subject: [PATCH 114/115] [PIR] No.43 Migrate paddle.where into pir (#57667)

---
 .../dialect/op_generator/vjp_interface_gen_op_list.py |  2 ++
 paddle/fluid/primitive/codegen/gen.py                 |  2 ++
 python/paddle/tensor/search.py                        |  3 ++-
 test/legacy_test/test_where_op.py                     | 11 ++++++++---
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
index c70a87e826b77..a42ee2d7ba14a 100644
--- a/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
+++ b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py
@@ -24,6 +24,7 @@
 
 
 vjp_interface_declare_gen_op_list = [
+    'where',
     "tanh",
     "mean",
     "divide",
@@ -66,6 +67,7 @@
     'triu',
 ]
 vjp_interface_implementation_gen_op_list = [
+    'where',
     "tanh",
     "mean",
     "divide",
diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index 0865e01cdca17..0eef8470521ea 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -38,6 +38,7 @@
 
 
 VJPS = [
+    'where_grad',
     'tril_grad',
     'triu_grad',
     'tanh_grad',
@@ -152,6 +153,7 @@
 VJP_COMPS = PRIM_VJP + CUSTOM_VJP
 
 BACKENDS = [
+    'where_grad',
     'tril_grad',
     'triu_grad',
     'add_n',
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 5a90657016067..7a9a0981cebb0 100755
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -27,6 +27,7 @@
     convert_np_dtype_to_dtype_,
     core,
     in_dynamic_mode,
+    in_dynamic_or_pir_mode,
 )
 
 # from ..base.layers import has_inf  #DEFINE_ALIAS
@@ -686,7 +687,7 @@ def where(condition, x=None, y=None, name=None):
         broadcast_condition = paddle.add(cast_cond, broadcast_zeros)
         broadcast_condition = paddle.cast(broadcast_condition, 'bool')
 
-    if in_dynamic_mode():
+    if in_dynamic_or_pir_mode():
         return _C_ops.where(broadcast_condition, broadcast_x, broadcast_y)
     else:
         check_variable_and_dtype(condition, 'condition', ['bool'], 'where')
diff --git a/test/legacy_test/test_where_op.py b/test/legacy_test/test_where_op.py
index 2c508ee0c556f..3685a59b98134 100644
--- a/test/legacy_test/test_where_op.py
+++ b/test/legacy_test/test_where_op.py
@@ -33,10 +33,12 @@ def setUp(self):
         self.outputs = {'Out': np.where(self.cond, self.x, self.y)}
 
     def test_check_output(self):
-        self.check_output(check_cinn=self.check_cinn)
+        self.check_output(check_cinn=self.check_cinn, check_new_ir=True)
 
     def test_check_grad(self):
-        self.check_grad(['X', 'Y'], 'Out', check_cinn=self.check_cinn)
+        self.check_grad(
+            ['X', 'Y'], 'Out', check_cinn=self.check_cinn, check_new_ir=True
+        )
 
     def init_config(self):
         self.x = np.random.uniform((-3), 5, 100).astype('float64')
@@ -82,7 +84,9 @@ def setUp(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place, check_cinn=self.check_cinn)
+        self.check_output_with_place(
+            place, check_cinn=self.check_cinn, check_new_ir=True
+        )
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
@@ -92,6 +96,7 @@ def test_check_grad(self):
             'Out',
             numeric_grad_delta=0.05,
             check_cinn=self.check_cinn,
+            check_new_ir=True,
         )
 
     def init_config(self):

From a6f1fbfd510d2760025cddfdd31b0a34f455a781 Mon Sep 17 00:00:00 2001
From: gouzil <66515297+gouzil@users.noreply.github.com>
Date: Tue, 26 Sep 2023 00:02:35 +0800
Subject: [PATCH 115/115] [CodeStyle] format `python/paddle/__init__.py`
 (#57682)

---
 python/paddle/__init__.py | 916 ++++++++++++++++++++------------------
 1 file changed, 481 insertions(+), 435 deletions(-)

diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index f0800fbaf44c6..e73b9ae0cc309 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 try:
-    from paddle.version import full_version as __version__
-    from paddle.version import commit as __git_commit__
+    from paddle.version import full_version as __version__  # noqa: F401
+    from paddle.version import commit as __git_commit__  # noqa: F401
     from paddle.cuda_env import *  # noqa: F403
 except ImportError:
     import sys
@@ -23,7 +23,7 @@
      import paddle from the source directory; please install paddlepaddle*.whl firstly.'''
     )
 
-from .batch import batch  # noqa: F401
+from .batch import batch
 
 # Do the *DUPLICATED* monkey-patch for the tensor object.
 # We need remove the duplicated code here once we fix
@@ -34,443 +34,489 @@
 monkey_patch_variable()
 monkey_patch_math_tensor()
 
-from .framework import disable_signal_handler  # noqa: F401
-from .framework import get_flags  # noqa: F401
-from .framework import set_flags  # noqa: F401
-
-from .framework import disable_static  # noqa: F401
-from .framework import enable_static  # noqa: F401
-from .framework import in_dynamic_mode  # noqa: F401
-from .base.dataset import *  # noqa: F401, F403
-
-from .framework.dtype import iinfo  # noqa: F401
-from .framework.dtype import finfo  # noqa: F401
-from .framework.dtype import dtype  # noqa: F401
-from .framework.dtype import uint8  # noqa: F401
-from .framework.dtype import int8  # noqa: F401
-from .framework.dtype import int16  # noqa: F401
-from .framework.dtype import int32  # noqa: F401
-from .framework.dtype import int64  # noqa: F401
-from .framework.dtype import float16  # noqa: F401
-from .framework.dtype import float32  # noqa: F401
-from .framework.dtype import float64  # noqa: F401
-from .framework.dtype import bfloat16  # noqa: F401
-from .framework.dtype import bool  # noqa: F401
-from .framework.dtype import complex64  # noqa: F401
-from .framework.dtype import complex128  # noqa: F401
-
-Tensor = framework.core.eager.Tensor  # noqa: F401
-Tensor.__qualname__ = 'Tensor'  # noqa: F401
-import paddle.distributed  # noqa: F401
-import paddle.sysconfig  # noqa: F401
-import paddle.distribution  # noqa: F401
-import paddle.nn  # noqa: F401
+from .framework import (
+    disable_signal_handler,
+    get_flags,
+    set_flags,
+    disable_static,
+    enable_static,
+    in_dynamic_mode,
+)
+from .base.dataset import *  # noqa: F403
+
+from .framework.dtype import (
+    iinfo,
+    finfo,
+    dtype,
+    uint8,
+    int8,
+    int16,
+    int32,
+    int64,
+    float16,
+    float32,
+    float64,
+    bfloat16,
+    bool,
+    complex64,
+    complex128,
+)
+
+Tensor = framework.core.eager.Tensor
+Tensor.__qualname__ = 'Tensor'
+
 import paddle.distributed.fleet  # noqa: F401
-import paddle.optimizer  # noqa: F401
-import paddle.metric  # noqa: F401
-import paddle.regularizer  # noqa: F401
-import paddle.incubate  # noqa: F401
-import paddle.autograd  # noqa: F401
-import paddle.device  # noqa: F401
-import paddle.decomposition  # noqa: F401
-
-import paddle.jit  # noqa: F401
-import paddle.amp  # noqa: F401
-import paddle.dataset  # noqa: F401
-import paddle.inference  # noqa: F401
-import paddle.io  # noqa: F401
-import paddle.onnx  # noqa: F401
-import paddle.reader  # noqa: F401
-import paddle.static  # noqa: F401
-import paddle.vision  # noqa: F401
-import paddle.audio  # noqa: F401
-import paddle.geometric  # noqa: F401
-import paddle.sparse  # noqa: F401
-import paddle.quantization  # noqa: F401
-
-from .tensor.attribute import is_complex  # noqa: F401
-from .tensor.attribute import is_integer  # noqa: F401
-from .tensor.attribute import rank  # noqa: F401
-from .tensor.attribute import shape  # noqa: F401
-from .tensor.attribute import real  # noqa: F401
-from .tensor.attribute import imag  # noqa: F401
-from .tensor.attribute import is_floating_point  # noqa: F401
-from .tensor.creation import create_parameter  # noqa: F401
-from .tensor.creation import to_tensor  # noqa: F401
-from .tensor.creation import diag  # noqa: F401
-from .tensor.creation import diagflat  # noqa: F401
-from .tensor.creation import eye  # noqa: F401
-from .tensor.creation import linspace  # noqa: F401
-from .tensor.creation import logspace  # noqa: F401
-from .tensor.creation import ones  # noqa: F401
-from .tensor.creation import ones_like  # noqa: F401
-from .tensor.creation import zeros  # noqa: F401
-from .tensor.creation import zeros_like  # noqa: F401
-from .tensor.creation import arange  # noqa: F401
-from .tensor.creation import full  # noqa: F401
-from .tensor.creation import full_like  # noqa: F401
-from .tensor.creation import triu  # noqa: F401
-from .tensor.creation import triu_  # noqa: F401
-from .tensor.creation import tril  # noqa: F401
-from .tensor.creation import tril_  # noqa: F401
-from .tensor.creation import meshgrid  # noqa: F401
-from .tensor.creation import empty  # noqa: F401
-from .tensor.creation import empty_like  # noqa: F401
-from .tensor.creation import assign  # noqa: F401
-from .tensor.creation import complex  # noqa: F401
-from .tensor.creation import clone  # noqa: F401
-from .tensor.creation import tril_indices  # noqa: F401
-from .tensor.creation import triu_indices  # noqa: F401
-from .tensor.creation import polar  # noqa: F401
-from .tensor.creation import geometric_  # noqa: F401
-from .tensor.creation import cauchy_  # noqa: F401
-from .tensor.linalg import matmul  # noqa: F401
-from .tensor.linalg import dot  # noqa: F401
-from .tensor.linalg import norm  # noqa: F401
-from .tensor.linalg import transpose  # noqa: F401
-from .tensor.linalg import transpose_  # noqa: F401
-from .tensor.linalg import dist  # noqa: F401
-from .tensor.linalg import t  # noqa: F401
-from .tensor.linalg import t_  # noqa: F401
-from .tensor.linalg import cdist  # noqa: F401
-from .tensor.linalg import cross  # noqa: F401
-from .tensor.linalg import cholesky  # noqa: F401
-from .tensor.linalg import bmm  # noqa: F401
-from .tensor.linalg import histogram  # noqa: F401
-from .tensor.linalg import bincount  # noqa: F401
-from .tensor.linalg import mv  # noqa: F401
-from .tensor.logic import equal  # noqa: F401
-from .tensor.logic import equal_  # noqa: F401
-from .tensor.linalg import eigvalsh  # noqa: F401
-from .tensor.logic import greater_equal  # noqa: F401
-from .tensor.logic import greater_equal_  # noqa: F401
-from .tensor.logic import greater_than  # noqa: F401
-from .tensor.logic import greater_than_  # noqa: F401
-from .tensor.logic import is_empty  # noqa: F401
-from .tensor.logic import less_equal  # noqa: F401
-from .tensor.logic import less_equal_  # noqa: F401
-from .tensor.logic import less_than  # noqa: F401
-from .tensor.logic import less_than_  # noqa: F401
-from .tensor.logic import logical_and  # noqa: F401
-from .tensor.logic import logical_and_  # noqa: F401
-from .tensor.logic import logical_not  # noqa: F401
-from .tensor.logic import logical_not_  # noqa: F401
-from .tensor.logic import logical_or  # noqa: F401
-from .tensor.logic import logical_or_  # noqa: F401
-from .tensor.logic import logical_xor  # noqa: F401
-from .tensor.logic import logical_xor_  # noqa: F401
-from .tensor.logic import bitwise_and  # noqa: F401
-from .tensor.logic import bitwise_and_  # noqa: F401
-from .tensor.logic import bitwise_not  # noqa: F401
-from .tensor.logic import bitwise_not_  # noqa: F401
-from .tensor.logic import bitwise_or  # noqa: F401
-from .tensor.logic import bitwise_or_  # noqa: F401
-from .tensor.logic import bitwise_xor  # noqa: F401
-from .tensor.logic import bitwise_xor_  # noqa: F401
-from .tensor.logic import not_equal  # noqa: F401
-from .tensor.logic import not_equal_  # noqa: F401
-from .tensor.logic import allclose  # noqa: F401
-from .tensor.logic import isclose  # noqa: F401
-from .tensor.logic import equal_all  # noqa: F401
-from .tensor.logic import is_tensor  # noqa: F401
-from .tensor.manipulation import cast  # noqa: F401
-from .tensor.manipulation import cast_  # noqa: F401
-from .tensor.manipulation import concat  # noqa: F401
-from .tensor.manipulation import broadcast_tensors  # noqa: F401
-from .tensor.manipulation import expand  # noqa: F401
-from .tensor.manipulation import broadcast_to  # noqa: F401
-from .tensor.manipulation import expand_as  # noqa: F401
-from .tensor.manipulation import tile  # noqa: F401
-from .tensor.manipulation import flatten  # noqa: F401
-from .tensor.manipulation import gather  # noqa: F401
-from .tensor.manipulation import gather_nd  # noqa: F401
-from .tensor.manipulation import reshape  # noqa: F401
-from .tensor.manipulation import reshape_  # noqa: F401
-from .tensor.manipulation import flip as reverse  # noqa: F401
-from .tensor.manipulation import scatter  # noqa: F401
-from .tensor.manipulation import scatter_  # noqa: F401
-from .tensor.manipulation import scatter_nd_add  # noqa: F401
-from .tensor.manipulation import scatter_nd  # noqa: F401
-from .tensor.manipulation import shard_index  # noqa: F401
-from .tensor.manipulation import slice  # noqa: F401
-from .tensor.manipulation import crop  # noqa: F401
-from .tensor.manipulation import split  # noqa: F401
-from .tensor.manipulation import vsplit  # noqa: F401
-from .tensor.manipulation import squeeze  # noqa: F401
-from .tensor.manipulation import squeeze_  # noqa: F401
-from .tensor.manipulation import stack  # noqa: F401
-from .tensor.manipulation import strided_slice  # noqa: F401
-from .tensor.manipulation import unique  # noqa: F401
-from .tensor.manipulation import unique_consecutive  # noqa: F401
-from .tensor.manipulation import unsqueeze  # noqa: F401
-from .tensor.manipulation import unsqueeze_  # noqa: F401
-from .tensor.manipulation import unstack  # noqa: F401
-from .tensor.manipulation import flip  # noqa: F401
-from .tensor.manipulation import rot90  # noqa: F401
-from .tensor.manipulation import unbind  # noqa: F401
-from .tensor.manipulation import roll  # noqa: F401
-from .tensor.manipulation import chunk  # noqa: F401
-from .tensor.manipulation import tolist  # noqa: F401
-from .tensor.manipulation import take_along_axis  # noqa: F401
-from .tensor.manipulation import put_along_axis  # noqa: F401
-from .tensor.manipulation import tensordot  # noqa: F401
-from .tensor.manipulation import as_complex  # noqa: F401
-from .tensor.manipulation import as_real  # noqa: F401
-from .tensor.manipulation import moveaxis  # noqa: F401
-from .tensor.manipulation import repeat_interleave  # noqa: F401
-from .tensor.manipulation import index_add  # noqa: F401
-from .tensor.manipulation import index_add_  # noqa: F401
-from .tensor.manipulation import index_put  # noqa: F401
-from .tensor.manipulation import index_put_  # noqa: F401
-from .tensor.manipulation import unflatten  # noqa: F401
-from .tensor.manipulation import as_strided  # noqa: F401
-from .tensor.manipulation import view  # noqa: F401
-from .tensor.manipulation import view_as  # noqa: F401
-from .tensor.manipulation import unfold  # noqa: F401
-from .tensor.math import abs  # noqa: F401
-from .tensor.math import abs_  # noqa: F401
-from .tensor.math import acos  # noqa: F401
-from .tensor.math import acos_  # noqa: F401
-from .tensor.math import asin  # noqa: F401
-from .tensor.math import asin_  # noqa: F401
-from .tensor.math import atan  # noqa: F401
-from .tensor.math import atan_  # noqa: F401
-from .tensor.math import atan2  # noqa: F401
-from .tensor.math import ceil  # noqa: F401
-from .tensor.math import cos  # noqa: F401
-from .tensor.math import cos_  # noqa: F401
-from .tensor.math import tan  # noqa: F401
-from .tensor.math import tan_  # noqa: F401
-from .tensor.math import cosh  # noqa: F401
-from .tensor.math import cosh_  # noqa: F401
-from .tensor.math import cumsum  # noqa: F401
-from .tensor.math import cumsum_  # noqa: F401
-from .tensor.math import cummax  # noqa: F401
-from .tensor.math import cummin  # noqa: F401
-from .tensor.math import cumprod  # noqa: F401
-from .tensor.math import cumprod_  # noqa: F401
-from .tensor.math import logcumsumexp  # noqa: F401
-from .tensor.math import logit  # noqa: F401
-from .tensor.math import logit_  # noqa: F401
-from .tensor.math import exp  # noqa: F401
-from .tensor.math import expm1  # noqa: F401
-from .tensor.math import expm1_  # noqa: F401
-from .tensor.math import floor  # noqa: F401
-from .tensor.math import increment  # noqa: F401
-from .tensor.math import log  # noqa: F401
-from .tensor.math import log_  # noqa: F401
-from .tensor.math import log2_  # noqa: F401
-from .tensor.math import log2  # noqa: F401
-from .tensor.math import log10  # noqa: F401
-from .tensor.math import log10_  # noqa: F401
-from .tensor.math import multiplex  # noqa: F401
-from .tensor.math import pow  # noqa: F401
-from .tensor.math import pow_  # noqa: F401
-from .tensor.math import reciprocal  # noqa: F401
-from .tensor.math import all  # noqa: F401
-from .tensor.math import any  # noqa: F401
-from .tensor.math import round  # noqa: F401
-from .tensor.math import rsqrt  # noqa: F401
-from .tensor.math import scale  # noqa: F401
-from .tensor.math import sign  # noqa: F401
-from .tensor.math import sin  # noqa: F401
-from .tensor.math import sin_  # noqa: F401
-from .tensor.math import sinh  # noqa: F401
-from .tensor.math import sinh_  # noqa: F401
-from .tensor.math import sqrt  # noqa: F401
-from .tensor.math import square  # noqa: F401
-from .tensor.math import square_  # noqa: F401
-from .tensor.math import stanh  # noqa: F401
-from .tensor.math import sum  # noqa: F401
-from .tensor.math import nan_to_num  # noqa: F401
-from .tensor.math import nan_to_num_  # noqa: F401
-from .tensor.math import nansum  # noqa: F401
-from .tensor.math import nanmean  # noqa: F401
-from .tensor.math import count_nonzero  # noqa: F401
-from .tensor.math import tanh  # noqa: F401
-from .tensor.math import tanh_  # noqa: F401
-from .tensor.math import add_n  # noqa: F401
-from .tensor.math import max  # noqa: F401
-from .tensor.math import maximum  # noqa: F401
-from .tensor.math import amax  # noqa: F401
-from .tensor.math import min  # noqa: F401
-from .tensor.math import minimum  # noqa: F401
-from .tensor.math import amin  # noqa: F401
-from .tensor.math import mm  # noqa: F401
-from .tensor.math import divide  # noqa: F401
-from .tensor.math import divide_  # noqa: F401
-from .tensor.math import floor_divide  # noqa: F401
-from .tensor.math import floor_divide_  # noqa: F401
-from .tensor.math import remainder  # noqa: F401
-from .tensor.math import remainder_  # noqa: F401
-from .tensor.math import mod  # noqa: F401
-from .tensor.math import mod_  # noqa: F401
-from .tensor.math import floor_mod  # noqa: F401
-from .tensor.math import floor_mod_  # noqa: F401
-from .tensor.math import multiply  # noqa: F401
-from .tensor.math import multiply_  # noqa: F401
-from .tensor.math import renorm  # noqa: F401
-from .tensor.math import renorm_  # noqa: F401
-from .tensor.math import add  # noqa: F401
-from .tensor.math import subtract  # noqa: F401
-from .tensor.math import logsumexp  # noqa: F401
-from .tensor.math import logaddexp  # noqa: F401
-from .tensor.math import inverse  # noqa: F401
-from .tensor.math import log1p  # noqa: F401
-from .tensor.math import log1p_  # noqa: F401
-from .tensor.math import erf  # noqa: F401
-from .tensor.math import erf_  # noqa: F401
-from .tensor.math import addmm  # noqa: F401
-from .tensor.math import addmm_  # noqa: F401
-from .tensor.math import clip  # noqa: F401
-from .tensor.math import trace  # noqa: F401
-from .tensor.math import diagonal  # noqa: F401
-from .tensor.math import kron  # noqa: F401
-from .tensor.math import isfinite  # noqa: F401
-from .tensor.math import isinf  # noqa: F401
-from .tensor.math import isnan  # noqa: F401
-from .tensor.math import prod  # noqa: F401
-from .tensor.math import broadcast_shape  # noqa: F401
-from .tensor.math import conj  # noqa: F401
-from .tensor.math import trunc  # noqa: F401
-from .tensor.math import trunc_  # noqa: F401
-from .tensor.math import digamma  # noqa: F401
-from .tensor.math import digamma_  # noqa: F401
-from .tensor.math import neg  # noqa: F401
-from .tensor.math import neg_  # noqa: F401
-from .tensor.math import lgamma  # noqa: F401
-from .tensor.math import lgamma_  # noqa: F401
-from .tensor.math import acosh  # noqa: F401
-from .tensor.math import acosh_  # noqa: F401
-from .tensor.math import asinh  # noqa: F401
-from .tensor.math import asinh_  # noqa: F401
-from .tensor.math import atanh  # noqa: F401
-from .tensor.math import atanh_  # noqa: F401
-from .tensor.math import lerp  # noqa: F401
-from .tensor.math import erfinv  # noqa: F401
-from .tensor.math import rad2deg  # noqa: F401
-from .tensor.math import deg2rad  # noqa: F401
-from .tensor.math import gcd  # noqa: F401
-from .tensor.math import gcd_  # noqa: F401
-from .tensor.math import lcm  # noqa: F401
-from .tensor.math import lcm_  # noqa: F401
-from .tensor.math import diff  # noqa: F401
-from .tensor.math import angle  # noqa: F401
-from .tensor.math import fmax  # noqa: F401
-from .tensor.math import fmin  # noqa: F401
-from .tensor.math import inner  # noqa: F401
-from .tensor.math import outer  # noqa: F401
-from .tensor.math import heaviside  # noqa: F401
-from .tensor.math import frac  # noqa: F401
-from .tensor.math import frac_  # noqa: F401
-from .tensor.math import sgn  # noqa: F401
-from .tensor.math import take  # noqa: F401
-from .tensor.math import frexp  # noqa: F401
-from .tensor.math import ldexp  # noqa: F401
-from .tensor.math import ldexp_  # noqa: F401
-from .tensor.math import trapezoid  # noqa: F401
-from .tensor.math import cumulative_trapezoid  # noqa: F401
-from .tensor.math import vander  # noqa: F401
-from .tensor.math import nextafter  # noqa: F401
-from .tensor.math import i0  # noqa: F401
-from .tensor.math import i0_  # noqa: F401
-from .tensor.math import i0e  # noqa: F401
-from .tensor.math import i1  # noqa: F401
-from .tensor.math import i1e  # noqa: F401
-from .tensor.math import polygamma  # noqa: F401
-from .tensor.math import polygamma_  # noqa: F401
-
-from .tensor.random import bernoulli  # noqa: F401
-from .tensor.random import poisson  # noqa: F401
-from .tensor.random import multinomial  # noqa: F401
-from .tensor.random import standard_normal  # noqa: F401
-from .tensor.random import normal  # noqa: F401
-from .tensor.random import normal_  # noqa: F401
-from .tensor.random import uniform  # noqa: F401
-from .tensor.random import randn  # noqa: F401
-from .tensor.random import rand  # noqa: F401
-from .tensor.random import randint  # noqa: F401
-from .tensor.random import randint_like  # noqa: F401
-from .tensor.random import randperm  # noqa: F401
-from .tensor.search import argmax  # noqa: F401
-from .tensor.search import argmin  # noqa: F401
-from .tensor.search import argsort  # noqa: F401
-from .tensor.search import searchsorted  # noqa: F401
-from .tensor.search import bucketize  # noqa: F401
-from .tensor.search import masked_select  # noqa: F401
-from .tensor.search import topk  # noqa: F401
-from .tensor.search import where  # noqa: F401
-from .tensor.search import where_  # noqa: F401
-from .tensor.search import index_select  # noqa: F401
-from .tensor.search import nonzero  # noqa: F401
-from .tensor.search import sort  # noqa: F401
-from .tensor.search import kthvalue  # noqa: F401
-from .tensor.search import mode  # noqa: F401
-
-from .tensor.to_string import set_printoptions  # noqa: F401
-
-from .tensor.einsum import einsum  # noqa: F401
-
-from .framework.random import seed  # noqa: F401
-from .framework.random import get_cuda_rng_state  # noqa: F401
-from .framework.random import set_cuda_rng_state  # noqa: F401
-from .framework.random import get_rng_state  # noqa: F401
-from .framework.random import set_rng_state  # noqa: F401
-from .framework import ParamAttr  # noqa: F401
-from .framework import CPUPlace  # noqa: F401
-from .framework import IPUPlace  # noqa: F401
-from .framework import CUDAPlace  # noqa: F401
-from .framework import CUDAPinnedPlace  # noqa: F401
-from .framework import CustomPlace  # noqa: F401
-from .framework import XPUPlace  # noqa: F401
-
-from .autograd import grad  # noqa: F401
-from .autograd import no_grad  # noqa: F401
-from .autograd import enable_grad  # noqa:F401
-from .autograd import set_grad_enabled  # noqa: F401
-from .autograd import is_grad_enabled  # noqa: F401
-from .framework import save  # noqa: F401
-from .framework import load  # noqa: F401
-from .distributed import DataParallel  # noqa: F401
-
-from .framework import set_default_dtype  # noqa: F401
-from .framework import get_default_dtype  # noqa: F401
-
-from .tensor.search import index_sample  # noqa: F401
-from .tensor.stat import mean  # noqa: F401
-from .tensor.stat import std  # noqa: F401
-from .tensor.stat import var  # noqa: F401
-from .tensor.stat import numel  # noqa: F401
-from .tensor.stat import median  # noqa: F401
-from .tensor.stat import nanmedian  # noqa: F401
-from .tensor.stat import quantile  # noqa: F401
-from .tensor.stat import nanquantile  # noqa: F401
-from .device import get_cudnn_version  # noqa: F401
-from .device import set_device  # noqa: F401
-from .device import get_device  # noqa: F401
-from .device import is_compiled_with_xpu  # noqa: F401
-from .device import is_compiled_with_ipu  # noqa: F401
-from .device import is_compiled_with_cinn  # noqa: F401
-from .device import is_compiled_with_cuda  # noqa: F401
-from .device import is_compiled_with_rocm  # noqa: F401
-from .device import is_compiled_with_custom_device  # noqa: F401
+
+from paddle import (  # noqa: F401
+    distributed,
+    sysconfig,
+    distribution,
+    nn,
+    optimizer,
+    metric,
+    regularizer,
+    incubate,
+    autograd,
+    device,
+    decomposition,
+    jit,
+    amp,
+    dataset,
+    inference,
+    io,
+    onnx,
+    reader,
+    static,
+    vision,
+    audio,
+    geometric,
+    sparse,
+    quantization,
+)
+
+from .tensor.attribute import (
+    is_complex,
+    is_integer,
+    rank,
+    shape,
+    real,
+    imag,
+    is_floating_point,
+)
+
+from .tensor.creation import (
+    create_parameter,
+    to_tensor,
+    diag,
+    diagflat,
+    eye,
+    linspace,
+    logspace,
+    ones,
+    ones_like,
+    zeros,
+    zeros_like,
+    arange,
+    full,
+    full_like,
+    triu,
+    triu_,
+    tril,
+    tril_,
+    meshgrid,
+    empty,
+    empty_like,
+    assign,
+    complex,
+    clone,
+    tril_indices,
+    triu_indices,
+    polar,
+    geometric_,
+    cauchy_,
+)
+
+from .tensor.linalg import (  # noqa: F401
+    matmul,
+    dot,
+    norm,
+    transpose,
+    transpose_,
+    dist,
+    t,
+    t_,
+    cdist,
+    cross,
+    cholesky,
+    bmm,
+    histogram,
+    bincount,
+    mv,
+    eigvalsh,
+)
+
+from .tensor.logic import (  # noqa: F401
+    equal,
+    equal_,
+    greater_equal,
+    greater_equal_,
+    greater_than,
+    greater_than_,
+    is_empty,
+    less_equal,
+    less_equal_,
+    less_than,
+    less_than_,
+    logical_and,
+    logical_and_,
+    logical_not,
+    logical_not_,
+    logical_or,
+    logical_or_,
+    logical_xor,
+    logical_xor_,
+    bitwise_and,
+    bitwise_and_,
+    bitwise_not,
+    bitwise_not_,
+    bitwise_or,
+    bitwise_or_,
+    bitwise_xor,
+    bitwise_xor_,
+    not_equal,
+    not_equal_,
+    allclose,
+    isclose,
+    equal_all,
+    is_tensor,
+)
+
+
+from .tensor.manipulation import (  # noqa: F401
+    cast,
+    cast_,
+    concat,
+    broadcast_tensors,
+    expand,
+    broadcast_to,
+    expand_as,
+    tile,
+    flatten,
+    gather,
+    gather_nd,
+    reshape,
+    reshape_,
+    flip as reverse,
+    scatter,
+    scatter_,
+    scatter_nd_add,
+    scatter_nd,
+    shard_index,
+    slice,
+    crop,
+    split,
+    vsplit,
+    squeeze,
+    squeeze_,
+    stack,
+    strided_slice,
+    unique,
+    unique_consecutive,
+    unsqueeze,
+    unsqueeze_,
+    unstack,
+    flip,
+    rot90,
+    unbind,
+    roll,
+    chunk,
+    tolist,
+    take_along_axis,
+    put_along_axis,
+    tensordot,
+    as_complex,
+    as_real,
+    moveaxis,
+    repeat_interleave,
+    index_add,
+    index_add_,
+    index_put,
+    index_put_,
+    unflatten,
+    as_strided,
+    view,
+    view_as,
+    unfold,
+)
+
+from .tensor.math import (  # noqa: F401
+    abs,
+    abs_,
+    acos,
+    acos_,
+    asin,
+    asin_,
+    atan,
+    atan_,
+    atan2,
+    ceil,
+    cos,
+    cos_,
+    tan,
+    tan_,
+    cosh,
+    cosh_,
+    cumsum,
+    cumsum_,
+    cummax,
+    cummin,
+    cumprod,
+    cumprod_,
+    logcumsumexp,
+    logit,
+    logit_,
+    exp,
+    expm1,
+    expm1_,
+    floor,
+    increment,
+    log,
+    log_,
+    log2_,
+    log2,
+    log10,
+    log10_,
+    multiplex,
+    pow,
+    pow_,
+    reciprocal,
+    all,
+    any,
+    round,
+    rsqrt,
+    scale,
+    sign,
+    sin,
+    sin_,
+    sinh,
+    sinh_,
+    sqrt,
+    square,
+    square_,
+    stanh,
+    sum,
+    nan_to_num,
+    nan_to_num_,
+    nansum,
+    nanmean,
+    count_nonzero,
+    tanh,
+    tanh_,
+    add_n,
+    max,
+    maximum,
+    amax,
+    min,
+    minimum,
+    amin,
+    mm,
+    divide,
+    divide_,
+    floor_divide,
+    floor_divide_,
+    remainder,
+    remainder_,
+    mod,
+    mod_,
+    floor_mod,
+    floor_mod_,
+    multiply,
+    multiply_,
+    renorm,
+    renorm_,
+    add,
+    subtract,
+    logsumexp,
+    logaddexp,
+    inverse,
+    log1p,
+    log1p_,
+    erf,
+    erf_,
+    addmm,
+    addmm_,
+    clip,
+    trace,
+    diagonal,
+    kron,
+    isfinite,
+    isinf,
+    isnan,
+    prod,
+    broadcast_shape,
+    conj,
+    trunc,
+    trunc_,
+    digamma,
+    digamma_,
+    neg,
+    neg_,
+    lgamma,
+    lgamma_,
+    acosh,
+    acosh_,
+    asinh,
+    asinh_,
+    atanh,
+    atanh_,
+    lerp,
+    erfinv,
+    rad2deg,
+    deg2rad,
+    gcd,
+    gcd_,
+    lcm,
+    lcm_,
+    diff,
+    angle,
+    fmax,
+    fmin,
+    inner,
+    outer,
+    heaviside,
+    frac,
+    frac_,
+    sgn,
+    take,
+    frexp,
+    ldexp,
+    ldexp_,
+    trapezoid,
+    cumulative_trapezoid,
+    vander,
+    nextafter,
+    i0,
+    i0_,
+    i0e,
+    i1,
+    i1e,
+    polygamma,
+    polygamma_,
+)
+
+from .tensor.random import (
+    bernoulli,
+    poisson,
+    multinomial,
+    standard_normal,
+    normal,
+    normal_,
+    uniform,
+    randn,
+    rand,
+    randint,
+    randint_like,
+    randperm,
+)
+from .tensor.search import (
+    argmax,
+    argmin,
+    argsort,
+    searchsorted,
+    bucketize,
+    masked_select,
+    topk,
+    where,
+    where_,
+    index_select,
+    nonzero,
+    sort,
+    kthvalue,
+    mode,
+)
+
+from .tensor.to_string import set_printoptions
+
+from .tensor.einsum import einsum
+
+from .framework.random import (
+    seed,
+    get_cuda_rng_state,
+    set_cuda_rng_state,
+    get_rng_state,
+    set_rng_state,
+)
+from .framework import (  # noqa: F401
+    ParamAttr,
+    CPUPlace,
+    IPUPlace,
+    CUDAPlace,
+    CUDAPinnedPlace,
+    CustomPlace,
+    XPUPlace,
+)
+
+from .autograd import (
+    grad,
+    no_grad,
+    enable_grad,
+    set_grad_enabled,
+    is_grad_enabled,
+)
+from .framework import (
+    save,
+    load,
+)
+from .distributed import DataParallel
+
+from .framework import (
+    set_default_dtype,
+    get_default_dtype,
+)
+
+from .tensor.search import index_sample
+from .tensor.stat import (
+    mean,
+    std,
+    var,
+    numel,
+    median,
+    nanmedian,
+    quantile,
+    nanquantile,
+)
+from .device import (  # noqa: F401
+    get_cudnn_version,
+    set_device,
+    get_device,
+    is_compiled_with_xpu,
+    is_compiled_with_ipu,
+    is_compiled_with_cinn,
+    is_compiled_with_cuda,
+    is_compiled_with_rocm,
+    is_compiled_with_custom_device,
+)
 
 # high-level api
-from .hapi import Model  # noqa: F401
-from . import callbacks  # noqa: F401
-from .hapi import summary  # noqa: F401
-from .hapi import flops  # noqa: F401
-from . import hub  # noqa: F401
-from . import linalg  # noqa: F401
-from . import fft  # noqa: F401
-from . import signal  # noqa: F401
-from . import _pir_ops  # noqa: F401
+from . import (  # noqa: F401
+    callbacks,
+    hub,
+    linalg,
+    fft,
+    signal,
+    _pir_ops,
+)
+from .hapi import (
+    Model,
+    summary,
+    flops,
+)
 
 import paddle.text  # noqa: F401
 import paddle.vision  # noqa: F401
 
-from .tensor.random import check_shape  # noqa: F401
-from .nn.initializer.lazy_init import LazyGuard  # noqa: F401
+from .tensor.random import check_shape
+from .nn.initializer.lazy_init import LazyGuard
 
 # CINN has to set a flag to include a lib
 if is_compiled_with_cinn():
@@ -484,12 +530,12 @@
 
 disable_static()
 
-from .pir_utils import IrGuard  # noqa: F401
+from .pir_utils import IrGuard
 
 ir_change = IrGuard()
 ir_change._switch_to_pir()
 
-__all__ = [  # noqa
+__all__ = [
     'iinfo',
     'finfo',
     'dtype',