From 37ed407c25e8cac67a158340826d3bf1530946bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com>
Date: Thu, 22 Feb 2024 19:37:23 +0800
Subject: [PATCH 01/82] [paddle inference] make cutlass_conv2d compiled as a
 .so independent of phi (#61551)

* add workspace
* fix a bug in depthwise
* add cmake args and add sys path
* remove paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu
* remove unittest
* remove something in paddle/phi/kernels/CMakeLists.txt
* cmake release
---
 paddle/phi/backends/dynload/CMakeLists.txt    |   3 +-
 paddle/phi/backends/dynload/cutlass_conv2d.cc |  57 +++++
 paddle/phi/backends/dynload/cutlass_conv2d.h  |  30 +++
 paddle/phi/kernels/CMakeLists.txt             |  32 ---
 .../fusion/cutlass/conv2d/CMakeLists.txt      |  41 ++++
 .../kernels/fusion/cutlass/conv2d/README.md   |  25 ++
 .../kernels/fusion/cutlass/conv2d/compile.sh  |  33 +++
 .../fusion/cutlass/conv2d/conv2d_bias_act.py  |   3 -
 .../conv2d/conv2d_bias_relu_few_channels.cu   | 218 ------------------
 .../cutlass/conv2d/conv2d_bias_residual.py    |   3 -
 .../fusion/cutlass/conv2d/conv2d_common.py    |  18 +-
 .../fusion/cutlass/conv2d/conv2d_decl.h       |  29 +--
 .../conv2d/conv2d_depthwise_bias_act.py       |  11 +-
 .../fusion/cutlass/conv2d/conv2d_util.cu      |  31 +--
 .../fusion/cutlass/conv2d/conv2d_util.h       |  17 +-
 .../cutlass/fused_conv2d_add_act_kernel.cu    |  44 +++-
 test/ir/inference/CMakeLists.txt              |  13 --
 17 files changed, 267 insertions(+), 341 deletions(-)
 create mode 100644 paddle/phi/backends/dynload/cutlass_conv2d.cc
 create mode 100644 paddle/phi/backends/dynload/cutlass_conv2d.h
 create mode 100644 paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt
 create mode 100644 paddle/phi/kernels/fusion/cutlass/conv2d/README.md
 create mode 100644 paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh
 delete mode 100644 paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu

diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt
index 2db75d7022f0a..9fd293574e247 100644
--- a/paddle/phi/backends/dynload/CMakeLists.txt
+++ b/paddle/phi/backends/dynload/CMakeLists.txt
@@ -13,7 +13,8 @@ list(
   cusolver.cc
   cusparse.cc
   nvtx.cc
-  cufft.cc)
+  cufft.cc
+  cutlass_conv2d.cc)
 
 if(NOT WITH_NV_JETSON)
   list(APPEND CUDA_SRCS nvjpeg.cc)
diff --git a/paddle/phi/backends/dynload/cutlass_conv2d.cc b/paddle/phi/backends/dynload/cutlass_conv2d.cc
new file mode 100644
index 0000000000000..936a04fa3023c
--- /dev/null
+++ b/paddle/phi/backends/dynload/cutlass_conv2d.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
*/
+
+#include "paddle/phi/backends/dynload/cutlass_conv2d.h"
+#include <string>
+#include "paddle/phi/core/enforce.h"
+
+namespace phi {
+namespace dynload {
+
+std::once_flag cutlass_dso_flag;
+void* cutlass_dso_handle;
+
+void* GetCutlassConv2dHandle() {
+  std::string dso_name = "libCutlassConv2d.so";
+
+  std::call_once(cutlass_dso_flag, [&]() {
+#if !defined(_WIN32)
+    int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+#else
+    int dynload_flags = 0;
+#endif  // !_WIN32
+
+    cutlass_dso_handle = dlopen(dso_name.c_str(), dynload_flags);
+
+    PADDLE_ENFORCE_NOT_NULL(
+        cutlass_dso_handle,
+        phi::errors::NotFound(
+            "libCutlassConv2d.so is needed, "
+            "but it is not found.\n"
+            "  Suggestions:\n"
+            "  1. Refer to paddle/phi/kernels/fusion/cutlass/conv2d/README.md, "
+            "and compile this library.\n"
+            "  2. Configure environment variables as "
+            "follows:\n"
+            "  - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
+            "  - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
+            "  - Mac: set DYLD_LIBRARY_PATH by `export "
+            "DYLD_LIBRARY_PATH=...`\n"));
+  });
+
+  return cutlass_dso_handle;
+}
+
+}  // namespace dynload
+}  // namespace phi
diff --git a/paddle/phi/backends/dynload/cutlass_conv2d.h b/paddle/phi/backends/dynload/cutlass_conv2d.h
new file mode 100644
index 0000000000000..c342eb9d09220
--- /dev/null
+++ b/paddle/phi/backends/dynload/cutlass_conv2d.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+#pragma once
+
+#if !defined(_WIN32)
+#include <dlfcn.h>
+#endif
+
+#include <mutex>  // NOLINT
+
+#include "paddle/phi/backends/dynload/dynamic_loader.h"
+
+namespace phi {
+namespace dynload {
+
+void* GetCutlassConv2dHandle();
+
+}  // namespace dynload
+}  // namespace phi
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 85697df11bc56..80d61ebc9a9a6 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -54,36 +54,6 @@ if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
 endif()
 
 if(WITH_CUTLASS)
-  execute_process(
-    COMMAND ${CMAKE_COMMAND} -E make_directory
-            "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/conv2d/generated_tmp"
-    COMMAND ${PYTHON_EXECUTABLE} "conv2d_bias_act.py"
-    COMMAND ${PYTHON_EXECUTABLE} "conv2d_bias_residual.py"
-    COMMAND ${PYTHON_EXECUTABLE} "conv2d_depthwise_bias_act.py"
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/conv2d")
-  set(generated_tmp_dir
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/conv2d/generated_tmp
-  )
-  set(generated_dir
-      ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/conv2d/generated)
-  file(GLOB con2d_generated_files ${generated_tmp_dir}/*.cu)
-
-  if(EXISTS ${generated_dir})
-    foreach(gen_file ${con2d_generated_files})
-      string(REPLACE "generated_tmp" "generated" now_file ${gen_file})
-      execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different
-                              "${gen_file}" "${now_file}")
-    endforeach()
-    message("copy if different ${generated_dir}")
-  else()
-    foreach(gen_file ${con2d_generated_files})
-      string(REPLACE "generated_tmp" "generated" now_file ${gen_file})
-      execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${gen_file}"
-                              "${now_file}")
-    endforeach()
-    message("copy ${generated_dir}")
-  endif()
-
   execute_process(
     COMMAND
       ${PYTHON_EXECUTABLE}
@@ -204,8 +174,6 @@ if(WITH_CUTLASS)
   file(
     GLOB cutlass_cu
     RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
-    "fusion/cutlass/conv2d/generated/*.cu"
-    "fusion/cutlass/conv2d/*.cu"
     "fusion/cutlass/*.cu"
     "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu"
     "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu"
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt b/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt
new file mode 100644
index 0000000000000..cd82bbf1dc8b7
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/CMakeLists.txt
@@ -0,0 +1,41 @@
+cmake_minimum_required(VERSION 3.23)
+
+if(NOT DEFINED PYTHON_EXECUTABLE)
+  message(
+    FATAL_ERROR
+      "Please set PYTHON_EXECUTABLE with -DPYTHON_EXECUTABLE=/path/to/python")
+endif()
+
+if(NOT DEFINED COMPUTE_CAPABILITY)
+  message(
+    FATAL_ERROR
+      "Please set COMPUTE_CAPABILITY with -DCOMPUTE_CAPABILITY=<your GPU compute capability, e.g. 75>"
+  )
+endif()
+
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/cutlass/include")
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../../../../../")
+
+execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory
+                        "${CMAKE_CURRENT_BINARY_DIR}/generated_tmp")
+
+execute_process(
+  COMMAND ${PYTHON_EXECUTABLE} "${CMAKE_CURRENT_SOURCE_DIR}/conv2d_bias_act.py"
+  COMMAND ${PYTHON_EXECUTABLE}
+          "${CMAKE_CURRENT_SOURCE_DIR}/conv2d_bias_residual.py"
+  COMMAND ${PYTHON_EXECUTABLE}
+          "${CMAKE_CURRENT_SOURCE_DIR}/conv2d_depthwise_bias_act.py"
+  WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
+
+find_package(CUDA)
+
+set(CUDA_NVCC_FLAGS
+    -gencode arch=compute_${COMPUTE_CAPABILITY},code=sm_${COMPUTE_CAPABILITY};)
+#set(CMAKE_CXX_FLAGS -fvisibility=hidden)
+set(CMAKE_BUILD_TYPE "Release")
+file(GLOB
all_cutlass_conv2d_cu
+     "${CMAKE_CURRENT_BINARY_DIR}/generated_tmp/*.cu")
+list(APPEND all_cutlass_conv2d_cu "${CMAKE_CURRENT_SOURCE_DIR}/conv2d_util.cu")
+
+cuda_add_library(CutlassConv2d SHARED ${all_cutlass_conv2d_cu})
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/README.md b/paddle/phi/kernels/fusion/cutlass/conv2d/README.md
new file mode 100644
index 0000000000000..a717b3d692b91
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/README.md
@@ -0,0 +1,25 @@
+# How to compile and use the cutlass conv2d operators
+
+This directory contains conv2d operators built on top of cutlass. They are compiled into a standalone .so that the phi library inside paddle calls at runtime.
+This has two benefits:
+1. It shrinks the released paddle packages by keeping the cutlass code out of paddle inference.
+2. It fully decouples the framework from the concrete operator implementations, preserving the generality of the paddle framework while keeping the operator implementations flexible.
+
+Compile and use the operators with the following steps.
+
+step1.
+
+`bash compile.sh`
+
+Note that three parameters in this script must be set by the user: the path of the python interpreter, the root directory of CUDA, and the compute capability of the user's GPU.
+```shell
+python_exe_path="python"
+cuda_root_path="/usr/local/cuda"
+gpu_cc="75"
+```
+The compile.sh script downloads cutlass, runs the CMakeLists.txt script, and builds the shared library.
+
+
+step2.
+
+After step1 finishes, `libCutlassConv2d.so` is generated in the build directory; add the build directory to LD_LIBRARY_PATH and the library is ready to use.
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh b/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh
new file mode 100644
index 0000000000000..44c0fdf3a04da
--- /dev/null
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/compile.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+cutlass_repo_directory="cutlass"
+if [ ! -d "$cutlass_repo_directory" ]; then
+  git clone --branch v2.11.0 https://github.com/NVIDIA/cutlass
+fi
+
+build_directory="build"
+if [ ! -d "$build_directory" ]; then
+  mkdir $build_directory
+fi
+
+python_exe_path="python"
+cuda_root_path="/usr/local/cuda"
+gpu_cc="75"
+
+cd $build_directory
+cmake .. -DPYTHON_EXECUTABLE=$python_exe_path -DCUDA_TOOLKIT_ROOT_DIR=$cuda_root_path -DCOMPUTE_CAPABILITY=$gpu_cc
+make -j
+cd -
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py
index 6870d191a8026..0cb925489f14a 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_act.py
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-
-sys.path.append("../")
 import enum
 
 from conv2d_common import (
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu
deleted file mode 100644
index fb1c3f2313c98..0000000000000
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_relu_few_channels.cu
+++ /dev/null
@@ -1,218 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "cutlass/conv/kernel/default_conv2d_fprop.h" -#include "cutlass/epilogue/thread/linear_combination_bias_relu.h" -#include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" - -namespace phi { -namespace fusion { -namespace cutlass_internal { -template -cutlass::Status Conv2dBiasReluFewChannelsImpl(ConvAllParams params) { - using ElementAccumulator = float; - using ElementComputeEpilogue = float; - using ElementInputA = cutlass::half_t; - using ElementInputB = cutlass::half_t; - using ElementOutput = cutlass::half_t; - using LayoutInputA = cutlass::layout::TensorNHWC; - using LayoutInputB = cutlass::layout::TensorNHWC; - using LayoutOutput = cutlass::layout::TensorNHWC; - using MMAOp = cutlass::arch::OpClassTensorOp; - using SmArch = cutlass::arch::Sm75; - using ThreadblockShape = TShape; - using WarpShape = WShape; - using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; - using SwizzleThreadBlock = - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>; - constexpr int NumStages = 2; - static cutlass::conv::IteratorAlgorithm const IteratorAlgorithm = - cutlass::conv::IteratorAlgorithm::kFewChannels; - using EpilogueOp = - cutlass::epilogue::thread::LinearCombinationRelu; - - using Conv2dFpropKernel = typename cutlass::conv::kernel::DefaultConv2dFprop< - ElementInputA, - LayoutInputA, - ElementInputB, - LayoutInputB, - ElementOutput, - LayoutOutput, - ElementAccumulator, - MMAOp, - SmArch, - ThreadblockShape, - WarpShape, - InstructionShape, - EpilogueOp, - SwizzleThreadBlock, - NumStages, - cutlass::arch::OpMultiplyAdd, - IteratorAlgorithm, - cutlass::conv::StrideSupport::kStrided, - Alignment, - Alignment>::Kernel; - using ImplicitGemm = - cutlass::conv::device::ImplicitGemmConvolution; - - const half *input = params.input; - const half *weight = params.weight; - const half *bias = params.bias; - half *output = params.output; - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w1; - - int stride_h = params.stride_h; - int stride_w = params.stride_w; - - int oh = params.oh; - int ow = params.ow; - int dilation_h = params.dilation_h; - int dilation_w = params.dilation_w; - - cutlass::conv::Mode mode = cutlass::conv::Mode::kCrossCorrelation; - cutlass::conv::Conv2dProblemSize problem_size({batch, ih, iw, ic}, - {oc, kh, kw, ic}, - {pad_h0, 0, pad_w0, 0}, - {stride_h, stride_w}, - {dilation_h, dilation_w}, - {batch, oh, ow, oc}, - mode, - 1); - - typename ImplicitGemm::Arguments arguments{ - problem_size, - {(cutlass::half_t *)(input), {ic, ic * iw, ic * iw * ih}}, - {(cutlass::half_t *)(weight), {ic, ic * kw, ic * kw * kh}}, - {(cutlass::half_t *)(bias), {0, 0, 0}}, - {(cutlass::half_t *)(output), {oc, oc * ow, oc * ow * oh}}, - {1.f, 1.f}}; - - ImplicitGemm implicit_gemm_op; - size_t bytes = implicit_gemm_op.get_workspace_size(arguments); - - auto ctx = params.ctx; - auto stream = ctx->stream(); - 
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc( - ctx->GetPlace(), - bytes, - phi::Stream(reinterpret_cast(stream))); - void *workspace = tmp_gpu_ptrs_data->ptr(); - - cutlass::Status status = implicit_gemm_op.can_implement(arguments); - CUTLASS_CHECK(status); - status = implicit_gemm_op.initialize(arguments, workspace); - CUTLASS_CHECK(status); - status = implicit_gemm_op(stream); - CUTLASS_CHECK(status); - return status; -} - -// config 0 -template cutlass::Status Conv2dBiasReluFewChannelsImpl< - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); -// config 1 -template cutlass::Status Conv2dBiasReluFewChannelsImpl< - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); -// config 2 -template cutlass::Status Conv2dBiasReluFewChannelsImpl< - cutlass::gemm::GemmShape<128, 32, 64>, - cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); -// config 3 -template cutlass::Status Conv2dBiasReluFewChannelsImpl< - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>>(ConvAllParams); -// config 4 -template cutlass::Status Conv2dBiasReluFewChannelsImpl< - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>>(ConvAllParams); -// config 5 -template cutlass::Status Conv2dBiasReluFewChannelsImpl< - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>>(ConvAllParams); -// config 6 -template cutlass::Status Conv2dBiasReluFewChannelsImpl< - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); -// config 7 -template cutlass::Status Conv2dBiasReluFewChannelsImpl< - cutlass::gemm::GemmShape<64, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>>(ConvAllParams); -// config 8 -template cutlass::Status Conv2dBiasReluFewChannelsImpl< - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>>(ConvAllParams); - -std::vector> - conv2d_bias_relu_few_channels_all_func = { - Conv2dBiasReluFewChannelsImpl, - cutlass::gemm::GemmShape<32, 32, 64>>, - Conv2dBiasReluFewChannelsImpl, - cutlass::gemm::GemmShape<32, 32, 64>>, - Conv2dBiasReluFewChannelsImpl, - cutlass::gemm::GemmShape<32, 32, 64>>, - Conv2dBiasReluFewChannelsImpl, - cutlass::gemm::GemmShape<32, 32, 64>>, - Conv2dBiasReluFewChannelsImpl, - cutlass::gemm::GemmShape<32, 32, 32>>, - Conv2dBiasReluFewChannelsImpl, - cutlass::gemm::GemmShape<32, 64, 32>>, - Conv2dBiasReluFewChannelsImpl, - cutlass::gemm::GemmShape<64, 64, 32>>, - Conv2dBiasReluFewChannelsImpl, - cutlass::gemm::GemmShape<64, 64, 32>>, - Conv2dBiasReluFewChannelsImpl, - cutlass::gemm::GemmShape<64, 32, 32>>}; -std::map, int> map_problem_conv2d_bias_relu_few_channels; - -void Conv2dBiasReluFewChannels(ConvAllParams params) { - int batch = params.batch; - int ic = params.ic; - int ih = params.ih; - int iw = params.iw; - int kh = params.kh; - int kw = params.kw; - int oc = params.oc; - int pad_h0 = params.pad_h0; - int pad_w0 = params.pad_w1; - int stride_h = params.stride_h; - int stride_w = params.stride_w; - - std::vector problem_size = { - batch, ic, ih, iw, kh, kw, oc, pad_h0, pad_w0, stride_h, stride_w}; - - if (map_problem_conv2d_bias_relu_few_channels.count(problem_size)) { - conv2d_bias_relu_few_channels_all_func - [map_problem_conv2d_bias_relu_few_channels.at(problem_size)](params); - return; - } - // -} -} // namespace cutlass_internal -} // namespace fusion -} // namespace phi diff --git 
a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py
index 109dac2ad65e8..55fde0722b6b3 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_bias_residual.py
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-
-sys.path.append("../")
 import enum
 
 from conv2d_common import (
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py
index 34d72a4c7443e..7c95892006c43 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_common.py
@@ -12,9 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import sys
 
-sys.path.append("../")
+dirname, filename = os.path.split(os.path.abspath(sys.argv[0]))
+sys.path.append(dirname + "/../")
 from util import SubstituteTemplate
 
 # For beginners, these template parameters may be difficult to understand.
@@ -90,14 +92,8 @@
   ImplicitGemm implicit_gemm_op;
   size_t bytes = implicit_gemm_op.get_workspace_size(arguments);
 
-  auto ctx = params.ctx;
-  auto stream = ctx->stream();
-  phi::Allocator::AllocationPtr tmp_gpu_ptrs_data =
-      phi::memory_utils::Alloc(
-          ctx->GetPlace(),
-          bytes,
-          phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
-  void *workspace = tmp_gpu_ptrs_data->ptr();
+  auto stream = params.stream;
+  void *workspace = params.workspace;
 
   cutlass::Status status = implicit_gemm_op.can_implement(arguments);
   CUTLASS_CHECK(status);
@@ -122,7 +118,7 @@
 std::map<std::vector<int>, int> map_problem_${func_name};
 std::mutex ${func_name}_mutex;
 
-void ${func_name}(const ConvAllParams& params) {
+void ${func_name}(ConvAllParams params) {
   int batch = params.batch;
   int ic = params.ic;
   int ih = params.ih;
@@ -161,7 +157,7 @@
 
 # this function is invoked by phi kernel
 CommonWrapperForPhi = """
-void ${op_name}(const ConvAllParams& params) {
+void ${op_name}(ConvAllParams params) {
   ${dispatch_body}
 }
 """
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h
index db0b9664c43ee..aaad46de5cb0d 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h
@@ -13,12 +13,9 @@
 // limitations under the License.
 #pragma once
 #include 
-#include 
 #include 
 #include 
 
-#include "paddle/phi/backends/gpu/gpu_context.h"
-
 namespace phi {
 namespace fusion {
 namespace cutlass_internal {
@@ -47,23 +44,27 @@ typedef struct {
   int oh;
   int ow;
   int groups;
-  const phi::GPUContext *ctx;
+  // const phi::GPUContext *ctx;
+  cudaStream_t stream;
   float alpha;  // for leaky_relu use
   int sm_version = 75;
+  void *workspace = nullptr;
 } ConvAllParams;
 
 // Below functions are provided by cutlass and called by phi.
-void Conv2dBiasAddRelu(const ConvAllParams &params);
-void Conv2dBiasRelu(const ConvAllParams &params);
-void Conv2dBiasLeakyRelu(const ConvAllParams &params);
-void Conv2dBiasSilu(const ConvAllParams &params);
-void Conv2dBias(const ConvAllParams &params);
-void Conv2dBiasSigmoid(const ConvAllParams &params);
+extern "C" void Conv2dBiasAddRelu(ConvAllParams params);
+extern "C" void Conv2dBiasRelu(ConvAllParams params);
+extern "C" void Conv2dBiasLeakyRelu(ConvAllParams params);
+extern "C" void Conv2dBiasSilu(ConvAllParams params);
+extern "C" void Conv2dBias(ConvAllParams params);
+extern "C" void Conv2dBiasSigmoid(ConvAllParams params);
+
+extern "C" void Conv2dDepthwiseBias(ConvAllParams params);
+extern "C" void Conv2dDepthwiseBiasRelu(ConvAllParams params);
+extern "C" void Conv2dDepthwiseBiasSigmoid(ConvAllParams params);
+extern "C" void Conv2dDepthwiseBiasSilu(ConvAllParams params);
 
-void Conv2dDepthwiseBias(const ConvAllParams &params);
-void Conv2dDepthwiseBiasRelu(const ConvAllParams &params);
-void Conv2dDepthwiseBiasSigmoid(const ConvAllParams &params);
-void Conv2dDepthwiseBiasSilu(const ConvAllParams &params);
+extern "C" int HelloFromCutlassConv2d(int a, int b);
 
 }  // namespace cutlass_internal
 }  // namespace fusion
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py
index 0ea8e0a47130d..fb2f2be096110 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_depthwise_bias_act.py
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-
-sys.path.append("../")
 import enum
 
 from conv2d_common import (
@@ -60,13 +57,7 @@
     CommonCutlassConv2dDepthwiseKernelDeclare, dict_for_declare_part
 ) + '''
-size_t filter_size = oc * kh * kw * kc * sizeof(half);
-phi::Allocator::AllocationPtr filter_gpu_ptrs_data =
-    phi::memory_utils::Alloc(
-        params.ctx->GetPlace(),
-        filter_size,
-        phi::Stream(reinterpret_cast<phi::StreamId>(params.ctx->stream())));
-void *filter_workspace = filter_gpu_ptrs_data->ptr();
+  void *filter_workspace = params.workspace;
 
   typename ImplicitGemm::Arguments arguments{
       problem_size,
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu
index 62b7b439458b9..51bc71983105a 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu
@@ -16,8 +16,6 @@
 
 #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
 
-#include "glog/logging.h"
-
 namespace phi {
 namespace fusion {
 namespace cutlass_internal {
@@ -274,35 +272,40 @@ int ProfileToGetBestConfig(
   }
 
   cudaEvent_t beg, end;
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&beg));
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&end));
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(beg));
+  cudaEventCreate(&beg);
+  cudaEventCreate(&end);
+  cudaEventRecord(beg);
 
   for (int ii = 0; ii < REPEAT; ii++) {
     status = func(params);
   }
 
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end));
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(end));
+  cudaEventRecord(end);
+  cudaEventSynchronize(end);
   float elapsed_time;
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventElapsedTime(&elapsed_time, beg, end));
+  cudaEventElapsedTime(&elapsed_time, beg, end);
   if (elapsed_time < min_time && status == cutlass::Status::kSuccess) {
     min_time = elapsed_time;
     min_time_index = i;
 
     // debug code
-    VLOG(3) <<
OpType2String(op_type) << ": tactic " << i << " has max diff "
-            << conv2d_diff_gpu(params, op_type) << " compared with baseline,"
-            << "cost_time: " << elapsed_time << "ms.";
+    std::cout << OpType2String(op_type) << ": tactic " << i
+              << " has max diff " << conv2d_diff_gpu(params, op_type)
+              << " compared with baseline, "
+              << "cost_time: " << elapsed_time << "ms." << std::endl;
    }
  }
 
  if (min_time_index < 0) {
-    PADDLE_THROW(
-        phi::errors::NotFound("Can't find any cutlass config for this %s op.",
-                              OpType2String(op_type).c_str()));
+    std::cout << "Can't find any cutlass config for " << OpType2String(op_type)
+              << std::endl;
  }
  return min_time_index;
}
 
+__attribute__((visibility("default"))) int HelloFromCutlassConv2d(int a,
+                                                                   int b) {
+  std::cout << "Welcome to Cutlass Conv2d" << std::endl;
+  return 1;
+}
+
 }  // namespace cutlass_internal
 }  // namespace fusion
 }  // namespace phi
diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h
index eaceb46d69d74..80865e0e1cded 100644
--- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h
+++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h
@@ -17,25 +17,20 @@
 #include 
 #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h"
 
-#include "glog/logging.h"
-
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/device/gemm.h"
 
 #include "cutlass/conv/device/implicit_gemm_convolution.h"
 
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/memory_utils.h"
-#include "paddle/phi/core/enforce.h"
-
 namespace phi {
 namespace fusion {
 namespace cutlass_internal {
 
-#define CUTLASS_CHECK(status)                                                 \
-  if (status != cutlass::Status::kSuccess) {                                  \
-    VLOG(3)                                                                   \
-        << "Cutlass can not deal with this problem size, skip this kernel!";  \
-    return status;                                                            \
+#define CUTLASS_CHECK(status)                                                 \
+  if (status != cutlass::Status::kSuccess) {                                  \
+    std::cout                                                                 \
+        << "Cutlass cannot deal with this problem size, skip this kernel!"    \
+        << std::endl;                                                         \
+    return status;                                                            \
   }
 
 typedef enum {
diff --git a/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu b/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu
index ef803f0ea5f3d..dceaafd2e7172 100644
--- a/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu
+++ b/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu
@@ -12,14 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+#include <dlfcn.h>
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h"
 
+#include "paddle/phi/backends/dynload/cutlass_conv2d.h"
+
 namespace phi {
 namespace fusion {
 namespace cutlass_internal {
 
+typedef void (*func)(phi::fusion::cutlass_internal::ConvAllParams);
+
 template <typename T, typename Context>
 void FusedConv2dAddActKernel(const Context& ctx,
                              const DenseTensor& x,
@@ -49,6 +57,7 @@ void FusedConv2dAddActKernel(const Context& ctx,
   CHECK_EQ(dilations.size() == 2UL, true);
   CHECK_EQ(padding_algorithm == "EXPLICIT", true);
+  CHECK_EQ(data_format == "NHWC", true);
   const int batch = in_dims[0];
   const int ic = in_dims[3];
   const int ih = in_dims[1];
@@ -112,27 +121,39 @@ void FusedConv2dAddActKernel(const Context& ctx,
                             oh,
                             ow,
                             groups,
-                            &ctx};
+                            ctx.stream()};
+
+  void* dlhandler = phi::dynload::GetCutlassConv2dHandle();
+  func conv_func = NULL;
+  CHECK_EQ(dlhandler == NULL, false);
 
   // conv2d_depthwise
   if (groups == ic && ic == oc) {
+    // conv2d_depthwise needs a tmp workspace.
+    phi::Allocator::AllocationPtr tmp_ptr = phi::memory_utils::Alloc(
+        ctx.GetPlace(),
+        oc * kh * kw * sizeof(T),
+        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
+    params.workspace = tmp_ptr->ptr();
     // cutlass conv2d_depthwise does not support residual
     if (residual) {
      CHECK_EQ(residual->data<T>() == nullptr, true);
    }
    if (activation == "relu") {
-      Conv2dDepthwiseBiasRelu(params);
+      conv_func = (func)(dlsym(dlhandler, "Conv2dDepthwiseBiasRelu"));
    } else if (activation == "identity") {
-      Conv2dDepthwiseBias(params);
+      conv_func = (func)(dlsym(dlhandler, "Conv2dDepthwiseBias"));
    } else if (activation == "sigmoid") {
-      Conv2dDepthwiseBiasSigmoid(params);
+      conv_func = (func)(dlsym(dlhandler, "Conv2dDepthwiseBiasSigmoid"));
    } else if (activation == "swish") {
-      Conv2dDepthwiseBiasSilu(params);
+      conv_func = (func)(dlsym(dlhandler, "Conv2dDepthwiseBiasSilu"));
    } else {
      PADDLE_THROW(phi::errors::InvalidArgument(
          "Cutlass conv2d_depthwise does not support this activation: %s.",
          activation.c_str()));
    }
+    conv_func(params);
+
    output->set_layout(DataLayout::NHWC);
    return;
  }
 
@@ -141,26 +162,27 @@
  if (residual) {
    if (activation == "relu") {
      params.residual = reinterpret_cast<const half*>(residual->data<T>());
-      Conv2dBiasAddRelu(params);
+      conv_func = (func)(dlsym(dlhandler, "Conv2dBiasAddRelu"));
    } else {
      PADDLE_THROW(phi::errors::InvalidArgument(
          "Cutlass now only supports relu activation in a residual block"));
    }
  } else if (activation == "relu") {
-    Conv2dBiasRelu(params);
+    conv_func = (func)(dlsym(dlhandler, "Conv2dBiasRelu"));
  } else if (activation == "swish") {
-    Conv2dBiasSilu(params);
+    conv_func = (func)(dlsym(dlhandler, "Conv2dBiasSilu"));
  } else if (activation == "identity") {
-    Conv2dBias(params);
+    conv_func = (func)(dlsym(dlhandler, "Conv2dBias"));
  } else if (activation == "leaky_relu") {
+    conv_func = (func)(dlsym(dlhandler, "Conv2dBiasLeakyRelu"));
    params.alpha = fuse_alpha;
-    Conv2dBiasLeakyRelu(params);
  } else if (activation == "sigmoid") {
-    Conv2dBiasSigmoid(params);
+    conv_func = (func)(dlsym(dlhandler, "Conv2dBiasSigmoid"));
  } else {
    PADDLE_THROW(phi::errors::InvalidArgument(
        "Cutlass does not support this activation: %s.", activation.c_str()));
  }
+  conv_func(params);
  output->set_layout(DataLayout::NHWC);
}
}  // namespace cutlass_internal
diff --git a/test/ir/inference/CMakeLists.txt
b/test/ir/inference/CMakeLists.txt index 185ca22f897f6..84abbaa986e61 100755 --- a/test/ir/inference/CMakeLists.txt +++ b/test/ir/inference/CMakeLists.txt @@ -138,19 +138,6 @@ if(WITH_XPU) endforeach() endif() -# below are cutlass unittests -file( - GLOB TEST_CUTLASS - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "test_cutlass_*.py") -string(REPLACE ".py" "" TEST_CUTLASS "${TEST_CUTLASS}") -list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES ${TEST_CUTLASS}) -if(WITH_CUTLASS) - foreach(target ${TEST_CUTLASS}) - py_test_modules(${target} MODULES ${target}) - endforeach() -endif() - if(WITH_MKLDNN AND TENSORRT_FOUND AND WITH_GPU) From 560225036959c49d7e6f523a546272e266be507b Mon Sep 17 00:00:00 2001 From: Lu Qi <61354321+MarioLulab@users.noreply.github.com> Date: Thu, 22 Feb 2024 19:50:42 +0800 Subject: [PATCH 02/82] Support time_major for FusedROPE (#61417) --- paddle/phi/api/yaml/fused_backward.yaml | 4 +- paddle/phi/api/yaml/fused_ops.yaml | 2 +- paddle/phi/infermeta/backward.cc | 1 + paddle/phi/infermeta/backward.h | 1 + paddle/phi/infermeta/multiary.cc | 1 + paddle/phi/infermeta/multiary.h | 1 + paddle/phi/infermeta/spmd_rules/fused_rope.cc | 53 ++-- paddle/phi/infermeta/spmd_rules/fused_rope.h | 9 +- .../fusion/gpu/fused_rope_grad_kernel.cu | 26 +- .../kernels/fusion/gpu/fused_rope_kernel.cu | 27 +- .../phi/kernels/fusion/gpu/fused_rope_utils.h | 28 +- .../fused_rotary_position_embedding.py | 11 +- .../semi_auto_parallel_for_fused_rope.py | 92 +++++++ test/cpp/auto_parallel/spmd_rule_test.cc | 8 +- .../test_fused_rotary_position_embedding.py | 241 ++++++++++++++---- 15 files changed, 412 insertions(+), 93 deletions(-) diff --git a/paddle/phi/api/yaml/fused_backward.yaml b/paddle/phi/api/yaml/fused_backward.yaml index 8a2a9786a837a..5c92b1a2a692f 100644 --- a/paddle/phi/api/yaml/fused_backward.yaml +++ b/paddle/phi/api/yaml/fused_backward.yaml @@ -40,8 +40,8 @@ support_dygraph_mode : true - backward_op : fused_rotary_position_embedding_grad - forward: fused_rotary_position_embedding (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style) -> Tensor(out_q), Tensor(out_k), Tensor(out_v) - args : (Tensor sin, Tensor cos, Tensor position_ids, Tensor out_q_grad, Tensor out_k_grad,Tensor out_v_grad, bool use_neox_rotary_style) + forward: fused_rotary_position_embedding (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style, bool time_major) -> Tensor(out_q), Tensor(out_k), Tensor(out_v) + args : (Tensor sin, Tensor cos, Tensor position_ids, Tensor out_q_grad, Tensor out_k_grad,Tensor out_v_grad, bool use_neox_rotary_style, bool time_major) output : Tensor(q_grad), Tensor(k_grad), Tensor(v_grad) optional : sin, cos, position_ids, out_k_grad, out_v_grad, k_grad, v_grad infer_meta : diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index f8dcb02cbdc72..2ca0a32be59f5 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -272,7 +272,7 @@ optional : cache_kv, pre_caches, rotary_pos_emb, time_step, seq_lengths, src_mask, gather_index - op : fused_rotary_position_embedding - args : (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style = true) + args : (Tensor q, Tensor k, Tensor v, Tensor sin, Tensor cos, Tensor position_ids, bool use_neox_rotary_style = true, bool time_major = false) output : Tensor(out_q), Tensor(out_k), Tensor(out_v) infer_meta : func : FusedRopeInferMeta diff --git 
a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 54cf403533427..4f525ef138735 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -1349,6 +1349,7 @@ void FusedRopeGradInferMeta(const MetaTensor& sin, const MetaTensor& dout_k, const MetaTensor& dout_v, bool use_neox_rotary_style, + bool time_major, MetaTensor* dq, MetaTensor* dk, MetaTensor* dv) { diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 577250723c12b..bde9c57ff245a 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -201,6 +201,7 @@ void FusedRopeGradInferMeta(const MetaTensor& sin, const MetaTensor& dout_k, const MetaTensor& dout_v, bool use_neox_rotary_style, + bool time_major, MetaTensor* dq, MetaTensor* dk, MetaTensor* dv); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 382fe7dd6c35b..978a80674272f 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -4501,6 +4501,7 @@ void FusedRopeInferMeta(const MetaTensor& q, const MetaTensor& cos, const MetaTensor& position_ids, bool use_neox_rotary_style, + bool time_major, MetaTensor* out_q, MetaTensor* out_k, MetaTensor* out_v) { diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 13747ba7595cc..0774189dd8d4f 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -887,6 +887,7 @@ void FusedRopeInferMeta(const MetaTensor& q, const MetaTensor& cos, const MetaTensor& position_ids, bool use_neox_rotary_style, + bool time_major, MetaTensor* out_q, MetaTensor* out_k, MetaTensor* out_v); diff --git a/paddle/phi/infermeta/spmd_rules/fused_rope.cc b/paddle/phi/infermeta/spmd_rules/fused_rope.cc index d744c73681c3e..138f0813be2c5 100644 --- a/paddle/phi/infermeta/spmd_rules/fused_rope.cc +++ b/paddle/phi/infermeta/spmd_rules/fused_rope.cc @@ -25,8 +25,6 @@ namespace phi { namespace distributed { using auto_parallel::str_join; -const int kBatchDimIndex = 0; -const int kSeqlenDimIndex = 1; const int kNumHeadsDimIndex = 2; const int kHeadDimIndex = 3; @@ -82,7 +80,8 @@ void check_k_or_v(const DistMetaTensor& k_or_v, void check_sin_cos(const DistMetaTensor& sin, const DistMetaTensor& cos, const DistMetaTensor& position_ids, - const std::vector& q_shape) { + const std::vector& q_shape, + bool time_major) { PADDLE_ENFORCE_EQ(sin.dims(), cos.dims(), phi::errors::InvalidArgument( @@ -99,6 +98,9 @@ void check_sin_cos(const DistMetaTensor& sin, phi::errors::InvalidArgument( "The Tensor sin/cos's ndim must be 2 or 4. but given [%d]", ndim)); + const int kBatchDimIndex = time_major ? 1 : 0; + const int kSeqlenDimIndex = time_major ? 0 : 1; + int batch_size = q_shape[kBatchDimIndex]; int seq_len = q_shape[kSeqlenDimIndex]; int head_dim = q_shape[kHeadDimIndex]; @@ -107,11 +109,11 @@ void check_sin_cos(const DistMetaTensor& sin, int head_dim_index = ndim == 2 ? 
1 : 3; if (ndim == 4) { PADDLE_ENFORCE_EQ( - (shape[kBatchDimIndex] == 1 && shape[kNumHeadsDimIndex] == 1), + (shape[0] == 1 && shape[kNumHeadsDimIndex] == 1), true, phi::errors::InvalidArgument("The batch_size and num_heads of sin/cos " "must be 1, but given [%d], [%d]", - shape[kBatchDimIndex], + shape[0], shape[kNumHeadsDimIndex])); } @@ -161,6 +163,7 @@ void infer_sin_cos(const DistMetaTensor& sin, const DistMetaTensor& cos, const DistMetaTensor& position_ids, const std::vector& q_shape, + bool time_major, TensorDistAttr* sin_dist_attr_dst, TensorDistAttr* cos_dist_attr_dst) { const TensorDistAttr& sin_dist_attr_src = sin.dist_attr(); @@ -175,7 +178,7 @@ void infer_sin_cos(const DistMetaTensor& sin, // if one of sin cos is empty, they are all useless in kernel if (!IsEmpty(sin_shape) && !IsEmpty(cos_shape)) { // check sin, cos, position_ids's shape - check_sin_cos(sin, cos, position_ids, q_shape); + check_sin_cos(sin, cos, position_ids, q_shape, time_major); if (sin_shape.size() == 4) { *sin_dist_attr_dst = UnShardTensorDims(sin_dist_attr_src, {1, 3}); *cos_dist_attr_dst = UnShardTensorDims(cos_dist_attr_src, {1, 3}); @@ -192,7 +195,8 @@ SpmdInfo FusedRopeInferSpmd(const DistMetaTensor& q, const DistMetaTensor& sin, const DistMetaTensor& cos, const DistMetaTensor& position_ids, - bool use_neox_rotary_style) { + bool use_neox_rotary_style, + bool time_major) { check_q(q); std::vector>> @@ -202,7 +206,8 @@ SpmdInfo FusedRopeInferSpmd(const DistMetaTensor& q, inputs_sharding_info.emplace_back(qkv_axes, q_dist_attr_src.dims_mapping()); const TensorDistAttr& k_dist_attr_src = k.dist_attr(); - // q_shape = [bs, seq_len, num_heads, head_dim] + // q_shape equals [bs, seq_len, num_heads, head_dim] if time_major is False, + // otherwise [seq_len, bs, num_heads, head_dim] std::vector q_shape = common::vectorize(q.dims()); bool is_k_none = IsEmpty(common::vectorize(k.dims())); // except for q, all other inputs are optional. @@ -219,7 +224,7 @@ SpmdInfo FusedRopeInferSpmd(const DistMetaTensor& q, } const TensorDistAttr& position_ids_dist_attr_src = position_ids.dist_attr(); - std::string position_ids_axes = "ab"; + std::string position_ids_axes = time_major ? "ba" : "ab"; bool is_ids_none = IsEmpty(common::vectorize(position_ids.dims())); if (!is_ids_none) { inputs_sharding_info.emplace_back( @@ -232,7 +237,9 @@ SpmdInfo FusedRopeInferSpmd(const DistMetaTensor& q, GetDimsMappingForAxes(qkv_axes, axis_to_dim_map); TensorDistAttr q_dist_attr_dst = CopyTensorDistAttrForOutput(q_dist_attr_src); q_dist_attr_dst.set_dims_mapping(out_dims_mapping); - q_dist_attr_dst = UnShardTensorDims(q_dist_attr_dst, {1, 3}); + const int kSeqlenDimIndex = time_major ? 
0 : 1; + q_dist_attr_dst = + UnShardTensorDims(q_dist_attr_dst, {kSeqlenDimIndex, kHeadDimIndex}); TensorDistAttr k_dist_attr_dst = CopyTensorDistAttrForOutput(k_dist_attr_src); k_dist_attr_dst.set_process_mesh(q_dist_attr_dst.process_mesh()); @@ -248,8 +255,13 @@ SpmdInfo FusedRopeInferSpmd(const DistMetaTensor& q, TensorDistAttr sin_dist_attr_dst; TensorDistAttr cos_dist_attr_dst; - infer_sin_cos( - sin, cos, position_ids, q_shape, &sin_dist_attr_dst, &cos_dist_attr_dst); + infer_sin_cos(sin, + cos, + position_ids, + q_shape, + time_major, + &sin_dist_attr_dst, + &cos_dist_attr_dst); std::vector position_ids_dims_mapping = GetDimsMappingForAxes(position_ids_axes, axis_to_dim_map); @@ -279,7 +291,8 @@ SpmdInfo FusedRopeInferSpmdReverse(const DistMetaTensor& q, const DistMetaTensor& out_q, const DistMetaTensor& out_k, const DistMetaTensor& out_v, - bool use_neox_rotary_style) { + bool use_neox_rotary_style, + bool time_major) { check_q(out_q); std::vector>> outputs_sharding_info; @@ -316,7 +329,10 @@ SpmdInfo FusedRopeInferSpmdReverse(const DistMetaTensor& q, TensorDistAttr q_dist_attr_dst = CopyTensorDistAttrForOutput(out_q_dist_attr_src); q_dist_attr_dst.set_dims_mapping(dims_mapping); - q_dist_attr_dst = UnShardTensorDims(q_dist_attr_dst, {1, 3}); + + const int kSeqlenDimIndex = time_major ? 0 : 1; + q_dist_attr_dst = + UnShardTensorDims(q_dist_attr_dst, {kSeqlenDimIndex, kHeadDimIndex}); TensorDistAttr out_q_dist_attr_dst = q_dist_attr_dst; TensorDistAttr k_dist_attr_dst = CopyTensorDistAttrForOutput(k.dist_attr()); @@ -341,10 +357,11 @@ SpmdInfo FusedRopeInferSpmdReverse(const DistMetaTensor& q, cos, position_ids, out_q_shape, + time_major, &sin_dist_attr_dst, &cos_dist_attr_dst); - std::string position_ids_axes = "ab"; + std::string position_ids_axes = time_major ? "ba" : "ab"; std::vector position_ids_dims_mapping = GetDimsMappingForAxes(position_ids_axes, axis_to_dim_map); TensorDistAttr position_ids_dist_attr_dst = @@ -372,7 +389,8 @@ SpmdInfo FusedRopeGradInferSpmd(const DistMetaTensor& sin, const DistMetaTensor& out_q_grad, const DistMetaTensor& out_k_grad, const DistMetaTensor& out_v_grad, - bool use_neox_rotary_style) { + bool use_neox_rotary_style, + bool time_major) { // NOTE(zhonghui): The forward and backward kernels of fuse rope are same, so // the spmd rules can be shared. 
SpmdInfo spmd_info = FusedRopeInferSpmd(out_q_grad, @@ -381,7 +399,8 @@ SpmdInfo FusedRopeGradInferSpmd(const DistMetaTensor& sin, sin, cos, position_ids, - use_neox_rotary_style); + use_neox_rotary_style, + time_major); std::vector dist_attrs; std::vector order = {3, 4, 5, 0, 1, 2}; for (int ind : order) { diff --git a/paddle/phi/infermeta/spmd_rules/fused_rope.h b/paddle/phi/infermeta/spmd_rules/fused_rope.h index f28015bc528f1..fdd9ae27500b0 100644 --- a/paddle/phi/infermeta/spmd_rules/fused_rope.h +++ b/paddle/phi/infermeta/spmd_rules/fused_rope.h @@ -29,7 +29,8 @@ SpmdInfo FusedRopeInferSpmd(const DistMetaTensor& q, const DistMetaTensor& sin, const DistMetaTensor& cos, const DistMetaTensor& position_ids, - bool use_neox_rotary_style); + bool use_neox_rotary_style, + bool time_major); SpmdInfo FusedRopeInferSpmdReverse(const DistMetaTensor& q, const DistMetaTensor& k, @@ -40,7 +41,8 @@ SpmdInfo FusedRopeInferSpmdReverse(const DistMetaTensor& q, const DistMetaTensor& out_q, const DistMetaTensor& out_k, const DistMetaTensor& out_v, - bool use_neox_rotary_style); + bool use_neox_rotary_style, + bool time_major); SpmdInfo FusedRopeGradInferSpmd(const DistMetaTensor& sin, const DistMetaTensor& cos, @@ -48,7 +50,8 @@ SpmdInfo FusedRopeGradInferSpmd(const DistMetaTensor& sin, const DistMetaTensor& out_q_grad, const DistMetaTensor& out_k_grad, const DistMetaTensor& out_v_grad, - bool use_neox_rotary_style); + bool use_neox_rotary_style, + bool time_major); } // namespace distributed } // namespace phi diff --git a/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu index 787b215d20f37..f7fd4d8589aac 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_rope_grad_kernel.cu @@ -32,6 +32,7 @@ void FusedRopeGradKernel(const Context& dev_ctx, const paddle::optional& dout_k, const paddle::optional& dout_v, bool use_neox_rotary_style, + bool time_major, DenseTensor* dq, DenseTensor* dk, DenseTensor* dv) { @@ -41,10 +42,10 @@ void FusedRopeGradKernel(const Context& dev_ctx, phi::Array inputs_num_heads; // small size for broadcast - auto batch_size = dout_q.dims()[0]; + auto batch_size = time_major ? dout_q.dims()[1] : dout_q.dims()[0]; + auto seq_len = time_major ? dout_q.dims()[0] : dout_q.dims()[1]; inputs_num_heads[0] = dout_q.dims()[2]; auto head_dim = dout_q.dims()[3]; - auto seq_len = dout_q.dims()[1]; PADDLE_ENFORCE_NE(head_dim % 2, 1, phi::errors::InvalidArgument( @@ -117,6 +118,9 @@ void FusedRopeGradKernel(const Context& dev_ctx, : VectorizedFusedRopeWithRotateHalfKernel; if (is_same_num_heads) { + int64_t batch_stride = + time_major ? dout_q.strides()[1] : dout_q.strides()[0]; + int64_t seq_stride = time_major ? dout_q.strides()[0] : dout_q.strides()[1]; kernel_func<<>>(ins_data, sin_cos_data, position_ids_data, @@ -126,13 +130,18 @@ void FusedRopeGradKernel(const Context& dev_ctx, seq_len, inputs_num_heads[0], head_dim, + batch_stride, + seq_stride, outs_data, num_inputs, div_c); } else { // rotary position embedding Q - + int64_t batch_stride_q = + time_major ? dout_q.strides()[1] : dout_q.strides()[0]; + int64_t seq_stride_q = + time_major ? 
dout_q.strides()[0] : dout_q.strides()[1]; kernel_func<<>>(ins_data, sin_cos_data, position_ids_data, @@ -142,11 +151,20 @@ void FusedRopeGradKernel(const Context& dev_ctx, seq_len, inputs_num_heads[0], head_dim, + batch_stride_q, + seq_stride_q, outs_data, 1, div_c); // rotary position embedding K,V + int64_t batch_stride_kv = time_major + ? inputs_num_heads[1] * head_dim + : seq_len * inputs_num_heads[1] * head_dim; + int64_t seq_stride_kv = time_major + ? batch_size * inputs_num_heads[1] * head_dim + : inputs_num_heads[1] * head_dim; + phi::Array input_kv{ins_data[1], ins_data[2], nullptr}; phi::Array out_kv{outs_data[1], outs_data[2], nullptr}; kernel_func<<>>(input_kv, @@ -158,6 +176,8 @@ void FusedRopeGradKernel(const Context& dev_ctx, seq_len, inputs_num_heads[1], head_dim, + batch_stride_kv, + seq_stride_kv, out_kv, num_inputs - 1, div_c); diff --git a/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu index 46a2a0a065652..62c09235f09d8 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_rope_kernel.cu @@ -32,6 +32,7 @@ void FusedRopeKernel(const Context& dev_ctx, const paddle::optional& cos, const paddle::optional& position_ids, bool use_neox_rotary_style, + bool time_major, DenseTensor* out_q, DenseTensor* out_k, DenseTensor* out_v) { @@ -41,9 +42,10 @@ void FusedRopeKernel(const Context& dev_ctx, phi::Array inputs_num_heads; - // q.shape: [batch_size, seq_len, num_heads, head_dim] - auto batch_size = q.dims()[0]; - auto seq_len = q.dims()[1]; + // q.shape: [seq_len, batch_size, num_heads, head_dim] if time_major else + // [batch_size, seq_len, num_heads, head_dim] + auto batch_size = time_major ? q.dims()[1] : q.dims()[0]; + auto seq_len = time_major ? q.dims()[0] : q.dims()[1]; inputs_num_heads[0] = q.dims()[2]; auto head_dim = q.dims()[3]; @@ -187,6 +189,8 @@ void FusedRopeKernel(const Context& dev_ctx, : VectorizedFusedRopeWithRotateHalfKernel; if (is_same_num_heads) { + int64_t batch_stride = time_major ? q.strides()[1] : q.strides()[0]; + int64_t seq_stride = time_major ? q.strides()[0] : q.strides()[1]; kernel_func<<>>(ins_data, sin_cos_data, position_ids_data, @@ -196,10 +200,11 @@ void FusedRopeKernel(const Context& dev_ctx, seq_len, inputs_num_heads[0], head_dim, + batch_stride, + seq_stride, outs_data, num_inputs, div_c); - } else { // Multi Query Attention (MQA) or Group Query Attention (GQA) PADDLE_ENFORCE_EQ( @@ -226,6 +231,9 @@ void FusedRopeKernel(const Context& dev_ctx, inputs_num_heads[2])); } // rotary position embedding Q + int64_t batch_stride_q = time_major ? q.strides()[1] : q.strides()[0]; + int64_t seq_stride_q = time_major ? q.strides()[0] : q.strides()[1]; + kernel_func<<>>(ins_data, sin_cos_data, position_ids_data, @@ -235,6 +243,8 @@ void FusedRopeKernel(const Context& dev_ctx, seq_len, inputs_num_heads[0], head_dim, + batch_stride_q, + seq_stride_q, outs_data, 1, div_c); @@ -242,6 +252,13 @@ void FusedRopeKernel(const Context& dev_ctx, // rotary position embedding K,V phi::Array input_kv{ins_data[1], ins_data[2], nullptr}; phi::Array out_kv{outs_data[1], outs_data[2], nullptr}; + int64_t batch_stride_kv = time_major + ? inputs_num_heads[1] * head_dim + : seq_len * inputs_num_heads[1] * head_dim; + int64_t seq_stride_kv = time_major + ? 
batch_size * inputs_num_heads[1] * head_dim + : inputs_num_heads[1] * head_dim; + kernel_func<<>>(input_kv, sin_cos_data, position_ids_data, @@ -251,6 +268,8 @@ void FusedRopeKernel(const Context& dev_ctx, seq_len, inputs_num_heads[1], head_dim, + batch_stride_kv, + seq_stride_kv, out_kv, num_inputs - 1, div_c); diff --git a/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h b/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h index 44f68f42e6581..34dab8dab7d0d 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h +++ b/paddle/phi/kernels/fusion/gpu/fused_rope_utils.h @@ -30,6 +30,8 @@ using VectorizedFusedRopeCudaKernelFunc = int64_t seq_len, int64_t num_heads, int64_t head_dim, + int64_t batch_stride, + int64_t seq_stride, phi::Array outs_data, int num_inputs, MPType div_c); @@ -39,9 +41,12 @@ __device__ void VectorizedGetSinCos(phi::Array sin_cos_data, const int64_t* position_ids_data, bool flag_sin_cos, int64_t index, + int64_t batch_size, int64_t seq_len, int64_t num_heads, int64_t head_dim, + int64_t batch_stride, + int64_t seq_stride, MPType* out_sin, MPType* out_cos, MPType div_c) { @@ -51,17 +56,16 @@ __device__ void VectorizedGetSinCos(phi::Array sin_cos_data, if (flag_sin_cos) { #pragma unroll for (int64_t nx = 0; nx < VecSize; ++nx) { - int64_t index_wc = (index + nx) % (seq_len * num_heads * head_dim); - int64_t pos_seq_ori = index_wc / (num_heads * head_dim); + int64_t pos_seq_ori = (index + nx) / seq_stride % seq_len; int64_t pos_seq; if (position_ids_data) { - int64_t pos_bs = (index + nx) / (seq_len * num_heads * head_dim); + int64_t pos_bs = (index + nx) / batch_stride % batch_size; int64_t index_ids = pos_bs * seq_len + pos_seq_ori; pos_seq = position_ids_data[index_ids]; } else { pos_seq = pos_seq_ori; } - int64_t pos_head = index_wc % head_dim; + int64_t pos_head = (index + nx) % head_dim; int64_t index_sc = pos_seq * head_dim + pos_head; const T* sin_input = sin_cos_data[0] + index_sc; const T* cos_input = sin_cos_data[1] + index_sc; @@ -73,9 +77,9 @@ __device__ void VectorizedGetSinCos(phi::Array sin_cos_data, #pragma unroll for (int nx = 0; nx < VecSize; ++nx) { // get sin_index and cos_index - int64_t index_wc = (index + nx) % (seq_len * num_heads * head_dim); - int64_t pos_seq = index_wc / (num_heads * head_dim); - MPType idx = static_cast((index_wc % head_dim) / 2 * 2.0); + int64_t pos_seq = (index + nx) / seq_stride % seq_len; + + MPType idx = static_cast(((index + nx) % head_dim) / 2 * 2.0); MPType indicses = static_cast(1) / pow(static_cast(10000), idx * static_cast(div_c)); @@ -97,6 +101,8 @@ __global__ void VectorizedFusedRopeWithRotateEveryTwoKernel( int64_t seq_len, int64_t num_heads, int64_t head_dim, + int64_t batch_stride, + int64_t seq_stride, phi::Array outs_data, int num_inputs, MPType div_c) { @@ -119,9 +125,12 @@ __global__ void VectorizedFusedRopeWithRotateEveryTwoKernel( position_ids_data, flag_sin_cos, index, + batch_size, seq_len, num_heads, head_dim, + batch_stride, + seq_stride, sin_value, cos_value, div_c); @@ -172,6 +181,8 @@ __global__ void VectorizedFusedRopeWithRotateHalfKernel( int64_t seq_len, int64_t num_heads, int64_t head_dim, + int64_t batch_stride, + int64_t seq_stride, phi::Array outs_data, int num_inputs, MPType div_c) { @@ -194,9 +205,12 @@ __global__ void VectorizedFusedRopeWithRotateHalfKernel( position_ids_data, flag_sin_cos, index, + batch_size, seq_len, num_heads, head_dim, + batch_stride, + seq_stride, sin_value, cos_value, div_c); diff --git a/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py 
b/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py
index 78996a34eeccd..59984b9a68e69 100644
--- a/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py
+++ b/python/paddle/incubate/nn/functional/fused_rotary_position_embedding.py
@@ -26,18 +26,20 @@ def fused_rotary_position_embedding(
     cos=None,
     position_ids=None,
     use_neox_rotary_style=True,
+    time_major=False,
 ):
     r"""
     Fused rotary position embedding.
 
     Args:
-        q (Tensor): The input tensor. The data type is bfloat16, float16, float32 or float64. The shape of q must be [batch_size, seq_len, num_heads, head_dim] and head_dim must be a multiple of 2.
-        k (Tensor, optional): The input tensor. The data type is bfloat16, float16, float32 or float64. The shape of k must be [batch_size, seq_len, num_heads, head_dim] and head_dim must be a multiple of 2.
-        v (Tensor, optional): The input tensor. The data type is bfloat16, float16, float32 or float64. The shape of v must be [batch_size, seq_len, num_heads, head_dim] and head_dim must be a multiple of 2.
+        q (Tensor): The input tensor. The data type is bfloat16, float16, float32 or float64. The shape of q must be [batch_size, seq_len, num_heads, head_dim] or [seq_len, batch_size, num_heads, head_dim] and head_dim must be a multiple of 2.
+        k (Tensor, optional): The input tensor. The data type is bfloat16, float16, float32 or float64. The shape of k must be [batch_size, seq_len, num_heads, head_dim] or [seq_len, batch_size, num_heads, head_dim] and head_dim must be a multiple of 2.
+        v (Tensor, optional): The input tensor. The data type is bfloat16, float16, float32 or float64. The shape of v must be [batch_size, seq_len, num_heads, head_dim] or [seq_len, batch_size, num_heads, head_dim] and head_dim must be a multiple of 2.
         sin (Tensor, optional): The input tensor. The data type is bfloat16, float16, float32 or float64. The shape of sin must be [seq_len, head_dim] or [1, seq_len, 1, head_dim] and head_dim must be a multiple of 2.
         cos (Tensor, optional): The input tensor. The data type is bfloat16, float16, float32 or float64. The shape of cos must be [seq_len, head_dim] or [1, seq_len, 1, head_dim] and head_dim must be a multiple of 2.
         position_ids (Tensor, optional): The input tensor. The data type is int64. The shape of position_ids must be [batch_size, seq_len].
         use_neox_rotary_style(optional|bool): When the use_neox_rotary_style is True, every two adjacent numbers are calculated. When the use_neox_rotary_style is False, the numbers corresponding to the positions of the front half and back half segments are calculated. Default True.
+        time_major(optional|bool): Whether the first dimension of the q, k and v inputs is the time steps (the length of the input sequence). If time_major is True, the Tensor shape is [seq_len, batch_size, num_heads, head_dim]; otherwise it is [batch_size, seq_len, num_heads, head_dim]. Defaults to False.
 
     Returns:
         out_q/out_k/out_v Tensor representing the fused rotary position embedding, has same shape and data type as `q` .
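As a quick orientation for the new flag (an editorial sketch, not part of the patch): a minimal usage example of the time-major path. The shapes and dtype are illustrative assumptions, and a GPU build of Paddle that includes this fused kernel is assumed.

```python
import paddle
from paddle.incubate.nn.functional import fused_rotary_position_embedding

# time_major=True expects [seq_len, batch_size, num_heads, head_dim];
# head_dim (16 here) must be a multiple of 2.
q = paddle.randn([8, 2, 2, 16], dtype="float16")
k = paddle.randn([8, 2, 2, 16], dtype="float16")

out_q, out_k, _ = fused_rotary_position_embedding(
    q, k, use_neox_rotary_style=False, time_major=True
)
```

Compared with transposing to batch-major before the call, the flag lets time-major pipelines skip two transposes around the fused op.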
@@ -89,7 +91,7 @@ def fused_rotary_position_embedding(
     """
     if in_dynamic_or_pir_mode():
         return _C_ops.fused_rotary_position_embedding(
-            q, k, v, sin, cos, position_ids, use_neox_rotary_style
+            q, k, v, sin, cos, position_ids, use_neox_rotary_style, time_major
         )
 
     helper = LayerHelper('fused_rotary_position_embedding', **locals())
@@ -120,6 +122,7 @@ def fused_rotary_position_embedding(
         outputs=outputs,
         attrs={
             'use_neox_rotary_style': use_neox_rotary_style,
+            'time_major': time_major,
         },
     )
diff --git a/test/auto_parallel/semi_auto_parallel_for_fused_rope.py b/test/auto_parallel/semi_auto_parallel_for_fused_rope.py
index 23e5db193e38f..397399dd5d799 100644
--- a/test/auto_parallel/semi_auto_parallel_for_fused_rope.py
+++ b/test/auto_parallel/semi_auto_parallel_for_fused_rope.py
@@ -72,6 +72,31 @@ def test_only_q_input(self):
         out_q.backward()
         self.check_tensor_eq(dist_q.grad, q.grad)
 
+    def test_only_q_input_time_major(self):
+        paddle.seed(self._seed)
+        np.random.seed(self._seed)
+        # [seq_len, bs, num_heads, head_dim]
+        qkv_shape = [self._seq_len, self._bs, self._num_heads, self._head_dim]
+        q = paddle.randn(qkv_shape, self._dtype)
+        q.stop_gradient = False
+
+        dist_q = dist.shard_tensor(q, self._mesh, dist.Shard(0))
+        dist_q.stop_gradient = False
+
+        dist_out_q, _, _ = fused_rotary_position_embedding(
+            q=dist_q, use_neox_rotary_style=False, time_major=True
+        )
+        out_q, _, _ = fused_rotary_position_embedding(
+            q, use_neox_rotary_style=False, time_major=True
+        )
+        self.check_tensor_eq(out_q, dist_out_q)
+        # NOTE: fused_rope does not support sharding on seq_len yet, so reshard to dist.Replicate
+        self.check_placements(dist_out_q, [dist.Replicate()])
+
+        dist_out_q.backward()
+        out_q.backward()
+        self.check_tensor_eq(dist_q.grad, q.grad)
+
     def test_common_case(self):
         paddle.seed(self._seed)
         np.random.seed(self._seed)
@@ -133,6 +158,71 @@ def test_common_case(self):
         self.check_tensor_eq(dist_q.grad, q.grad)
         self.check_tensor_eq(dist_k.grad, k.grad)
 
+    def test_common_case_time_major(self):
+        paddle.seed(self._seed)
+        np.random.seed(self._seed)
+        # [seq_len, bs, num_heads, head_dim]
+        qkv_shape = [self._seq_len, self._bs, self._num_heads, self._head_dim]
+        q = paddle.randn(qkv_shape, self._dtype)
+        q.stop_gradient = False
+
+        dist_q = dist.shard_tensor(q, self._mesh, dist.Shard(1))
+        dist_q.stop_gradient = False
+
+        k = paddle.randn(qkv_shape, self._dtype)
+        k.stop_gradient = False
+        dist_k = dist.shard_tensor(k, self._mesh, dist.Shard(2))
+        dist_k.stop_gradient = False
+
+        sin = paddle.randn(self._sin_cos_shape, self._dtype)
+        sin.stop_gradient = True
+        dist_sin = dist.shard_tensor(sin, self._mesh, dist.Replicate())
+        dist_sin.stop_gradient = True
+
+        cos = paddle.randn(self._sin_cos_shape, self._dtype)
+        cos.stop_gradient = True
+        dist_cos = dist.shard_tensor(cos, self._mesh, dist.Replicate())
+        dist_cos.stop_gradient = True
+
+        position_ids = paddle.arange(self._seq_len, dtype="int64").expand(
+            (self._bs, self._seq_len)
+        )
+        position_ids.stop_gradient = True
+        dist_position_ids = dist.shard_tensor(
+            position_ids, self._mesh, dist.Shard(0)
+        )
+        dist_position_ids.stop_gradient = True
+
+        dist_out_q, dist_out_k, _ = fused_rotary_position_embedding(
+            q=dist_q,
+            k=dist_k,
+            sin=dist_sin,
+            cos=dist_cos,
+            position_ids=dist_position_ids,
+            use_neox_rotary_style=False,
+            time_major=True,
+        )
+        out_q, out_k, _ = fused_rotary_position_embedding(
+            q=q,
+            k=k,
+            sin=sin,
+            cos=cos,
+            position_ids=position_ids,
+            use_neox_rotary_style=False,
+            time_major=True,
+        )
+
+        self.check_tensor_eq(out_q,
dist_out_q) + self.check_tensor_eq(out_k, dist_out_k) + + dist_out = dist_out_q + dist_out_k + out = out_q + out_k + dist_out.backward() + out.backward() + + self.check_tensor_eq(dist_q.grad, q.grad) + self.check_tensor_eq(dist_k.grad, k.grad) + def run_test_case(self): if self._backend == "gpu": paddle.set_device("gpu:" + str(dist.get_rank())) @@ -142,7 +232,9 @@ def run_test_case(self): ) self.test_only_q_input() + self.test_only_q_input_time_major() self.test_common_case() + self.test_common_case_time_major() if __name__ == '__main__': diff --git a/test/cpp/auto_parallel/spmd_rule_test.cc b/test/cpp/auto_parallel/spmd_rule_test.cc index ba9bb664c2fd4..25e99fb52575b 100644 --- a/test/cpp/auto_parallel/spmd_rule_test.cc +++ b/test/cpp/auto_parallel/spmd_rule_test.cc @@ -1213,7 +1213,7 @@ TEST(FusedRope, Ctor) { // 1.1 only q input phi::distributed::SpmdInfo forward_spmd_info = phi::distributed::FusedRopeInferSpmd( - q, none, none, none, none, none, false); + q, none, none, none, none, none, false, false); EXPECT_EQ(forward_spmd_info.first.size(), static_cast(6)); EXPECT_EQ(forward_spmd_info.second.size(), static_cast(3)); check_dim_mapping(forward_spmd_info.first[0], {0, -1, -1, -1}); @@ -1237,7 +1237,7 @@ TEST(FusedRope, Ctor) { phi::distributed::DistMetaTensor position_ids = build_input({16, 2048}, {0, 1}); forward_spmd_info = phi::distributed::FusedRopeInferSpmd( - q, k, none, sin, cos, position_ids, false); + q, k, none, sin, cos, position_ids, false, false); EXPECT_EQ(forward_spmd_info.first.size(), static_cast(6)); EXPECT_EQ(forward_spmd_info.second.size(), static_cast(3)); check_dim_mapping(forward_spmd_info.first[0], {0, -1, -1, -1}); @@ -1253,7 +1253,7 @@ TEST(FusedRope, Ctor) { check_partial_dims(forward_spmd_info.second[1], {}); // 2. test backward phi::distributed::SpmdInfo backward_spmd_info = - FusedRopeGradInferSpmd(sin, cos, position_ids, q, k, none, false); + FusedRopeGradInferSpmd(sin, cos, position_ids, q, k, none, false, false); EXPECT_EQ(backward_spmd_info.first.size(), static_cast(6)); EXPECT_EQ(backward_spmd_info.second.size(), static_cast(3)); check_dim_mapping(backward_spmd_info.first[0], {-1, -1, -1, -1}); @@ -1274,7 +1274,7 @@ TEST(FusedRope, Ctor) { phi::distributed::DistMetaTensor out_k = build_input({16, 2048, 64, 128}, {-1, 1, -1, 0}); phi::distributed::SpmdInfo reverse_spmd_info = FusedRopeInferSpmdReverse( - q, k, none, sin, cos, position_ids, out_q, out_k, none, false); + q, k, none, sin, cos, position_ids, out_q, out_k, none, false, false); EXPECT_EQ(reverse_spmd_info.first.size(), static_cast(6)); EXPECT_EQ(reverse_spmd_info.second.size(), static_cast(3)); check_dim_mapping(reverse_spmd_info.first[0], {0, -1, -1, -1}); diff --git a/test/legacy_test/test_fused_rotary_position_embedding.py b/test/legacy_test/test_fused_rotary_position_embedding.py index c63c9d8bc1843..cc0afe5202fd1 100644 --- a/test/legacy_test/test_fused_rotary_position_embedding.py +++ b/test/legacy_test/test_fused_rotary_position_embedding.py @@ -109,6 +109,7 @@ def paddle_fused_rotary_position_embedding( cos_tensor=None, position_ids=None, use_neox_rotary_style=True, + **kwargs ): # permute q, k, v from [batch_size, seq_len, num_heads, head_dim] # to [batch_size, num_heads, seq_len, head_dim] @@ -155,9 +156,9 @@ def paddle_fused_rotary_position_embedding( [ ( "qkv_input", - [2, 8, 2, 16], - [2, 8, 2, 16], - [2, 8, 2, 16], + [2, 8, 2, 16], # bs, seq_len, num_heads, head_dim + [2, 8, 2, 16], # bs, seq_len, num_heads, head_dim + [2, 8, 2, 16], # bs, seq_len, num_heads, head_dim 
position_ids_list, ), ("qk_input", [2, 8, 2, 16], [2, 8, 2, 16], None, position_ids_list), @@ -232,6 +233,7 @@ def get_forward_backward( with_sin_cos=True, use_neox_rotary_style=True, position_ids=None, + test_time_major=False, ): paddle.disable_static() fw = [] @@ -241,6 +243,15 @@ def get_forward_backward( seed, with_sin_cos ) + if test_time_major: + # [batch_size, seq_len, num_heads, head_dim] -> [seq_len, batch_size, num_heads, head_dim] + if tensor_q is not None: + tensor_q = paddle.transpose(tensor_q, perm=[1, 0]) + if tensor_k is not None: + tensor_k = paddle.transpose(tensor_k, perm=[1, 0]) + if tensor_v is not None: + tensor_v = paddle.transpose(tensor_v, perm=[1, 0]) + out_q, out_k, out_v = rope_function( tensor_q, tensor_k, @@ -249,6 +260,7 @@ def get_forward_backward( tensor_cos, position_ids=position_ids, use_neox_rotary_style=use_neox_rotary_style, + time_major=test_time_major, ) out_init_grad = [] @@ -262,26 +274,42 @@ def get_forward_backward( bw = list( filter(lambda x: x is not None, [tensor_q, tensor_k, tensor_v]) ) + + if test_time_major: + # transpose back + # [seq_len, batch_size, num_heads, head_dim] -> [batch_size, seq_len, num_heads, head_dim] + fw = [paddle.transpose(x, perm=[1, 0]) for x in fw] + bw = [paddle.transpose(x, perm=[1, 0]) for x in bw] + return fw, bw + def check_results(self, p_results, f_results): + for i in range(len(p_results)): + np.testing.assert_allclose( + p_results[i].numpy(), + f_results[i].numpy(), + rtol=self.rtol, + ) + def test_fused_rope(self): p_fw, p_bw = self.get_forward_backward( paddle_fused_rotary_position_embedding, seed=self.seed ) f_fw, f_bw = self.get_forward_backward( - fused_rotary_position_embedding, seed=self.seed + fused_rotary_position_embedding, + seed=self.seed, + test_time_major=False, ) - for i in range(len(p_fw)): - np.testing.assert_allclose( - p_fw[i].numpy(), - f_fw[i].numpy(), - rtol=self.rtol, - ) - np.testing.assert_allclose( - p_bw[i].numpy(), - f_bw[i].numpy(), - rtol=self.rtol, - ) + f_fw_time_major, f_bw_time_major = self.get_forward_backward( + fused_rotary_position_embedding, + seed=self.seed, + test_time_major=True, + ) + + self.check_results(p_fw, f_fw) + self.check_results(p_bw, f_bw) + self.check_results(p_fw, f_fw_time_major) + self.check_results(p_bw, f_bw_time_major) def test_fused_rope_with_sin_cos(self): p_fw, p_bw = self.get_forward_backward( @@ -293,18 +321,19 @@ def test_fused_rope_with_sin_cos(self): fused_rotary_position_embedding, seed=self.seed, with_sin_cos=True, + test_time_major=False, ) - for i in range(len(p_fw)): - np.testing.assert_allclose( - p_fw[i].numpy(), - f_fw[i].numpy(), - rtol=self.rtol, - ) - np.testing.assert_allclose( - p_bw[i].numpy(), - f_bw[i].numpy(), - rtol=self.rtol, - ) + f_fw_time_major, f_bw_time_major = self.get_forward_backward( + fused_rotary_position_embedding, + seed=self.seed, + with_sin_cos=True, + test_time_major=True, + ) + + self.check_results(p_fw, f_fw) + self.check_results(p_bw, f_bw) + self.check_results(p_fw, f_fw_time_major) + self.check_results(p_bw, f_bw_time_major) def test_fused_rope_rotate_half(self): p_fw, p_bw = self.get_forward_backward( @@ -316,18 +345,19 @@ def test_fused_rope_rotate_half(self): fused_rotary_position_embedding, seed=self.seed, use_neox_rotary_style=False, + test_time_major=False, + ) + f_fw_time_major, f_bw_time_major = self.get_forward_backward( + fused_rotary_position_embedding, + seed=self.seed, + use_neox_rotary_style=False, + test_time_major=True, ) - for i in range(len(p_fw)): - np.testing.assert_allclose( - 
p_fw[i].numpy(), - f_fw[i].numpy(), - rtol=self.rtol, - ) - np.testing.assert_allclose( - p_bw[i].numpy(), - f_bw[i].numpy(), - rtol=self.rtol, - ) + + self.check_results(p_fw, f_fw) + self.check_results(p_bw, f_bw) + self.check_results(p_fw, f_fw_time_major) + self.check_results(p_bw, f_bw_time_major) def test_fused_rope_position_ids(self): position_ids = paddle.to_tensor(self.position_ids_list) @@ -340,18 +370,19 @@ def test_fused_rope_position_ids(self): fused_rotary_position_embedding, seed=self.seed, position_ids=position_ids, + test_time_major=False, ) - for i in range(len(p_fw)): - np.testing.assert_allclose( - p_fw[i].numpy(), - f_fw[i].numpy(), - rtol=self.rtol, - ) - np.testing.assert_allclose( - p_bw[i].numpy(), - f_bw[i].numpy(), - rtol=self.rtol, - ) + f_fw_time_major, f_bw_time_major = self.get_forward_backward( + fused_rotary_position_embedding, + seed=self.seed, + position_ids=position_ids, + test_time_major=True, + ) + + self.check_results(p_fw, f_fw) + self.check_results(p_bw, f_bw) + self.check_results(p_fw, f_fw_time_major) + self.check_results(p_bw, f_bw_time_major) @test_with_pir_api def test_static(self): @@ -448,6 +479,120 @@ def test_static(self): ) paddle.disable_static() + @test_with_pir_api + def test_static_time_major(self): + paddle.disable_static() + tensor_q, tensor_k, tensor_v, tensor_sin, tensor_cos = self.get_inputs( + self.seed, True + ) + p_fw, p_bw = self.get_forward_backward( + paddle_fused_rotary_position_embedding, + seed=self.seed, + use_neox_rotary_style=False, + test_time_major=False, + ) + + paddle.enable_static() + + shape_q = ( + [self.shape_q[1], self.shape_q[0], self.shape_q[2], self.shape_q[3]] + if self.shape_q + else None + ) + shape_k = ( + [self.shape_k[1], self.shape_k[0], self.shape_k[2], self.shape_k[3]] + if self.shape_k + else None + ) + shape_v = ( + [self.shape_v[1], self.shape_v[0], self.shape_v[2], self.shape_v[3]] + if self.shape_v + else None + ) + + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + q = ( + None + if shape_q is None + else paddle.static.data( + name="q", shape=shape_q, dtype=self.dtype + ) + ) + + k = ( + None + if shape_k is None + else paddle.static.data( + name="k", shape=shape_k, dtype=self.dtype + ) + ) + + v = ( + None + if shape_v is None + else paddle.static.data( + name="v", shape=shape_v, dtype=self.dtype + ) + ) + + sin = paddle.static.data( + name="sin", + shape=(1, shape_q[0], 1, shape_q[3]), + dtype=self.dtype, + ) + cos = paddle.static.data( + name="cos", + shape=(1, shape_q[0], 1, shape_q[3]), + dtype=self.dtype, + ) + + out_q, out_k, out_v = fused_rotary_position_embedding( + q, + k, + v, + sin, + cos, + position_ids=None, + use_neox_rotary_style=False, + time_major=True, + ) + + exe = paddle.static.Executor() + + feed = { + 'sin': tensor_sin.numpy(), + 'cos': tensor_cos.numpy(), + } + for var_name, input_tensor in zip( + ['q', 'k', 'v'], [tensor_q, tensor_k, tensor_v] + ): + if input_tensor is not None: + feed[var_name] = input_tensor.numpy().transpose((1, 0, 2, 3)) + + fetch_list = [] + for x, out in zip([q, k, v], [out_q, out_k, out_v]): + # The reason why fetch `out` based on `x` is that + # if input is None, the output of static function might be not NoneType + # but pir.Value with type pd_op.tensor<0xf32> in pir mode. 
+ if x is not None: + fetch_list.append(out) + + outs = exe.run( + main, + feed=feed, + fetch_list=fetch_list, + ) + + for i in range(len(p_fw)): + np.testing.assert_allclose( + p_fw[i].numpy(), + outs[i].transpose((1, 0, 2, 3)), + rtol=self.rtol, + ) + paddle.disable_static() + if __name__ == '__main__': unittest.main() From 957b1dd229ce35f0d38fbc3d1fe5b994c85f536c Mon Sep 17 00:00:00 2001 From: Lu Qi <61354321+MarioLulab@users.noreply.github.com> Date: Thu, 22 Feb 2024 20:55:43 +0800 Subject: [PATCH 03/82] fix (#61906) --- python/paddle/distributed/fleet/base/topology.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index f0e4ff20c2973..3b5a590ae32e2 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -191,12 +191,13 @@ def __init__(self, topology): assert ( self._check_valid_topo() - ), "mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}, sep_num: {}".format( + ), "nranks: {}, mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}, sep_num: {}".format( self.nranks, self._mp_degree, self._sharding_degree, self._pp_degree, self._dp_degree, + self._sep_degree, ) # create comm group for pipe parallel From 60902c7737a1be556f73fe03a6181650994f5db4 Mon Sep 17 00:00:00 2001 From: kevin Date: Thu, 22 Feb 2024 21:12:37 +0800 Subject: [PATCH 04/82] [Prim][PIR] add index_sample op forward prim (#61825) * add index_sample decomp * index_sample support dynamic shape * update code * update code --- .../decomp_interface_gen_op_list.py | 2 + .../pir/dialect/op_generator/op_build_gen.py | 1 + .../manual/manual_eager_prim_backend.cc | 10 +++ .../backend/manual/manual_prim_backend.h | 7 ++ .../manual/manual_static_prim_backend.cc | 18 +++++ paddle/fluid/primitive/composite/composite.h | 25 +++++++ test/legacy_test/test_index_sample_op.py | 10 ++- .../test_prim_sub_graph_dynamic_shape.py | 71 +++++++++++++++++++ 8 files changed, 142 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py index dd99c6c7212e8..b40e8b4d3dea2 100644 --- a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py +++ b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py @@ -30,6 +30,7 @@ "gelu", "hardswish", "group_norm", + "index_sample", "index_select", "instance_norm", "layer_norm", @@ -61,6 +62,7 @@ "gelu", "hardswish", "group_norm", + "index_sample", "index_select", "instance_norm", "layer_norm", diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py index d9828f7752719..7b079605a2460 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py @@ -27,6 +27,7 @@ 'InterpolateInferMeta', 'DeformableConvInferMeta', 'MatrixNMSInferMeta', + 'IndexSampleInferMeta', } _PREPARE_DATA_WITH_VECTOR_INT64_MTTABLE_ATTRIBUTE = {'FrobeniusNormOp'} diff --git a/paddle/fluid/primitive/backend/manual/manual_eager_prim_backend.cc b/paddle/fluid/primitive/backend/manual/manual_eager_prim_backend.cc index ede51d43be1a6..0a71b3f8e47d4 100644 --- a/paddle/fluid/primitive/backend/manual/manual_eager_prim_backend.cc +++ b/paddle/fluid/primitive/backend/manual/manual_eager_prim_backend.cc @@ -35,6 +35,16 @@ Tensor full(const IntArray& shape, } } +template <> +Tensor 
arange_with_tensor(const Tensor& start, + const Tensor& end, + const Tensor& step, + DataType dtype, + Place place) { + VLOG(4) << "Eager Prim API arange_ad_func call"; + return ::arange_ad_func(start, end, step, dtype, place); +} + } // namespace backend } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/primitive/backend/manual/manual_prim_backend.h b/paddle/fluid/primitive/backend/manual/manual_prim_backend.h index fce33b08f0dff..faf22e5f9807c 100644 --- a/paddle/fluid/primitive/backend/manual/manual_prim_backend.h +++ b/paddle/fluid/primitive/backend/manual/manual_prim_backend.h @@ -41,6 +41,13 @@ Tensor reshape_with_tensor(const Tensor& x, const Tensor& shape); template Tensor expand_with_tensor(const Tensor& x, const Tensor& shape); +template +Tensor arange_with_tensor(const Tensor& start, + const Tensor& end, + const Tensor& step, + DataType dtype = DataType::FLOAT64, + Place place = CPUPlace()); + } // namespace backend } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/primitive/backend/manual/manual_static_prim_backend.cc b/paddle/fluid/primitive/backend/manual/manual_static_prim_backend.cc index acaa143ba811f..a79e929a6e5cc 100644 --- a/paddle/fluid/primitive/backend/manual/manual_static_prim_backend.cc +++ b/paddle/fluid/primitive/backend/manual/manual_static_prim_backend.cc @@ -68,6 +68,24 @@ Tensor expand_with_tensor(const Tensor& x, const Tensor& shape) { return out; } +template <> +Tensor arange_with_tensor(const Tensor& start, + const Tensor& end, + const Tensor& step, + DataType dtype, + Place place) { + pir::Value start_val = + std::static_pointer_cast(start.impl())->value(); + pir::Value end_val = + std::static_pointer_cast(end.impl())->value(); + pir::Value step_val = + std::static_pointer_cast(step.impl())->value(); + auto op_res = + paddle::dialect::arange(start_val, end_val, step_val, dtype, place); + Tensor out(std::make_shared(op_res)); + return out; +} + } // namespace backend } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index 3b81af8530c09..28983fa3cfd63 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -1001,6 +1001,31 @@ Tensor embedding_decomp(const Tensor& x, return res; } +template +Tensor index_sample_decomp(const Tensor& x, const Tensor& index) { + std::vector tmp_shape{-1, 1}; + auto index_dim = get_slice(shape(index), 0); + auto start = + backend::full_with_tensor(shape(index_dim), 0, index_dim.dtype()); + auto step = + backend::full_with_tensor(shape(index_dim), 1, index_dim.dtype()); + auto arange_tmp = reshape( + backend::arange_with_tensor(start, index_dim, step, index.dtype()), + tmp_shape); + + auto index_res = reshape( + backend::expand_with_tensor(arange_tmp, shape(index)), tmp_shape); + auto index_ = reshape(index, tmp_shape); + auto concat_res = concat({index_res, index_}, 1); + auto res = backend::reshape(gather_nd(x, concat_res), shape(index)); + + if (res.dtype() != x.dtype()) { + return cast(res, x.dtype()); + } else { + return res; + } +} + } // namespace details } // namespace primitive diff --git a/test/legacy_test/test_index_sample_op.py b/test/legacy_test/test_index_sample_op.py index 674c45627b02c..854bf7179b8cb 100755 --- a/test/legacy_test/test_index_sample_op.py +++ b/test/legacy_test/test_index_sample_op.py @@ -26,7 +26,9 @@ class TestIndexSampleOp(OpTest): def setUp(self): self.op_type = "index_sample" + 
self.prim_op_type = "comp" self.python_api = paddle.index_sample + self.public_python_api = paddle.index_sample self.config() xnp = np.random.random(self.x_shape).astype(self.x_type) if self.x_type == np.complex64 or self.x_type == np.complex128: @@ -47,7 +49,7 @@ def setUp(self): self.outputs = {'Out': out} def test_check_output(self): - self.check_output(check_pir=True) + self.check_output(check_pir=True, check_prim_pir=True) def test_check_grad(self): self.check_grad(['X'], 'Out', check_pir=True) @@ -158,7 +160,9 @@ def config(self): class TestIndexSampleBF16Op(OpTest): def setUp(self): self.op_type = "index_sample" + self.prim_op_type = "comp" self.python_api = paddle.index_sample + self.public_python_api = paddle.index_sample self.config() xnp = np.random.random(self.x_shape).astype(self.x_type) indexnp = np.random.randint( @@ -177,7 +181,9 @@ def setUp(self): self.place = core.CUDAPlace(0) def test_check_output(self): - self.check_output_with_place(self.place, check_pir=True) + self.check_output_with_place( + self.place, check_pir=True, check_prim_pir=True + ) def test_check_grad(self): self.check_grad_with_place(self.place, ['X'], 'Out', check_pir=True) diff --git a/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py b/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py index 0c0698ef8a311..6be76dd54af38 100644 --- a/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py +++ b/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py @@ -80,6 +80,10 @@ def tile_net2(x): return y +def index_sample_net(x, index): + return paddle.index_sample(x, index) + + class TestPrimOne(unittest.TestCase): def setUp(self): np.random.seed(2023) @@ -198,5 +202,72 @@ def setUp(self): self.enable_cinn = False +class TestPrimTwo(unittest.TestCase): + def setUp(self): + np.random.seed(2023) + self.shape_x = [300, 4096] + self.shape_y = [300, 2048] + self.dtype_x = "float32" + self.dtype_y = int + self.init_x_shape = [None, 4096] + self.init_y_shape = [None, 2048] + self.x = np.random.random(self.shape_x).astype(self.dtype_x) + self.y = np.random.random(self.shape_y).astype(self.dtype_y) + self.net = index_sample_net + self.necessary_ops = "pd_op.index_sample" + self.enable_cinn = False + + def base_net(self, flag=None): + x = paddle.to_tensor(self.x) + y = paddle.to_tensor(self.y) + if flag == "prim": + core._set_prim_all_enabled(True) + fn = apply_to_static( + self.net, + use_cinn=self.enable_cinn, + input_spec=[ + InputSpec(shape=self.init_x_shape, dtype=self.dtype_x), + InputSpec(shape=self.init_y_shape, dtype=self.dtype_y), + ], + ) + fn.eval() + else: + fn = self.net + res = fn(x, y) + + if flag == "prim": + ops = [ + op.name() + for op in fn.program_cache.last()[-1][-1] + .infer_program.program.global_block() + .ops + ] + assert self.necessary_ops not in ops + core._set_prim_all_enabled(False) + return res + + def test_prim_all_dynamic(self): + res_ref = self.base_net() + res = self.base_net("prim") + for ref, actual in zip(res_ref, res): + np.testing.assert_allclose(ref, actual, rtol=1e-6) + + +class TestPrimTwoIndexSample(TestPrimTwo): + def setUp(self): + np.random.seed(2023) + self.shape_x = [300, 4096] + self.shape_y = [300, 2048] + self.dtype_x = "float32" + self.dtype_y = int + self.init_x_shape = [None, 4096] + self.init_y_shape = [300, 2048] + self.x = np.random.random(self.shape_x).astype(self.dtype_x) + self.y = np.random.random(self.shape_y).astype(self.dtype_y) + self.net = index_sample_net + self.necessary_ops = "pd_op.index_sample" + self.enable_cinn = False + + if __name__ == 
"__main__": unittest.main() From ba94e24d91e84af2983a381674628e2f37df7039 Mon Sep 17 00:00:00 2001 From: Kunbo Ding Date: Thu, 22 Feb 2024 21:49:23 +0800 Subject: [PATCH 05/82] fix dataloaer for toolkit (#61867) --- python/paddle/io/dataloader/dataloader_iter.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/io/dataloader/dataloader_iter.py b/python/paddle/io/dataloader/dataloader_iter.py index 101fbdb753881..aaa2eae2a7864 100644 --- a/python/paddle/io/dataloader/dataloader_iter.py +++ b/python/paddle/io/dataloader/dataloader_iter.py @@ -704,10 +704,11 @@ def _get_data(self): if len(failed_workers) > 0: self._exit_thread_unexpectedly() pids = ', '.join(str(w.pid) for w in failed_workers) - raise RuntimeError( - f"DataLoader {len(failed_workers)} workers exit unexpectedly, " - f"pids: {pids}" + logging.warning( + "DataLoader {} workers exit unexpectedly, " + "pids: {}".format(len(failed_workers), pids) ) + return # get(timeout) will call _poll(timeout) and may raise IOError if isinstance(e, (IOError, queue.Empty)): From d7c5cf5f1b0482bc4e8e7b93e553f72c97f824fe Mon Sep 17 00:00:00 2001 From: xiongkun Date: Fri, 23 Feb 2024 09:02:57 +0800 Subject: [PATCH 06/82] [Einsum] einsum support broadcast and ... (#61348) --- paddle/phi/infermeta/unary.cc | 6 +- paddle/phi/kernels/cpu/tile_grad_kernel.cc | 2 + paddle/phi/kernels/gpu/tile_grad_kernel.cu | 2 + paddle/phi/kernels/impl/einsum_grad_impl.h | 164 +++++------ paddle/phi/kernels/impl/einsum_impl.h | 299 +++++++-------------- python/paddle/tensor/einsum.py | 40 ++- test/legacy_test/test_einsum_op.py | 59 +++- test/xpu/test_einsum_op_xpu.py | 97 ++++--- 8 files changed, 356 insertions(+), 313 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 611b5239dccdf..5092072f5a87c 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1152,9 +1152,8 @@ void EinsumInferMeta(const std::vector& inputs, LabelMap labeltype(LabelType::Reduction); std::vector label2perms(inputs.size(), LabelMap(-1)); std::vector all_labels; - std::vector broadcast_dims; std::vector output_dims; - std::vector> ellipsis_dims(2); + std::vector> broadcast_shapes(2); std::vector input_dims; for (auto& i : inputs) { @@ -1168,8 +1167,7 @@ void EinsumInferMeta(const std::vector& inputs, &labeltype, &all_labels, &label2perms, - &ellipsis_dims, - &broadcast_dims, + &broadcast_shapes, &output_dims, &right, &input_strs); diff --git a/paddle/phi/kernels/cpu/tile_grad_kernel.cc b/paddle/phi/kernels/cpu/tile_grad_kernel.cc index 636ade93742da..ed6bc49ed8645 100644 --- a/paddle/phi/kernels/cpu/tile_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/tile_grad_kernel.cc @@ -26,4 +26,6 @@ PD_REGISTER_KERNEL(tile_grad, float, double, int, + phi::dtype::complex, + phi::dtype::complex, int64_t) {} diff --git a/paddle/phi/kernels/gpu/tile_grad_kernel.cu b/paddle/phi/kernels/gpu/tile_grad_kernel.cu index d1e356df401a8..7817d42d031bc 100644 --- a/paddle/phi/kernels/gpu/tile_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/tile_grad_kernel.cu @@ -28,4 +28,6 @@ PD_REGISTER_KERNEL(tile_grad, int, int64_t, phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex, phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index e9623d9f2caed..7652e5e8a9a99 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -18,6 +18,7 @@ #include "paddle/phi/core/dense_tensor.h" #include 
"paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/impl/einsum_impl.h" +#include "paddle/phi/kernels/tile_grad_kernel.h" #include "paddle/phi/kernels/tile_kernel.h" #include "paddle/utils/string/string_helper.h" @@ -27,38 +28,30 @@ template DenseTensor PerformTileAndReduction(const Context& dev_ctx, const LabelMap& label2type, const LabelMap& label2shape, - const std::vector& broadcast_dims, - const std::vector& ellipsis_dims, + const std::vector& broadcast_shape, + const std::vector x_shape, std::string equ, // value pass DenseTensor& t) { // NOLINT auto tmp_label = equ; - ReplaceEllipsis(tmp_label); auto tmp_union = unique_labels(tmp_label); auto op_label = std::string(tmp_union.begin(), tmp_union.end()); - VLOG(5) << "Start PerformTileAndReduction" << equ; + VLOG(5) << "Start PerformTileAndReduction equation " << equ + << " with operand shape: " + << paddle::string::join_strings(common::vectorize(t.dims()), + ","); DenseTensor ret; std::vector repeat_times; std::vector resize_dims; std::vector recover_shape; for (int c : op_label) { if (label2type[c] == LabelType::Reduction) { - // '.' can't be Reduction, so we don't deal '.' here. repeat_times.push_back(label2shape[c]); resize_dims.push_back(1); recover_shape.push_back(label2shape[c]); } else { - if (c != '.') { - resize_dims.push_back(label2shape[c]); - repeat_times.push_back(1); - recover_shape.push_back(label2shape[c]); - } else { - int n_dims = broadcast_dims.size(); - resize_dims.insert( - resize_dims.end(), broadcast_dims.begin(), broadcast_dims.end()); - recover_shape.insert( - recover_shape.end(), ellipsis_dims.begin(), ellipsis_dims.end()); - while (n_dims--) repeat_times.push_back(1); - } + resize_dims.push_back(label2shape[c]); + repeat_times.push_back(1); + recover_shape.push_back(label2shape[c]); } } t.Resize(common::make_ddim(resize_dims)); @@ -72,40 +65,47 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, << paddle::string::join_strings(repeat_times, ","); TileKernel(dev_ctx, t, repeat_times, &after_tile); } - size_t n_ellipsis_idx = op_label.find(".", 0); - if (n_ellipsis_idx != std::string::npos) { - // may be we need reduce. broadcast_dims is not equal to ellipsis dims. - std::vector to_reduce; - for (size_t i = 0; i < broadcast_dims.size() - ellipsis_dims.size(); ++i) - to_reduce.push_back(i + n_ellipsis_idx); - - int new_offset = - n_ellipsis_idx + broadcast_dims.size() - ellipsis_dims.size(); - for (size_t i = 0; i < ellipsis_dims.size(); ++i) - if (ellipsis_dims[i] == 1) to_reduce.push_back(i + new_offset); - - VLOG(5) << "PerformTileAndReduction: reduce sum axis: " - << paddle::string::join_strings(to_reduce, ","); - if (to_reduce.size() != 0) { - ret = Sum(dev_ctx, - after_tile, - phi::IntArray(to_reduce), - after_tile.dtype(), - false); // not keep dim. - } else { - ret = after_tile; - } - } else { - ret = after_tile; - } - VLOG(5) << "PerformTileAndReduction: recover shape: " + ret = after_tile; + VLOG(5) << "PermformTileAndReduction: recover shape: " << paddle::string::join_strings(recover_shape, ","); ret.Resize(common::make_ddim(recover_shape)); + // undiagonalize by einsum equation. only contain undiagonal operations. 
-  DenseTensor out;
-  VLOG(5) << "Undiagonal by einsum with args: " << op_label + "->" + equ;
-  EinsumInferKernel(dev_ctx, {&ret}, op_label + "->" + equ, &out);
-  return out;
+  DenseTensor undiagonal_out;
+  if (op_label != equ) {
+    VLOG(5) << "Undiagonal by einsum with args: " << op_label + "->" + equ;
+    EinsumInferKernel(
+        dev_ctx, {&ret}, op_label + "->" + equ, &undiagonal_out);
+  } else {
+    undiagonal_out = ret;
+  }
+
+  // call TileGradKernel to reverse the broadcast operation.
+  VLOG(5) << "After diagonalize, we have tensor with shape: "
+          << paddle::string::join_strings(
+                 common::vectorize(undiagonal_out.dims()), ',');
+  repeat_times.clear();
+  for (size_t i = 0; i < x_shape.size(); ++i) {
+    VLOG(4) << "broadcast shape is " << broadcast_shape[i] << ", x_shape is "
+            << x_shape[i];
+    repeat_times.push_back(broadcast_shape[i] / x_shape[i]);
+  }
+  bool is_all_ones = std::all_of(
+      repeat_times.begin(), repeat_times.end(), [](int x) { return x == 1; });
+  if (is_all_ones) {
+    VLOG(4) << "don't need broadcast recovery, we just return undiagonal_out.";
+    return undiagonal_out;
+  }
+  DenseTensor tmp_x;
+  DenseTensor broadcast_out;
+  tmp_x.Resize(common::make_ddim(x_shape));
+  broadcast_out.Resize(common::make_ddim(x_shape));
+  TileGradKernel(
+      dev_ctx, tmp_x, undiagonal_out, repeat_times, &broadcast_out);
+  VLOG(5) << "After broadcast recovery, we have tensor with shape: "
+          << paddle::string::join_strings(
+                 common::vectorize(broadcast_out.dims()), ',');
+  return broadcast_out;
 }

 template
@@ -120,8 +120,7 @@ void EinsumGradKernel(const Context& dev_ctx,
   LabelMap labeltype(LabelType::Reduction);
   std::vector label2perms(x.size(), LabelMap(-1));
   std::vector all_labels;  // order: ABO, AO, BO, AB, Reduce
-  std::vector> ellipsis_dims(2);
-  std::vector broadcast_dims;
+  std::vector> broadcast_shapes(2);
   std::vector output_dims;
   std::vector input_dims;
@@ -136,12 +135,13 @@ void EinsumGradKernel(const Context& dev_ctx,
       &labeltype,
       &all_labels,
       &label2perms,
-      &ellipsis_dims,
-      &broadcast_dims,
+      &broadcast_shapes,
       &output_dims,
       &right,
       &input_strs);

+  VLOG(4) << "After grad parse einsum equation.";
+
   auto gather_labels_except_reduction = [&labeltype](std::string all) {
     std::string res("");
     for (auto c : all)
@@ -160,13 +160,17 @@ void EinsumGradKernel(const Context& dev_ctx,
     VLOG(5) << "new_equation is " << new_equation;
     EinsumInferKernel(
         dev_ctx, new_operands, new_equation, &before_tile);
-    *(x_grad[0]) = PerformTileAndReduction(dev_ctx,
-                                           labeltype,
-                                           labelshape,
-                                           broadcast_dims,
-                                           ellipsis_dims[0],
-                                           left,
-                                           before_tile);
+    *(x_grad[0]) = PerformTileAndReduction(
+        dev_ctx,
+        labeltype,
+        labelshape,
+        broadcast_shapes[0],
+        common::vectorize(x[0]->dims()),
+        left,
+        before_tile);
+#ifndef PADDLE_WITH_XPU  // xpu does not support conj now, so we just disable it.
+    *(x_grad[0]) = Conj(dev_ctx, *x_grad[0]);
+#endif
  } else {
    auto splits = paddle::string::split_string(equation, "->");
    auto left = splits[0];
@@ -179,7 +183,11 @@ void EinsumGradKernel(const Context& dev_ctx,
     auto operands_for_A = std::vector();
     auto operands_for_B = std::vector();
     DenseTensor dA, dB;
+#ifndef PADDLE_WITH_XPU  // xpu does not support conj now, so we just disable it.
auto out_grad_conj = Conj(dev_ctx, out_grad);
+#else
+    auto out_grad_conj = out_grad;
+#endif
     // dA = einsum(B, dC)
     operands_for_A.push_back(x[1]);
     operands_for_A.push_back(&out_grad_conj);
     // dB = einsum(dC, A)
     operands_for_B.push_back(&out_grad_conj);
     operands_for_B.push_back(x[0]);

-    DenseTensor before_tile;
-
     std::vector cache(3);  // set empty; TA, TB, TdC
     if (inner_cache.size() > 0) {
       // for compatibility, we can load and run v2.3 EinsumOp.
@@ -215,24 +221,32 @@
       // now.
       cache.clear();
     if (x_grad[0]) {
-      *(x_grad[0]) = PerformTileAndReduction(dev_ctx,
-                                             labeltype,
-                                             labelshape,
-                                             broadcast_dims,
-                                             ellipsis_dims[0],
-                                             ops[0],
-                                             dA);
+      *(x_grad[0]) = PerformTileAndReduction(
+          dev_ctx,
+          labeltype,
+          labelshape,
+          broadcast_shapes[0],
+          common::vectorize(x[0]->dims()),
+          ops[0],
+          dA);
+      VLOG(4) << "After call dA";
+#ifndef PADDLE_WITH_XPU  // xpu does not support conj now, so we just disable it.
       *(x_grad[0]) = Conj(dev_ctx, *x_grad[0]);
+#endif
     }
     if (x_grad[1]) {
-      *(x_grad[1]) = PerformTileAndReduction(dev_ctx,
-                                             labeltype,
-                                             labelshape,
-                                             broadcast_dims,
-                                             ellipsis_dims[1],
-                                             ops[1],
-                                             dB);
+      *(x_grad[1]) = PerformTileAndReduction(
+          dev_ctx,
+          labeltype,
+          labelshape,
+          broadcast_shapes[1],
+          common::vectorize(x[1]->dims()),
+          ops[1],
+          dB);
+#ifndef PADDLE_WITH_XPU  // xpu does not support conj now, so we just disable it.
       *(x_grad[1]) = Conj(dev_ctx, *x_grad[1]);
+#endif
+      VLOG(4) << "After call dB";
     }
   }
 }
diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h
index 6065b1e37b075..89e61eb936cbe 100644
--- a/paddle/phi/kernels/impl/einsum_impl.h
+++ b/paddle/phi/kernels/impl/einsum_impl.h
@@ -24,6 +24,7 @@
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/matmul_kernel.h"
 #include "paddle/phi/kernels/reduce_sum_kernel.h"
+#include "paddle/phi/kernels/tile_kernel.h"
 #include "paddle/phi/kernels/transpose_kernel.h"
 #include "paddle/utils/string/string_helper.h"

@@ -46,11 +47,11 @@ inline static void ValidationCheck(const std::string& equation) {
   size_t pos;
   auto trimed_equ = equation;
   if ((pos = trimed_equ.find("->", 0)) != std::string::npos) {
-    trimed_equ.replace(pos, 2, ".");
+    trimed_equ.replace(pos, 2, "");
   }
   auto is_valid_char = [](char c) {
     if (c >= 'a' && c <= 'z') return true;
-    if (c == '.'
|| c == ',') return true; + if (c == ',') return true; return false; }; for (auto c : trimed_equ) { @@ -81,16 +82,14 @@ class LabelMap { public: explicit LabelMap(int default_value = 0) { this->default_value = default_value; - for (int i = 0; i < N; ++i) map[i] = default_value; + for (size_t i = 0; i < N; ++i) map[i] = default_value; } int& operator[](int label) { int i = label - 'a'; - if (label == '.') i = N - 1; return map[i]; } int operator[](int label) const { int i = label - 'a'; - if (label == '.') i = N - 1; return map[i]; } bool exist(char label) { return !is_default(label); } @@ -113,17 +112,6 @@ inline std::string label_to_string(const std::vector& all_labels, return str; } -inline static void ReplaceEllipsis(std::string& s) { // NOLINT - size_t pos; - if ((pos = s.find("...", 0)) != std::string::npos) { - s.replace(pos, 3, "."); - } - // remove all the space in the expression - while ((pos = s.find(" ", 0)) != std::string::npos) { - s.replace(pos, 1, ""); - } -} - template inline std::vector union_labels(const CharIterable1& a, const CharIterable2& b) { @@ -191,8 +179,6 @@ inline static void GlobalInfo(const std::vector& op_labels, if ((*label2type)[c] == LabelType::BO) (*label2type)[c] = LabelType::AO; } - (*label2type)['.'] = LabelType::Batch; - if (sorted_labels->size()) { std::set exist(all.begin(), all.end()); all.clear(); @@ -210,107 +196,61 @@ inline static void GlobalInfo(const std::vector& op_labels, LabelType::Contraction, LabelType::Reduction}); - if (counter[static_cast('.')] > 0) { - std::vector tmp; - tmp.push_back('.'); - // push '.' in the front - *sorted_labels = union_labels(tmp, *sorted_labels); - } VLOG(5) << "GlobalInfo: sorted_labels after: " << paddle::string::join_strings(*sorted_labels, ","); } -inline static void InferLabelShape(const std::vector& op_labels, - const std::vector& inputs, - LabelMap* labelshape, - std::vector>* ellipsis_dims, - std::vector* broadcast_dims) { +inline static void InferLabelShape( + const std::vector& op_labels, + const std::vector& inputs, + LabelMap* labelshape, + std::vector>* broadcast_shapes) { VLOG(5) << "Start InferLabelShape"; - int n_broadcast_dims = 0; - for (size_t i = 0; i < op_labels.size(); ++i) { - VLOG(5) << "oplabels: " << op_labels[i]; - int valid_indices = std::count_if(op_labels[i].begin(), - op_labels[i].end(), - [](char c) { return c != '.'; }); - int n_ellipsis = inputs[i].size() - valid_indices; - VLOG(5) << "valid indices and n_ellipsis: " << valid_indices << " " - << n_ellipsis; - ellipsis_dims->at(i).resize(n_ellipsis); - n_broadcast_dims = std::max(n_broadcast_dims, n_ellipsis); - } - VLOG(5) << "InferLabelShape: Broadcast ndims:" << n_broadcast_dims; - *broadcast_dims = std::vector(n_broadcast_dims, 1); - for (size_t i = 0; i < op_labels.size(); ++i) { auto& op_str = op_labels[i]; auto& op_dim = inputs[i]; int dim_ptr = 0; - for (int c : op_str) { - if (c == '.') { - for (auto& v : ellipsis_dims->at(i)) { - v = op_dim[dim_ptr]; - dim_ptr++; - } - } else if (!labelshape->exist(c) || (*labelshape)[c] == -1) { - (*labelshape)[c] = op_dim[dim_ptr]; - dim_ptr++; - } else if (op_dim[dim_ptr] != -1) { + for (auto& c : op_str) { + if (!labelshape->exist(c) || abs((*labelshape)[c]) == 1) { + (*labelshape)[c] = static_cast(op_dim[dim_ptr]); + } else if (abs(op_dim[dim_ptr]) != 1) { PADDLE_ENFORCE_EQ( (*labelshape)[c], op_dim[dim_ptr], phi::errors::InvalidArgument( "Same label have different shapes for label: `%c`", c)); - dim_ptr++; } + dim_ptr++; } } for (size_t i = 0; i < op_labels.size(); ++i) { - 
VLOG(5) << "InferLabelShape: Ellipsis ndims:" - << paddle::string::join_strings(ellipsis_dims->at(i), ","); - int idx = n_broadcast_dims - ellipsis_dims->at(i).size(); - for (auto v : ellipsis_dims->at(i)) { - PADDLE_ENFORCE_EQ( - v == 1 || broadcast_dims->at(idx) == 1 || - broadcast_dims->at(idx) == v, - true, - phi::errors::InvalidArgument( - "Ellipsis dims can't broadcasts. Please Check you operands.")); - broadcast_dims->at(idx) = std::max(v, broadcast_dims->at(idx)); - idx += 1; + for (auto& c : op_labels[i]) { + (*broadcast_shapes)[i].push_back((*labelshape)[c]); } } - VLOG(5) << "InferLabelShape: Broadcast dims:" - << paddle::string::join_strings(*broadcast_dims, ","); + for (size_t i = 0; i < op_labels.size(); ++i) { + VLOG(5) << "InferLabelShape: After broadcast shape is:" + << paddle::string::join_strings((*broadcast_shapes)[i], ","); + } } template inline static void InferLabelPerm(const CharIterable& op, - int n_broadcast, LabelMap* label2perm) { int cur = 0; for (int c : op) { if (!label2perm->exist( c)) // can appear repeatedly. we just record the first position. (*label2perm)[c] = cur; - if (c == '.') { - cur += n_broadcast; - } else { - cur += 1; - } + cur += 1; } } inline static void InferOutputDims(const std::string& right, - const std::vector& broadcast_dims, const LabelMap& labelshape, std::vector* output_dims) { for (int c : right) { - if (c == '.') { - output_dims->insert( - output_dims->end(), broadcast_dims.begin(), broadcast_dims.end()); - } else { - output_dims->push_back(labelshape[c]); - } + output_dims->push_back(labelshape[c]); } } // @@ -321,31 +261,26 @@ inline static void ParseEinsumEquation( LabelMap* labeltype, std::vector* all_labels, std::vector* label2perms, - std::vector>* ellipsis_dims, - std::vector* broadcast_dims, + std::vector>* broadcast_shapes, std::vector* output_dims, std::string* right, std::vector* input_strs) { VLOG(5) << "Start ParseEinsumEquation " << equation; auto results = paddle::string::split_string(equation, "->"); auto left = results[0]; - ReplaceEllipsis(left); *right = results[1]; - ReplaceEllipsis(*right); auto op_labels = paddle::string::split_string(left, ","); // split_string("i,") -> ["i", ""], we push back a "". // split_string("->") -> [], we push back a "". 
- if (op_labels.size() == 0) op_labels.push_back(""); - std::for_each(op_labels.begin(), op_labels.end(), ReplaceEllipsis); + if (op_labels.empty()) op_labels.emplace_back(""); GlobalInfo(op_labels, *right, labeltype, all_labels); - InferLabelShape(op_labels, inputs, labelshape, ellipsis_dims, broadcast_dims); + InferLabelShape(op_labels, inputs, labelshape, broadcast_shapes); VLOG(5) << "Einsum Infershape: right:" << *right; VLOG(5) << "Einsum Infershape: left :" << paddle::string::join_strings(op_labels, '\n'); - InferOutputDims(*right, *broadcast_dims, *labelshape, output_dims); + InferOutputDims(*right, *labelshape, output_dims); for (size_t i = 0; i < inputs.size(); ++i) { - InferLabelPerm( - op_labels[i], ellipsis_dims->at(i).size(), &((*label2perms)[i])); + InferLabelPerm(op_labels[i], &((*label2perms)[i])); (*input_strs).push_back(std::move(op_labels[i])); } } @@ -354,16 +289,11 @@ template std::vector GetLabelIndexByType(const std::vector& all_labels, const LabelMap& type, const LabelMap& perm, - const std::vector& ellipsis, LabelType filter) { std::vector res; for (T c : all_labels) { if ((filter == LabelType::ALL_TYPE || type[c] == filter) && perm[c] != -1) { - if (c == '.') { - for (size_t i = 0; i < ellipsis.size(); ++i) res.push_back(perm[c] + i); - } else { - res.push_back(perm[c]); - } + res.push_back(perm[c]); } } return res; @@ -374,17 +304,13 @@ std::vector GetShapeByType(const std::vector& all_labels, const LabelMap& type, const LabelMap& perm, const LabelMap& label2shape, - const std::vector& ellipsis, std::set filter) { std::vector res; for (T c : all_labels) { if ((filter.count(LabelType::ALL_TYPE) || filter.count(LabelType(type[c]))) && perm[c] != -1) { - if (c == '.') - res.insert(res.end(), ellipsis.begin(), ellipsis.end()); - else - res.push_back(label2shape[c]); + res.push_back(label2shape[c]); } } return res; @@ -443,29 +369,23 @@ DenseTensor Undiagonal(const Context& dev_ctx, template DenseTensor PerformUndiagonal(const Context& dev_ctx, const DenseTensor& tensor, - int n_broadcast, const std::string& equ) { // if the equ is 'iijjkij', then the tensor must be 'ijk', so we have enough // information to do un-diagonal with equ. auto res = tensor; LabelMap label2perm(-1); - InferLabelPerm(equ, n_broadcast, &label2perm); + InferLabelPerm(equ, &label2perm); // Un-Diagonal - int tot = - equ.size() + n_broadcast + (equ.find(".") != std::string::npos ? -1 : 0); + int tot = equ.size(); int cur = tot - 1; for (auto it = equ.rbegin(); it != equ.rend(); ++it) { char c = *it; - if (c == '.') { - cur -= n_broadcast; - } else { - if (cur != label2perm[c]) { - // do diagonal, followed by movedim(). - auto insert_pos = cur - tot + res.dims().size() + 1; - res = Undiagonal(dev_ctx, res, insert_pos, label2perm[c]); - } - --cur; + if (cur != label2perm[c]) { + // do diagonal, followed by movedim(). 
+ auto insert_pos = cur - tot + res.dims().size() + 1; + res = Undiagonal(dev_ctx, res, insert_pos, label2perm[c]); } + --cur; } return res; } @@ -476,37 +396,47 @@ DenseTensor PerformDiagonalAndReduction(const Context& dev_ctx, const std::string& equ, const LabelMap& label2perm, const std::vector& all_labels, - const std::vector& ellipsis, + const std::vector& broadcast_shape, const LabelMap& label2type) { auto res = tensor; + int tot = equ.size(); + // tiling tensor for broadcast + std::vector repeat_times; + auto tensor_origin_shape = common::vectorize(tensor.dims()); + for (size_t i = 0; i < tensor_origin_shape.size(); ++i) { + VLOG(4) << "broadcast shape is " << broadcast_shape[i] + << ", tensor shape is " << tensor_origin_shape[i]; + repeat_times.push_back(broadcast_shape[i] / tensor_origin_shape[i]); + } + DenseTensor after_tile; + bool is_all_ones = std::all_of( + repeat_times.begin(), repeat_times.end(), [](int x) { return x == 1; }); + if (!is_all_ones) { + TileKernel(dev_ctx, res, repeat_times, &after_tile); + res = after_tile; + } // Diagonal - int tot = equ.size() + ellipsis.size() + - (equ.find(".") != std::string::npos ? -1 : 0); int cur = tot - 1; for (auto it = equ.rbegin(); it != equ.rend(); ++it) { char c = *it; - if (c == '.') { - cur -= ellipsis.size(); - } else { - if (cur != label2perm[c]) { - // do diagonal, followed by movedim(). - VLOG(5) << "Do diagonal with shape=" - << paddle::string::join_strings( - common::vectorize(res.dims()), ',') - << ", axis1=" << cur << ", axis2=" << label2perm[c]; - res = Diagonal(dev_ctx, res, 0, cur, label2perm[c]); - res = Transpose( - dev_ctx, res, perm_moveto(res.dims().size(), -1, label2perm[c])); - } - --cur; + if (cur != label2perm[c]) { + // do diagonal, followed by movedim(). + VLOG(5) << "Do diagonal with shape=" + << paddle::string::join_strings( + common::vectorize(res.dims()), ',') + << ", axis1=" << cur << ", axis2=" << label2perm[c]; + res = Diagonal(dev_ctx, res, 0, cur, label2perm[c]); + res = Transpose( + dev_ctx, res, perm_moveto(res.dims().size(), -1, label2perm[c])); } + --cur; } // reduction auto indices = GetLabelIndexByType( - all_labels, label2type, label2perm, ellipsis, LabelType::Reduction); + all_labels, label2type, label2perm, LabelType::Reduction); VLOG(5) << "call PerformDiagonalAndReduction: with axis: " << paddle::string::join_strings(indices, ","); - if (indices.size() == 0) return res; + if (indices.empty()) return res; return Sum( dev_ctx, res, phi::IntArray(indices), res.dtype(), true); } @@ -523,10 +453,9 @@ DenseTensor PerformTranspose(const Context& dev_ctx, const DenseTensor& tensor, const LabelMap& label2perm, const std::vector& all_labels, - const std::vector& ellipsis, const LabelMap& label2type) { auto axis = GetLabelIndexByType( - all_labels, label2type, label2perm, ellipsis, LabelType::ALL_TYPE); + all_labels, label2type, label2perm, LabelType::ALL_TYPE); VLOG(5) << "PerformTranspose: " << paddle::string::join_strings(axis, ","); if (is_no_need_transpose(axis)) { return tensor; @@ -545,35 +474,24 @@ DenseTensor PerformContraction( const std::vector& all_labels, const LabelMap& label2type, const LabelMap& label2shape, - const std::vector>& ellipsis_dims, - const std::vector& broadcast_dims, + const std::vector>& broadcast_shapes, std::vector cache, bool use_cache) { - // Get All the Batches, so perm is auto all_valid = LabelMap(1); - auto recover_dim = GetShapeByType(all_labels, - label2type, - all_valid, - label2shape, - broadcast_dims, - {LabelType::Batch}); + auto recover_dim = 
GetShapeByType( + all_labels, label2type, all_valid, label2shape, {LabelType::Batch}); auto preprocess = [&](const DenseTensor& t, const LabelMap& perm, - const std::vector& ellipsis, + const std::vector& broadcast, int operand_idx) -> DenseTensor { // reshape auto frees = GetShapeByType(all_labels, label2type, perm, label2shape, - ellipsis, {LabelType::AO, LabelType::BO}); - auto conts = GetShapeByType(all_labels, - label2type, - perm, - label2shape, - ellipsis, - {LabelType::Contraction}); + auto conts = GetShapeByType( + all_labels, label2type, perm, label2shape, {LabelType::Contraction}); std::vector reordered_all_labels = all_labels; if (operand_idx == 1) { reordered_all_labels = TransformLabelsOrder(all_labels, @@ -597,19 +515,15 @@ DenseTensor PerformContraction( input_strs[operand_idx], perm, all_labels, - ellipsis, + broadcast_shapes[operand_idx], label2type); trans_t = PerformTranspose( - dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type); + dev_ctx, reduct_t, perm, reordered_all_labels, label2type); if (cache[operand_idx] != nullptr) cache[operand_idx]->ShareBufferWith(trans_t); } - auto mul_dims = GetShapeByType(all_labels, - label2type, - perm, - label2shape, - ellipsis, - {LabelType::Batch}); + auto mul_dims = GetShapeByType( + all_labels, label2type, perm, label2shape, {LabelType::Batch}); recover_dim.insert(recover_dim.end(), frees.begin(), frees.end()); if (operand_idx == 0) { mul_dims.push_back(std::accumulate( @@ -632,16 +546,16 @@ DenseTensor PerformContraction( DenseTensor after_contraction; if (operands.size() == 2) { auto trans_a = - preprocess(*(operands[0]), label2perm[0], ellipsis_dims[0], 0); + preprocess(*(operands[0]), label2perm[0], broadcast_shapes[0], 0); auto trans_b = - preprocess(*(operands[1]), label2perm[1], ellipsis_dims[1], 1); + preprocess(*(operands[1]), label2perm[1], broadcast_shapes[1], 1); after_contraction = Matmul(dev_ctx, trans_a, trans_b, false, false); } else if (operands.size() == 1) { after_contraction = - preprocess(*(operands[0]), label2perm[0], ellipsis_dims[0], 0); + preprocess(*(operands[0]), label2perm[0], broadcast_shapes[0], 0); } - if (recover_dim.size() == 0) recover_dim.push_back(1); + if (recover_dim.empty()) recover_dim.push_back(1); VLOG(5) << "PerformContraction: recover_dim: " << paddle::string::join_strings(recover_dim, ","); after_contraction.Resize(common::make_ddim(recover_dim)); @@ -652,31 +566,24 @@ template DenseTensor TransposeToOutput(const Context& dev_ctx, const DenseTensor& to_trans, const std::vector& right, - const std::vector& all_labels, - int n_broadcast_dims) { + const std::vector& all_labels) { std::vector axis; - int offset = 0; - if (std::find(all_labels.begin(), all_labels.end(), '.') != - all_labels.end()) { - offset = n_broadcast_dims - 1; - } for (char c : right) { - if (c == '.') { - for (int i = 0; i < n_broadcast_dims; ++i) axis.push_back(i); - } else { - auto it = std::find(all_labels.begin(), all_labels.end(), c); - PADDLE_ENFORCE_NE(it, - all_labels.end(), - phi::errors::InvalidArgument("Must in all_labels.")); - axis.push_back(it - all_labels.begin() + offset); - } + auto it = std::find(all_labels.begin(), all_labels.end(), c); + PADDLE_ENFORCE_NE(it, + all_labels.end(), + phi::errors::InvalidArgument("Must in all_labels.")); + axis.push_back(it - all_labels.begin()); } if (is_no_need_transpose(axis)) { return to_trans; } VLOG(5) << "call TransposeToOutput: with axis: " - << paddle::string::join_strings(axis, ","); - return Transpose(dev_ctx, to_trans, axis); + << 
paddle::string::join_strings(axis, ",") + << " to trans dims is: " << to_trans.dims(); + auto output = Transpose(dev_ctx, to_trans, axis); + VLOG(5) << "After Transpose."; + return output; } template @@ -687,15 +594,17 @@ void EinsumKernelImpl(const Context& dev_ctx, DenseTensor* out, std::vector cache, bool is_forward = true) { - VLOG(5) << "Start EinsumKernelImpl"; + VLOG(5) << "Start EinsumKernelImpl with inputs(" << inputs.size() << "): "; + for (auto& i : inputs) { + VLOG(5) << " inputs [ " << i << " ].shape=" << i->dims(); + } ValidationCheck(equation); // collect the following informations to prepare einsum. LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); std::vector label2perms(inputs.size(), LabelMap(-1)); std::vector all_labels; // order: ABO, AO, BO, AB, Reduce - std::vector> ellipsis_dims(2); - std::vector broadcast_dims; + std::vector> broadcast_shapes(2); std::vector output_dims; std::vector input_dims; @@ -713,8 +622,7 @@ void EinsumKernelImpl(const Context& dev_ctx, &labeltype, &all_labels, &label2perms, - &ellipsis_dims, - &broadcast_dims, + &broadcast_shapes, &output_dims, &right, &input_strs); @@ -730,17 +638,12 @@ void EinsumKernelImpl(const Context& dev_ctx, all_labels, labeltype, labelshape, - ellipsis_dims, - broadcast_dims, + broadcast_shapes, cache, !is_forward); - *out = TransposeToOutput(dev_ctx, - after_contraction, - unique_labels(right), - all_labels, - broadcast_dims.size()); - *out = PerformUndiagonal( - dev_ctx, *out, broadcast_dims.size(), right); + *out = TransposeToOutput( + dev_ctx, after_contraction, unique_labels(right), all_labels); + *out = PerformUndiagonal(dev_ctx, *out, right); out->Resize(common::make_ddim(output_dims)); } diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 3ebca4e85e3c6..01e2c2831ec85 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -698,6 +698,41 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): return plan +def replace_ellipsis(left_equation, rhs, *operands): + """ + we replace ... as unused variables to simplify the EinsumOp implementation. + """ + ellipsis_strings = None + max_ndim = 0 + new_operands = [] + unused_variables = {chr(c) for c in range(ord('a'), ord('z'))} + for equ, operand in zip(left_equation.split(','), operands): + ndims = len(operand.shape) - len(equ.replace("...", "")) + max_ndim = max(max_ndim, ndims) + for c in equ: + unused_variables.discard(c) + + for equ, operand in zip(left_equation.split(','), operands): + if '...' in equ: + start_unsqueeze_idx = equ.index('...') + to_squeeze_num = max_ndim - ( + len(operand.shape) - len(equ.replace("...", "")) + ) + operand = unsqueeze( + operand, + axis=[i + start_unsqueeze_idx for i in range(to_squeeze_num)], + ) + new_operands.append(operand) + + operands = new_operands + ellipsis_strings = ''.join(unused_variables.pop() for _ in range(max_ndim)) + + if ellipsis_strings is not None: + left_equation = left_equation.replace('...', ellipsis_strings) + rhs = rhs.replace('...', ellipsis_strings) + return left_equation, rhs, operands + + def preprocess(equation, *operands): """ check equation / raise error, default right labels generation @@ -727,7 +762,8 @@ def preprocess(equation, *operands): '...' in lhs and '...' not in rhs ), 'Invalid equation: missing ellipsis in output labels.' 
- return lhs, rhs, labels + lhs, rhs, operands = replace_ellipsis(lhs, rhs, *operands) + return lhs, rhs, labels, operands def parse_fake_shape(equation, operands, labels): @@ -806,7 +842,7 @@ def einsum_v2(equation, *operands): 3. V2 use opt_einsum.contract_path to optimize the multivariable einsum. """ n_op = len(operands) - lhs, rhs, labels = preprocess(equation, *operands) + lhs, rhs, labels, operands = preprocess(equation, *operands) if n_op <= 2: return gen_einsum_op(lhs + '->' + rhs, *operands) diff --git a/test/legacy_test/test_einsum_op.py b/test/legacy_test/test_einsum_op.py index 86b1cff7ea9ff..e41d1766c126e 100644 --- a/test/legacy_test/test_einsum_op.py +++ b/test/legacy_test/test_einsum_op.py @@ -120,6 +120,29 @@ def set_mandatory(self): self.equation = "ijk,kl->jl" +class TestEinsumAPI(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.set_mandatory() + + def test_api(self): + inputs = [] + for shape, ty in zip(self.shapes, self.types): + x = paddle.randn(shape).astype(ty) + x.stop_gradient = False + inputs.append(x) + output = paddle.einsum(self.equation, *inputs) + expect = np.einsum(self.equation, *[x.numpy() for x in inputs]) + np.testing.assert_allclose(output.numpy(), expect) + output = output.mean() + output.backward() + + def set_mandatory(self): + self.shapes = [(10,), (10,)] + self.types = [np.float64, np.float64] + self.equation = "...,..." + + class TestEinsumWithReduction1(TestEinsumBinary): def set_mandatory(self): self.shapes = [(10, 3, 3, 5), (10, 5, 10, 10)] @@ -142,34 +165,41 @@ def set_mandatory(self): class TestEinsumWithBroadcast1(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(5, 10, 3, 3)] + self.types = [np.float64] + self.equation = "ixyz->xyz" + + +class TestEinsumWithBroadcast1API(TestEinsumAPI): def set_mandatory(self): self.shapes = [(5, 10, 3, 3)] self.types = [np.float64] self.equation = "i...->..." -class TestEinsumWithBroadcast2(TestEinsumBinary): +class TestEinsumWithBroadcast2(TestEinsumAPI): def set_mandatory(self): self.shapes = [(10, 11), (3, 4, 5, 10)] self.types = [np.float64, np.float64] self.equation = "...ij,...i->j..." 
-class TestEinsumWithBroadcast3(TestEinsumBinary): +class TestEinsumWithBroadcast3(TestEinsumAPI): def set_mandatory(self): self.shapes = [(10, 3, 2, 3, 4), (12, 10)] self.types = [np.float64, np.float64] self.equation = "k...,...jk->...k" -class TestEinsumWithBroadcast4(TestEinsumBinary): +class TestEinsumWithBroadcast4(TestEinsumAPI): def set_mandatory(self): self.shapes = [(10, 3, 2, 3, 4), (12, 10)] self.types = [np.float64, np.float64] self.equation = "a...d,...cb->...abcd" -class TestEinsumWithBroadcast5(TestEinsumBinary): +class TestEinsumWithBroadcast5(TestEinsumAPI): def set_mandatory(self): self.shapes = [(3, 2, 2, 10), (10, 3, 2, 2)] self.types = [np.float64, np.float64] @@ -183,6 +213,13 @@ def set_mandatory(self): self.equation = "i,i->" +class TestEinsumWithBroadcast7(TestEinsumAPI): + def set_mandatory(self): + self.shapes = [(32, 13, 13, 12, 12), (1, 12)] + self.types = [np.float64, np.float64] + self.equation = "...ii,...i->...i" + + class TestEinsumWithDiagonal(TestEinsumBinary): def set_mandatory(self): self.shapes = [(10, 10)] @@ -198,6 +235,13 @@ def set_mandatory(self): class TestEinsumWithDiagonal3(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(5, 3, 2, 1, 4, 5)] + self.types = [np.float64] + self.equation = "axyzwa->xyzw" + + +class TestEinsumWithDiagonal3API(TestEinsumAPI): def set_mandatory(self): self.shapes = [(5, 3, 2, 1, 4, 5)] self.types = [np.float64] @@ -205,6 +249,13 @@ def set_mandatory(self): class TestEinsumWithDiagonal4(TestEinsumBinary): + def set_mandatory(self): + self.shapes = [(5, 3, 2, 1, 4, 5)] + self.types = [np.float64] + self.equation = "axyzwa->axyzw" + + +class TestEinsumWithDiagonal4API(TestEinsumAPI): def set_mandatory(self): self.shapes = [(5, 3, 2, 1, 4, 5)] self.types = [np.float64] diff --git a/test/xpu/test_einsum_op_xpu.py b/test/xpu/test_einsum_op_xpu.py index 57a82009834fa..540d75466b7a4 100644 --- a/test/xpu/test_einsum_op_xpu.py +++ b/test/xpu/test_einsum_op_xpu.py @@ -116,26 +116,6 @@ def set_mandatory(self): self.shapes = [(5, 10, 3, 3), (3, 6, 3, 10)] self.equation = "imjl,jklm->imk" - class TestEinsumWithBroadcast1(TestEinsumBinary): - def set_mandatory(self): - self.shapes = [(5, 10, 3, 3)] - self.equation = "i...->..." - - class TestEinsumWithBroadcast2(TestEinsumBinary): - def set_mandatory(self): - self.shapes = [(10, 11), (3, 4, 5, 10)] - self.equation = "...ij,...i->j..." - - class TestEinsumWithBroadcast4(TestEinsumBinary): - def set_mandatory(self): - self.shapes = [(10, 3, 2, 3, 4), (12, 10)] - self.equation = "a...d,...cb->...abcd" - - class TestEinsumWithBroadcast5(TestEinsumBinary): - def set_mandatory(self): - self.shapes = [(3, 2, 2, 10), (10, 3, 2, 2)] - self.equation = "...a,a...->..." - class TestEinsumWithBroadcast6(TestEinsumBinary): def set_mandatory(self): self.shapes = [(100), (100)] @@ -151,16 +131,6 @@ def set_mandatory(self): self.shapes = [(10, 3, 10)] self.equation = "iji->j" - class TestEinsumWithDiagonal3(TestEinsumBinary): - def set_mandatory(self): - self.shapes = [(5, 3, 2, 1, 4, 5)] - self.equation = "a...a->..." - - class TestEinsumWithDiagonal4(TestEinsumBinary): - def set_mandatory(self): - self.shapes = [(5, 3, 2, 1, 4, 5)] - self.equation = "a...a->a..." 
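For reference, a small sketch of what the `replace_ellipsis` preprocessing in
python/paddle/tensor/einsum.py effectively does (the shapes and the substituted
labels `a`/`b` below are assumptions for illustration; the implementation picks
arbitrary unused letters):

    import paddle

    x = paddle.randn([3, 4, 5, 10])
    y = paddle.randn([3, 4, 5])

    # '...' spans two dimensions here, so "...ij,...i->j..." is rewritten
    # to something like "abij,abi->jab" before the EinsumOp runs.
    out = paddle.einsum("...ij,...i->j...", x, y)
    ref = paddle.einsum("abij,abi->jab", x, y)

    assert out.shape == [10, 3, 4]
    assert paddle.allclose(out, ref)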
- class TestEinsumWithDiagonal5(TestEinsumBinary): def set_mandatory(self): self.shapes = [(8, 8, 8)] @@ -182,5 +152,72 @@ def set_mandatory(self): create_test_class(globals(), XPUTestEinsumOp, stype) +class TestEinsumAPI(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.set_mandatory() + + def test_api(self): + inputs = [] + for shape, ty in zip(self.shapes, self.types): + x = paddle.randn(shape).astype(ty) + x.stop_gradient = False + inputs.append(x) + output = paddle.einsum(self.equation, *inputs) + expect = np.einsum(self.equation, *[x.numpy() for x in inputs]) + np.testing.assert_allclose( + output.numpy(), expect, atol=0.0006, rtol=0.0001 + ) + output = output.mean() + output.backward() + + def set_mandatory(self): + self.shapes = [(10,), (10,)] + self.types = [np.float32, np.float32] + self.equation = "...,..." + + +class TestEinsumWithBroadcast1(TestEinsumAPI): + def set_mandatory(self): + self.shapes = [(5, 10, 3, 3)] + self.types = [np.float32] + self.equation = "i...->..." + + +class TestEinsumWithBroadcast2(TestEinsumAPI): + def set_mandatory(self): + self.shapes = [(10, 11), (3, 4, 5, 10)] + self.types = [np.float32, np.float32] + self.equation = "...ij,...i->j..." + + +class TestEinsumWithBroadcast4(TestEinsumAPI): + def set_mandatory(self): + self.shapes = [(10, 3, 2, 3, 4), (12, 10)] + self.types = [np.float32, np.float32] + self.equation = "a...d,...cb->...abcd" + + +class TestEinsumWithBroadcast5(TestEinsumAPI): + def set_mandatory(self): + self.shapes = [(3, 2, 2, 10), (10, 3, 2, 2)] + self.types = [np.float32, np.float32] + self.equation = "...a,a...->..." + + +class TestEinsumWithDiagonal3(TestEinsumAPI): + def set_mandatory(self): + self.shapes = [(5, 3, 2, 1, 4, 5)] + self.types = [np.float32] + self.equation = "a...a->..." + + +class TestEinsumWithDiagonal4(TestEinsumAPI): + def set_mandatory(self): + self.shapes = [(5, 3, 2, 1, 4, 5)] + self.types = [np.float32] + self.equation = "a...a->a..." + + if __name__ == "__main__": unittest.main() From d31684503f5c7e29400ec82b3b0d9fcb780725ca Mon Sep 17 00:00:00 2001 From: PommesPeter <54879512+PommesPeter@users.noreply.github.com> Date: Fri, 23 Feb 2024 09:15:30 +0800 Subject: [PATCH 07/82] [Cleanup][B-2] clean some paddle.base.dygraph.to_variable for test (#61904) --------- Co-authored-by: SigureMo --- .../seq2seq_dygraph_model.py | 107 ++++++++---------- .../dygraph_to_static/simnet_dygraph_model.py | 21 ++-- .../test_basic_api_transformation.py | 27 +---- test/dygraph_to_static/test_declarative.py | 15 +-- test/legacy_test/test_activation_op.py | 14 +-- test/legacy_test/test_adam_op.py | 4 +- test/legacy_test/test_adaptive_avg_pool1d.py | 2 +- test/legacy_test/test_adaptive_max_pool1d.py | 2 +- test/legacy_test/test_addmm_op.py | 6 +- test/legacy_test/test_affine_grid_function.py | 6 +- test/legacy_test/test_array_read_write_op.py | 6 +- 11 files changed, 88 insertions(+), 122 deletions(-) diff --git a/test/dygraph_to_static/seq2seq_dygraph_model.py b/test/dygraph_to_static/seq2seq_dygraph_model.py index 2359a7df50239..9be5ab3f5fe08 100644 --- a/test/dygraph_to_static/seq2seq_dygraph_model.py +++ b/test/dygraph_to_static/seq2seq_dygraph_model.py @@ -12,13 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
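The seq2seq hunks below all apply one mechanical migration: dygraph tensors that used to be materialized through the deprecated base.dygraph.to_variable wrapping a freshly built numpy array are now created directly with the public 2.x factory APIs. A hedged sketch of the before/after pattern (names and shapes illustrative):

    import numpy as np
    import paddle

    batch_size, hidden_size = 4, 8

    # before: host-side numpy buffer, then copy into a dygraph variable
    #   state = to_variable(np.zeros((batch_size, hidden_size), dtype="float32"))
    # after: build the tensor directly with public APIs
    state = paddle.zeros([batch_size, hidden_size], dtype="float32")
    finished = paddle.full([batch_size], 0, dtype="float32")
    positions = paddle.arange(0, batch_size, 1, dtype="int64")
    log_probs = paddle.to_tensor([[0.0, -1e9, -1e9]], dtype="float32")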
-import numpy as np from seq2seq_utils import Seq2SeqModelHyperParams as args import paddle -from paddle import base -from paddle.base import ParamAttr -from paddle.base.dygraph.base import to_variable from paddle.nn import Embedding, Layer INF = 1.0 * 1e5 @@ -112,14 +108,16 @@ def __init__( self.mode = mode self.kinf = 1e9 - param_attr = ParamAttr(initializer=uniform_initializer(self.init_scale)) - bias_attr = ParamAttr(initializer=zero_constant) + param_attr = paddle.ParamAttr( + initializer=uniform_initializer(self.init_scale) + ) + bias_attr = paddle.ParamAttr(initializer=zero_constant) forget_bias = 1.0 self.src_embeder = Embedding( self.src_vocab_size, self.hidden_size, - weight_attr=base.ParamAttr( + weight_attr=paddle.ParamAttr( initializer=uniform_initializer(init_scale) ), ) @@ -128,7 +126,7 @@ def __init__( self.tar_vocab_size, self.hidden_size, sparse=False, - weight_attr=base.ParamAttr( + weight_attr=paddle.ParamAttr( initializer=uniform_initializer(init_scale) ), ) @@ -137,7 +135,7 @@ def __init__( for i in range(num_layers): self.enc_units.append( self.add_sublayer( - "enc_units_%d" % i, + f"enc_units_{i}", BasicLSTMUnit( hidden_size=self.hidden_size, input_size=self.hidden_size, @@ -152,7 +150,7 @@ def __init__( for i in range(num_layers): self.dec_units.append( self.add_sublayer( - "dec_units_%d" % i, + f"dec_units_{i}", BasicLSTMUnit( hidden_size=self.hidden_size, input_size=self.hidden_size, @@ -211,11 +209,11 @@ def forward(self, inputs): # NOTE: modify model code about `enc_hidden` and `enc_cell` to transforme dygraph code successfully. # Because nested list can't be transformed now. - enc_hidden_0 = to_variable( - np.zeros((self.batch_size, self.hidden_size), dtype='float32') + enc_hidden_0 = paddle.zeros( + shape=[self.batch_size, self.hidden_size], dtype='float32' ) - enc_cell_0 = to_variable( - np.zeros((self.batch_size, self.hidden_size), dtype='float32') + enc_cell_0 = paddle.zeros( + shape=[self.batch_size, self.hidden_size], dtype='float32' ) zero = paddle.zeros(shape=[1], dtype="int64") enc_hidden = paddle.tensor.create_array(dtype="float32") @@ -292,8 +290,8 @@ def forward(self, inputs): dec_output = paddle.stack(dec_output) dec_output = self.fc(self._transpose_batch_time(dec_output)) - loss = paddle.nn.functional.softmax_with_cross_entropy( - logits=dec_output, label=label, soft_label=False + loss = paddle.nn.functional.cross_entropy( + input=dec_output, label=label, soft_label=False, reduction="none" ) loss = paddle.squeeze(loss, axis=[2]) max_tar_seq_len = paddle.shape(tar)[1] @@ -312,11 +310,11 @@ def beam_search(self, inputs): self.batch_size = src.shape[0] src_emb = self.src_embeder(self._transpose_batch_time(src)) - enc_hidden_0 = to_variable( - np.zeros((self.batch_size, self.hidden_size), dtype='float32') + enc_hidden_0 = paddle.zeros( + shape=[self.batch_size, self.hidden_size], dtype='float32' ) - enc_cell_0 = to_variable( - np.zeros((self.batch_size, self.hidden_size), dtype='float32') + enc_cell_0 = paddle.zeros( + shape=[self.batch_size, self.hidden_size], dtype='float32' ) zero = paddle.zeros(shape=[1], dtype="int64") enc_hidden = paddle.tensor.create_array(dtype="float32") @@ -367,23 +365,17 @@ def beam_search(self, inputs): # beam search batch_beam_shape = (self.batch_size, self.beam_size) - vocab_size_tensor = to_variable( - np.full((1), self.tar_vocab_size) - ).astype("int64") - start_token_tensor = to_variable( - np.full(batch_beam_shape, self.beam_start_token, dtype='int64') + vocab_size_tensor = paddle.full([1], self.tar_vocab_size, 
dtype="int64") + start_token_tensor = paddle.full( + batch_beam_shape, self.beam_start_token, dtype="int64" ) - end_token_tensor = to_variable( - np.full(batch_beam_shape, self.beam_end_token, dtype='int64') + end_token_tensor = paddle.full( + batch_beam_shape, self.beam_end_token, dtype="int64" ) step_input = self.tar_embeder(start_token_tensor) - beam_finished = to_variable( - np.full(batch_beam_shape, 0, dtype='float32') - ) - beam_state_log_probs = to_variable( - np.array( - [[0.0] + [-self.kinf] * (self.beam_size - 1)], dtype="float32" - ) + beam_finished = paddle.full(batch_beam_shape, 0, dtype="float32") + beam_state_log_probs = paddle.to_tensor( + [[0.0] + [-self.kinf] * (self.beam_size - 1)], dtype="float32" ) beam_state_log_probs = paddle.expand( beam_state_log_probs, @@ -395,8 +387,7 @@ def beam_search(self, inputs): batch_pos = paddle.expand( paddle.unsqueeze( - to_variable(np.arange(0, self.batch_size, 1, dtype="int64")), - [1], + paddle.arange(0, self.batch_size, 1, dtype="int64"), [1] ), [-1, self.beam_size], ) @@ -437,9 +428,7 @@ def beam_search(self, inputs): ) noend_array = [-self.kinf] * self.tar_vocab_size noend_array[self.beam_end_token] = 0 - noend_mask_tensor = to_variable( - np.array(noend_array, dtype='float32') - ) + noend_mask_tensor = paddle.to_tensor(noend_array, dtype="float32") step_log_probs = paddle.multiply( paddle.expand( @@ -537,14 +526,16 @@ def __init__( self.mode = mode self.kinf = 1e9 - param_attr = ParamAttr(initializer=uniform_initializer(self.init_scale)) - bias_attr = ParamAttr(initializer=zero_constant) + param_attr = paddle.ParamAttr( + initializer=uniform_initializer(self.init_scale) + ) + bias_attr = paddle.ParamAttr(initializer=zero_constant) forget_bias = 1.0 self.src_embeder = Embedding( self.src_vocab_size, self.hidden_size, - weight_attr=base.ParamAttr( + weight_attr=paddle.ParamAttr( name='source_embedding', initializer=uniform_initializer(init_scale), ), @@ -554,7 +545,7 @@ def __init__( self.tar_vocab_size, self.hidden_size, sparse=False, - weight_attr=base.ParamAttr( + weight_attr=paddle.ParamAttr( name='target_embedding', initializer=uniform_initializer(init_scale), ), @@ -564,7 +555,7 @@ def __init__( for i in range(num_layers): self.enc_units.append( self.add_sublayer( - "enc_units_%d" % i, + f"enc_units_{i}", BasicLSTMUnit( hidden_size=self.hidden_size, input_size=self.hidden_size, @@ -580,12 +571,12 @@ def __init__( if i == 0: self.dec_units.append( self.add_sublayer( - "dec_units_%d" % i, + f"dec_units_{i}", BasicLSTMUnit( hidden_size=self.hidden_size, input_size=self.hidden_size * 2, - param_attr=ParamAttr( - name="dec_units_%d" % i, + param_attr=paddle.ParamAttr( + name=f"dec_units_{i}", initializer=uniform_initializer( self.init_scale ), @@ -598,12 +589,12 @@ def __init__( else: self.dec_units.append( self.add_sublayer( - "dec_units_%d" % i, + f"dec_units_{i}", BasicLSTMUnit( hidden_size=self.hidden_size, input_size=self.hidden_size, - param_attr=ParamAttr( - name="dec_units_%d" % i, + param_attr=paddle.ParamAttr( + name=f"dec_units_{i}", initializer=uniform_initializer( self.init_scale ), @@ -726,12 +717,12 @@ def forward(self, inputs): # NOTE: modify model code about `enc_hidden` and `enc_cell` to transform dygraph code successfully. # Because nested list can't be transformed now. 
- enc_hidden_0 = to_variable( - np.zeros((self.batch_size, self.hidden_size), dtype='float32') + enc_hidden_0 = paddle.zeros( + shape=[self.batch_size, self.hidden_size], dtype='float32' ) enc_hidden_0.stop_gradient = True - enc_cell_0 = to_variable( - np.zeros((self.batch_size, self.hidden_size), dtype='float32') + enc_cell_0 = paddle.zeros( + shape=[self.batch_size, self.hidden_size], dtype='float32' ) enc_hidden_0.stop_gradient = True zero = paddle.zeros(shape=[1], dtype="int64") @@ -789,8 +780,8 @@ def forward(self, inputs): enc_outputs = self._transpose_batch_time(enc_outputs) # train - input_feed = to_variable( - np.zeros((self.batch_size, self.hidden_size), dtype='float32') + input_feed = paddle.zeros( + shape=[self.batch_size, self.hidden_size], dtype='float32' ) # NOTE: set stop_gradient here, otherwise grad var is null input_feed.stop_gradient = True @@ -828,8 +819,8 @@ def forward(self, inputs): dec_output = paddle.stack(dec_output) dec_output = self.fc(self._transpose_batch_time(dec_output)) - loss = paddle.nn.functional.softmax_with_cross_entropy( - logits=dec_output, label=label, soft_label=False + loss = paddle.nn.functional.cross_entropy( + input=dec_output, label=label, soft_label=False, reduction="none" ) loss = paddle.squeeze(loss, axis=[2]) max_tar_seq_len = paddle.shape(tar)[1] diff --git a/test/dygraph_to_static/simnet_dygraph_model.py b/test/dygraph_to_static/simnet_dygraph_model.py index 86d3071f616e5..519f689c77795 100644 --- a/test/dygraph_to_static/simnet_dygraph_model.py +++ b/test/dygraph_to_static/simnet_dygraph_model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -298,15 +298,16 @@ class FC(paddle.nn.Layer): Examples: .. 
code-block:: python - from paddle.base.dygraph.base import to_variable - import paddle.base as base - from paddle.base.dygraph import FC - import numpy as np - data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32') - with base.dygraph.guard(): - fc = FC("fc", 64, num_flatten_dims=2) - data = to_variable(data) - conv = fc(data) + + import paddle + import paddle.base as base + from paddle.base.dygraph import FC + import numpy as np + data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32') + with base.dygraph.guard(): + fc = FC("fc", 64, num_flatten_dims=2) + data = paddle.to_tensor(data) + conv = fc(data) """ def __init__( diff --git a/test/dygraph_to_static/test_basic_api_transformation.py b/test/dygraph_to_static/test_basic_api_transformation.py index 2a2134d318267..e4dfb37af5faa 100644 --- a/test/dygraph_to_static/test_basic_api_transformation.py +++ b/test/dygraph_to_static/test_basic_api_transformation.py @@ -21,30 +21,13 @@ ) import paddle -from paddle import base, to_tensor -from paddle.base import dygraph -from paddle.base.dygraph import to_variable +from paddle import to_tensor from paddle.jit.api import to_static SEED = 2020 np.random.seed(SEED) -def dyfunc_to_variable(x): - res = base.dygraph.to_variable(x, name=None, zero_copy=None) - return res - - -def dyfunc_to_variable_2(x): - res = dygraph.to_variable(value=np.zeros(shape=(1), dtype=np.int32)) - return res - - -def dyfunc_to_variable_3(x): - res = to_variable(x, name=None, zero_copy=None) - return res - - def dyfunc_to_tensor(x): res1 = paddle.to_tensor(x, dtype=None, place=None, stop_gradient=True) res2 = paddle.tensor.to_tensor(data=res1) @@ -73,15 +56,7 @@ def setUp(self): dyfunc_bool_to_tensor, dyfunc_int_to_tensor, dyfunc_float_to_tensor, - dyfunc_to_variable, - dyfunc_to_variable_2, - dyfunc_to_variable_3, ] - self.place = ( - paddle.CUDAPlace(0) - if paddle.is_compiled_with_cuda() - else paddle.CPUPlace() - ) def get_dygraph_output(self): res = self.dygraph_func(self.input).numpy() diff --git a/test/dygraph_to_static/test_declarative.py b/test/dygraph_to_static/test_declarative.py index df3a136222f62..1ee370b1745bf 100644 --- a/test/dygraph_to_static/test_declarative.py +++ b/test/dygraph_to_static/test_declarative.py @@ -19,11 +19,9 @@ import numpy as np from dygraph_to_static_utils import ( Dy2StTestBase, - enable_to_static_guard, test_ast_only, test_legacy_and_pt_and_pir, ) -from test_basic_api_transformation import dyfunc_to_variable import paddle from paddle.framework import use_pir_api @@ -35,6 +33,11 @@ from paddle.static import InputSpec +def call_to_tensor(x): + res = paddle.to_tensor(x) + return res + + def create_simple_net(): class SimpleNet(Layer): def __init__(self): @@ -370,7 +373,7 @@ class TestDeclarativeAPI(Dy2StTestBase): @test_ast_only @test_legacy_and_pt_and_pir def test_error(self): - func = paddle.jit.to_static(dyfunc_to_variable) + func = paddle.jit.to_static(call_to_tensor) paddle.enable_static() @@ -379,12 +382,6 @@ def test_error(self): with self.assertRaises(RuntimeError): func(np.ones(5).astype("int32")) - with enable_to_static_guard(False): - with self.assertRaises(AssertionError): - # AssertionError: We Only support to_variable in imperative mode, - # please use base.dygraph.guard() as context to run it in imperative Mode - func(np.ones(5).astype("int32")) - paddle.disable_static() diff --git a/test/legacy_test/test_activation_op.py b/test/legacy_test/test_activation_op.py index 485c770dd96df..deecf7fd09a9e 100644 --- a/test/legacy_test/test_activation_op.py +++ 
b/test/legacy_test/test_activation_op.py @@ -370,7 +370,7 @@ def test_out_name(self): def test_dygraph(self): with base.dygraph.guard(): np_x = np.array([0.1]) - x = base.dygraph.to_variable(np_x) + x = paddle.to_tensor(np_x) z = eval("paddle.%s(x).numpy()" % self.op_type) z_expected = eval("np.%s(np_x)" % self.op_type) np.testing.assert_allclose(z, z_expected, rtol=1e-05) @@ -963,7 +963,7 @@ def test_out_name(self): def test_dygraph(self): with base.dygraph.guard(): np_x = np.array([0.1]) - x = base.dygraph.to_variable(np_x) + x = paddle.to_tensor(np_x) z = paddle.atan(x).numpy() z_expected = np.arctan(np_x) self.assertEqual(z, z_expected) @@ -1036,7 +1036,7 @@ class TestSinhAPI(unittest.TestCase): def test_dygraph(self): with base.dygraph.guard(): np_x = np.array([0.1]) - x = base.dygraph.to_variable(np_x) + x = paddle.to_tensor(np_x) z = paddle.sinh(x).numpy() z_expected = np.sinh(np_x) np.testing.assert_allclose(z, z_expected, rtol=1e-05) @@ -1075,7 +1075,7 @@ def test_backward(self): input_x = np.random.uniform(0.1, 1, test_data_shape).astype( "float32" ) - var = base.dygraph.to_variable(input_x) + var = paddle.to_tensor(input_x) var.stop_gradient = False loss = paddle.sinh(var) loss.backward() @@ -1168,7 +1168,7 @@ class TestCoshAPI(unittest.TestCase): def test_dygraph(self): with base.dygraph.guard(): np_x = np.array([0.1]) - x = base.dygraph.to_variable(np_x) + x = paddle.to_tensor(np_x) z = paddle.cosh(x).numpy() z_expected = np.cosh(np_x) np.testing.assert_allclose(z, z_expected, rtol=1e-05) @@ -1206,7 +1206,7 @@ def test_backward(self): input_x = np.random.uniform(0.1, 1, test_data_shape).astype( "float32" ) - var = base.dygraph.to_variable(input_x) + var = paddle.to_tensor(input_x) var.stop_gradient = False loss = paddle.cosh(var) loss.backward() @@ -4067,7 +4067,7 @@ def test_api(self): # dygraph with base.dygraph.guard(): np_x = np.random.uniform(0.1, 1, [11, 17]).astype("float64") - data_x = base.dygraph.to_variable(np_x) + data_x = paddle.to_tensor(np_x) z = paddle.log1p(data_x) np_z = z.numpy() z_expected = np.array(np.log1p(np_x)) diff --git a/test/legacy_test/test_adam_op.py b/test/legacy_test/test_adam_op.py index 9a4e3c15553b5..c06e249a874e0 100644 --- a/test/legacy_test/test_adam_op.py +++ b/test/legacy_test/test_adam_op.py @@ -725,7 +725,7 @@ def test_pir_adam_op(self): def test_adam_op_dygraph(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") - a = base.dygraph.to_variable(value) + a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) adam = paddle.optimizer.Adam( @@ -773,7 +773,7 @@ def test_adam_op_with_state_dict(self): def test_adam_with_grad_clip(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") - a = base.dygraph.to_variable(value) + a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) adam = paddle.optimizer.Adam( diff --git a/test/legacy_test/test_adaptive_avg_pool1d.py b/test/legacy_test/test_adaptive_avg_pool1d.py index bca37ba88794f..64075167363aa 100644 --- a/test/legacy_test/test_adaptive_avg_pool1d.py +++ b/test/legacy_test/test_adaptive_avg_pool1d.py @@ -87,7 +87,7 @@ def setUp(self): def check_adaptive_avg_dygraph_results(self, place): with base.dygraph.guard(place): input_np = np.random.random([2, 3, 32]).astype("float32") - input = base.dygraph.to_variable(input_np) + input = paddle.to_tensor(input_np) result = F.adaptive_avg_pool1d(input, output_size=16) result_np = avg_pool1D_forward_naive( input_np, 
ksize=[16], strides=[0], paddings=[0], adaptive=True diff --git a/test/legacy_test/test_adaptive_max_pool1d.py b/test/legacy_test/test_adaptive_max_pool1d.py index eb12c8d597ba9..33aaa8565bb4f 100644 --- a/test/legacy_test/test_adaptive_max_pool1d.py +++ b/test/legacy_test/test_adaptive_max_pool1d.py @@ -78,7 +78,7 @@ def setUp(self): def check_adaptive_max_dygraph_results(self, place): with base.dygraph.guard(place): input_np = np.random.random([2, 3, 32]).astype("float32") - input = base.dygraph.to_variable(input_np) + input = paddle.to_tensor(input_np) result = F.adaptive_max_pool1d(input, output_size=16) result_np = max_pool1D_forward_naive( diff --git a/test/legacy_test/test_addmm_op.py b/test/legacy_test/test_addmm_op.py index 1e339ad1ceb68..b41532cfe2fac 100644 --- a/test/legacy_test/test_addmm_op.py +++ b/test/legacy_test/test_addmm_op.py @@ -324,9 +324,9 @@ def test_api_with_dygraph(self): np_y = np.random.random((6, 30)).astype(np.float32) with base.dygraph.guard(): - input = base.dygraph.to_variable(np_input) - x = base.dygraph.to_variable(np_x) - y = base.dygraph.to_variable(np_y) + input = paddle.to_tensor(np_input) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) out = paddle.tensor.addmm(input, x, y) np.testing.assert_allclose( np_input + np.dot(np_x, np_y), out.numpy(), rtol=1e-5, atol=1e-8 diff --git a/test/legacy_test/test_affine_grid_function.py b/test/legacy_test/test_affine_grid_function.py index a3c01722b7449..e3a2455eed3e4 100644 --- a/test/legacy_test/test_affine_grid_function.py +++ b/test/legacy_test/test_affine_grid_function.py @@ -122,10 +122,12 @@ def test_static_api(self): def paddle_dygraph_layer(self): paddle.disable_static() theta_var = ( - dg.to_variable(self.theta) if not self.invalid_theta else "invalid" + paddle.to_tensor(self.theta) + if not self.invalid_theta + else "invalid" ) output_shape = ( - dg.to_variable(self.output_shape) + paddle.to_tensor(self.output_shape) if self.variable_output_shape else self.output_shape ) diff --git a/test/legacy_test/test_array_read_write_op.py b/test/legacy_test/test_array_read_write_op.py index 05452a9690e2c..499691ef9277c 100644 --- a/test/legacy_test/test_array_read_write_op.py +++ b/test/legacy_test/test_array_read_write_op.py @@ -106,9 +106,9 @@ def test_read_write(self): self.assertAlmostEqual(1.0, g_out_sum, delta=0.1) with base.dygraph.guard(place): - tensor1 = base.dygraph.to_variable(tensor) - tensor2 = base.dygraph.to_variable(tensor) - tensor3 = base.dygraph.to_variable(tensor) + tensor1 = paddle.to_tensor(tensor) + tensor2 = paddle.to_tensor(tensor) + tensor3 = paddle.to_tensor(tensor) x_dygraph = [tensor1, tensor2, tensor3] for each_x in x_dygraph: each_x.stop_gradient = False From d470588e3aac11ed3b16d98405edf60ca505e270 Mon Sep 17 00:00:00 2001 From: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Date: Fri, 23 Feb 2024 09:36:22 +0800 Subject: [PATCH 08/82] [PIR][DynamicShape] Add more ops' InferSymbolicShape (#61930) * Add more ops' InferSymbolicShape * bug fix * bug fix * add log op --- paddle/cinn/hlir/dialect/operator/ir/ops.yaml | 1 + .../infer_symbolic_shape/cinn_op_infer_sym.cc | 19 +++ .../infer_symbolic_shape/cinn_op_infer_sym.h | 2 + .../infer_sym_element_wise_binary.cc | 64 ++++++++-- .../infer_sym_element_wise_binary.h | 39 +++++- .../paddle_op_infer_sym.cc | 111 +----------------- .../paddle_op_infer_sym.h | 54 --------- .../same_operands_and_result.cc | 40 +++++++ .../same_operands_and_result.h | 24 ++++ 9 files changed, 179 insertions(+), 175 deletions(-) diff --git 
a/paddle/cinn/hlir/dialect/operator/ir/ops.yaml b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml
index 9ab3e9381cc44..4faaf8ea2209f 100644
--- a/paddle/cinn/hlir/dialect/operator/ir/ops.yaml
+++ b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml
@@ -7,6 +7,7 @@
   kernel :
     func : expand
     param : [x, broadcast_axes]
+  interfaces : paddle::dialect::InferSymbolicShapeInterface

 - op : isclose
   args : (Tensor x, Tensor y, float rtol=1e-5, float atol=1e-8, bool equal_nan=false)
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc
index 443981f9ef080..ecb56292e170a 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc
@@ -17,6 +17,25 @@

 namespace cinn::dialect {

+bool BroadcastOpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
+  const std::vector<int64_t> &shape =
+      paddle::dialect::details::GetVectorAttr(op, "out_shape");
+
+  const std::vector<symbol::DimExpr> &out_dims = [&] {
+    std::vector<symbol::DimExpr> out_dims;
+    for (int64_t dim : shape) {
+      out_dims.emplace_back(dim);
+    }
+    return out_dims;
+  }();
+
+  symbol::ShapeOrDataDimExprs shape_data{
+      symbol::TensorShapeOrDataDimExprs(out_dims)};
+  shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data);
+  return true;
+}
+
 bool SliceOpInferSymbolicShape(pir::Operation *op,
                                pir::ShapeConstraintIRAnalysis *shape_analysis) {
   // TODO(zhangbopd): Not implemented yet, different from the one in paddle
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h
index fad7d4893d037..896dd44d0b12b 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h
@@ -18,6 +18,8 @@ namespace cinn::dialect {

 // using paddle::dialect::ScaleOpInferSymbolicShape;

+bool BroadcastOpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis);
 bool SliceOpInferSymbolicShape(pir::Operation *op,
                                pir::ShapeConstraintIRAnalysis *shape_analysis);
 bool ConcatOpInferSymbolicShape(pir::Operation *op,
diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc
index 845647b4a79d0..21da5351c617d 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.cc
@@ -90,35 +90,85 @@ bool Add_OpInferSymbolicShape(pir::Operation *op,
   return InferSymbolicShapeElementWiseBinary(op, shape_analysis);
 }

-bool MultiplyOpInferSymbolicShape(
+bool BitwiseAndOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
   return InferSymbolicShapeElementWiseBinary(op, shape_analysis);
 }
-bool MultiplySrOpInferSymbolicShape(
+
+bool BitwiseAnd_OpInferSymbolicShape(
+    pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
+  return BitwiseAndOpInferSymbolicShape(op, shape_analysis);
+}
+
+bool DivideOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
   return
InferSymbolicShapeElementWiseBinary(op, shape_analysis); } -bool Multiply_OpInferSymbolicShape( +bool Divide_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } -bool MultiplySr_OpInferSymbolicShape( + +bool ElementwisePowOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } -bool DivideOpInferSymbolicShape( +bool GreaterThanOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } -bool Divide_OpInferSymbolicShape( + +bool GreaterThan_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return GreaterThanOpInferSymbolicShape(op, shape_analysis); +} + +bool LessThanOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } -bool ElementwisePowOpInferSymbolicShape( +bool LessThan_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return LessThanOpInferSymbolicShape(op, shape_analysis); +} + +bool LogicalAndOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return InferSymbolicShapeElementWiseBinary(op, shape_analysis); } +bool LogicalAnd_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return LogicalAndOpInferSymbolicShape(op, shape_analysis); +} + +bool MultiplyOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool MultiplySrOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool Multiply_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} +bool MultiplySr_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} + +bool NotEqualOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return InferSymbolicShapeElementWiseBinary(op, shape_analysis); +} + +bool NotEqual_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return NotEqualOpInferSymbolicShape(op, shape_analysis); +} + } // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h index 21aff3276bf1f..e15d769fc8b02 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_element_wise_binary.h @@ -23,6 +23,38 @@ bool AddOpInferSymbolicShape(pir::Operation *op, bool Add_OpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); +bool BitwiseAndOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool BitwiseAnd_OpInferSymbolicShape( + pir::Operation *op, 
pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool DivideOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); +bool Divide_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool ElementwisePowOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool GreaterThanOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool GreaterThan_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool LessThanOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool LessThan_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool LogicalAndOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool LogicalAnd_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + bool MultiplyOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); @@ -35,11 +67,10 @@ bool Multiply_OpInferSymbolicShape( bool MultiplySr_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool DivideOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis); -bool Divide_OpInferSymbolicShape( +bool NotEqualOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool ElementwisePowOpInferSymbolicShape( +bool NotEqual_OpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + } // namespace paddle::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index f2577ce80fe67..092ecc89cb13f 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -1008,109 +1008,12 @@ bool Where_OpInferSymbolicShape( return WhereOpInferSymbolicShape(op, shape_analysis); } -bool AssignOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - -bool Assign_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return AssignOpInferSymbolicShape(op, shape_analysis); -} - -bool BitwiseAndOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - -bool BitwiseAnd_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return BitwiseAndOpInferSymbolicShape(op, shape_analysis); -} - bool FeedOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - -bool GreaterThanOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape 
interface is NOT implemented now.")); + // This Op has NO InferMeta in yaml, just return true return true; } -bool GreaterThan_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return GreaterThanOpInferSymbolicShape(op, shape_analysis); -} - -bool IncrementOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - -bool Increment_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return IncrementOpInferSymbolicShape(op, shape_analysis); -} - -bool LessThanOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - -bool LessThan_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return LessThanOpInferSymbolicShape(op, shape_analysis); -} - -bool LogicalAndOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - -bool LogicalAnd_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return LogicalAndOpInferSymbolicShape(op, shape_analysis); -} - -bool LogicalNotOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - -bool LogicalNot_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return LogicalNotOpInferSymbolicShape(op, shape_analysis); -} - -bool NotEqualOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - -bool NotEqual_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { - return NotEqualOpInferSymbolicShape(op, shape_analysis); -} - bool TopPSamplingOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( @@ -1118,18 +1021,6 @@ bool TopPSamplingOpInferSymbolicShape( return true; } -bool LogOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis) { - PADDLE_THROW(phi::errors::Unimplemented( - op->name() + " 's InferSymbolicShape interface is NOT implemented now.")); - return true; -} - -bool Log_OpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis) { - return LogOpInferSymbolicShape(op, shape_analysis); -} - bool ExpandAsOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { PADDLE_THROW(phi::errors::Unimplemented( diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h index 6626b88226d5e..7c61075247ce0 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h +++ 
b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h @@ -100,66 +100,12 @@ bool WhereOpInferSymbolicShape(pir::Operation *op, bool Where_OpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool AssignOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool Assign_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool BitwiseAndOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool BitwiseAnd_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - bool FeedOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool GreaterThanOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool GreaterThan_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool IncrementOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool Increment_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool LessThanOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool LessThan_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool LogicalAndOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool LogicalAnd_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool LogicalNotOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool LogicalNot_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool NotEqualOpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool NotEqual_OpInferSymbolicShape( - pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); - bool TopPSamplingOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); -bool LogOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis); - -bool Log_OpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis); - bool ExpandAsOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc index c77c640222f97..571b90f7ff552 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.cc @@ -36,6 +36,16 @@ bool Abs_OpInferSymbolicShape(pir::Operation *op, return SameOperandsAndResultShape(op, shape_analysis); } +bool AssignOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return SameOperandsAndResultShape(op, shape_analysis); +} + +bool Assign_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return AssignOpInferSymbolicShape(op, shape_analysis); +} + bool CastOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { 
return SameOperandsAndResultShape(op, shape_analysis); @@ -56,6 +66,36 @@ bool Exp_OpInferSymbolicShape(pir::Operation *op, return SameOperandsAndResultShape(op, shape_analysis); } +bool IncrementOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return SameOperandsAndResultShape(op, shape_analysis); +} + +bool Increment_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return IncrementOpInferSymbolicShape(op, shape_analysis); +} + +bool LogOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis) { + return SameOperandsAndResultShape(op, shape_analysis); +} + +bool Log_OpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis) { + return LogOpInferSymbolicShape(op, shape_analysis); +} + +bool LogicalNotOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return SameOperandsAndResultShape(op, shape_analysis); +} + +bool LogicalNot_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { + return LogicalNotOpInferSymbolicShape(op, shape_analysis); +} + bool FullWithTensorOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { return SameOperandsAndResultShape(op, shape_analysis); diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h index 19fcfac4ca5dc..706bc500048b5 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h @@ -22,6 +22,12 @@ bool AbsOpInferSymbolicShape(pir::Operation *op, bool Abs_OpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); +bool AssignOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool Assign_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + bool CastOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool Cast_OpInferSymbolicShape(pir::Operation *op, @@ -35,6 +41,24 @@ bool Exp_OpInferSymbolicShape(pir::Operation *op, bool FullWithTensorOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); +bool IncrementOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool Increment_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool LogOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool Log_OpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool LogicalNotOpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + +bool LogicalNot_OpInferSymbolicShape( + pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + bool PowOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); bool Pow_OpInferSymbolicShape(pir::Operation *op, From 1185aad03e212afc6ef5b0b11e23521c9ae1719e Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Fri, 23 Feb 2024 09:47:57 +0800 Subject: [PATCH 09/82] [PIR] executor support keyword argument. 
(#61657)

* [PIR] executor support keyword argument.

* fix test_cinn_sub_graph unit test failure
---
 .../group_merge/op_with_group_merge_pass.cc   |  5 +-
 .../eager/to_static/run_program_op_node.h     |  5 ++
 .../pir_adaptor/pir_adaptor_util.cc           | 13 +++++
 .../pir/transforms/pd_op_to_kernel_pass.cc    | 33 +++++++++++++
 .../pir/transforms/sub_graph_detector.cc      |  9 ++--
 paddle/fluid/pybind/pir.cc                    | 49 ++++++++-----------
 paddle/pir/include/core/block_argument.h      | 10 ++--
 paddle/pir/src/core/block.cc                  |  2 +-
 paddle/pir/src/core/block_argument.cc         | 30 +++++++++++-
 .../jit/dy2static/pir_partial_program.py      |  2 +
 10 files changed, 118 insertions(+), 40 deletions(-)

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc
index 12a403740b977..1fdb03eee3e9d 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/op_with_group_merge_pass.cc
@@ -73,8 +73,9 @@ std::unordered_set<pir::Operation*> GetProducerOps(pir::Operation* op) {
     if (!operand || !(operand.source())) {
       continue;
     }
-    auto* source_op = operand.source().defining_op();
-    producers.insert(source_op);
+    if (auto* source_op = operand.source().defining_op()) {
+      producers.insert(source_op);
+    }
   }
   return producers;
 }
diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h
index 00f6b04781cbc..17cb367e72980 100644
--- a/paddle/fluid/eager/to_static/run_program_op_node.h
+++ b/paddle/fluid/eager/to_static/run_program_op_node.h
@@ -183,6 +183,11 @@ static auto GetNameFromValue(const ::pir::Block *block,
                              bool is_input) {
   // we use name here, later value is used directly.
  std::unordered_map<::pir::Value, std::string> value2name;
+  if (is_input) {
+    for (auto &kwarg : block->kwargs()) {
+      value2name[kwarg.second] = kwarg.first;
+    }
+  }
   for (auto &op : *block) {
     std::string name;
     if (is_input && op.name() == "pd_op.data") {
diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
index d9005802cd24a..aa9003cb164f9 100644
--- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
+++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc
@@ -743,6 +743,19 @@ void BuildScope(const pir::Block& block,
           << GenScopeTreeDebugInfo(
                  const_cast<Scope*>(value_exe_info->GetScope()->root()));

+  VLOG(6) << "Start handling keyword block arguments!";
+  for (auto& kwarg : block.kwargs()) {
+    VLOG(6) << "link keyword block argument in scope "
+            << value_exe_info->GetScope();
+    Variable* var = value_exe_info->GetScope()->FindVar(kwarg.first);
+    PADDLE_ENFORCE(var,
+                   paddle::platform::errors::InvalidArgument(
+                       "The variable %s should exist", kwarg.first));
+
+    value_exe_info->Add(kwarg.second, kwarg.first);
+  }
+  VLOG(6) << "Finished handling keyword block arguments!";
+
   for (auto& op : block) {
     std::string op_name = op.name();
     if (op.attributes().count("op_name")) {
diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
index d23819cd5b50c..9cd2c89eda866 100644
--- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc
@@ -2537,6 +2537,39 @@ void ProcessBlock(
     bool for_if_block) {
   auto inputs_by_data_op = GetInputsByDataOp(block);

+  for (auto& [keyword, arg] : block->kwargs()) {
+    auto new_arg = new_block->AddKwarg(keyword, arg.type());
+    (*map_value_pair)[arg] = new_arg;
+    if (auto dense_tensor_type = arg.type().dyn_cast<DenseTensorType>()) {
+      new_arg.set_type(AllocatedDenseTensorType::get(
+          ctx, phi::CPUPlace(), dense_tensor_type));
+    }
+  }
+  if (platform::is_gpu_place(place)) {
+    for (auto& [keyword, arg] : block->kwargs()) {
+      if (auto dense_tensor_type = arg.type().dyn_cast<DenseTensorType>()) {
+        auto dtype = dense_tensor_type.dtype();
+        phi::KernelKey shadow_key{
+            phi::Backend::GPU, phi::DataLayout::ANY, TransToPhiDataType(dtype)};
+        std::unordered_map<std::string, pir::Attribute> attr_map{
+            {"op_name", pir::StrAttribute::get(ctx, "pd_op.shadow_feed")},
+            {"kernel_name", pir::StrAttribute::get(ctx, "shadow_feed")},
+            {"kernel_key", KernelAttribute::get(ctx, shadow_key)}};
+
+        auto out_type =
+            AllocatedDenseTensorType::get(ctx, place, dense_tensor_type);
+
+        pir::OpInfo phi_kernel_op_info =
+            ctx->GetRegisteredOpInfo(PhiKernelOp::name());
+        pir::Operation* shadow_op = pir::Operation::Create(
+            {(*map_value_pair)[arg]}, attr_map, {out_type}, phi_kernel_op_info);
+
+        new_block->push_back(shadow_op);
+        (*map_value_pair)[arg] = shadow_op->result(0);
+      }
+    }
+  }
+
   for (auto iter = block->begin(); iter != block->end(); ++iter) {
     pir::Operation* op_item = &(*iter);
     VLOG(6) << "op name " << op_item->name();
diff --git a/paddle/fluid/pir/transforms/sub_graph_detector.cc b/paddle/fluid/pir/transforms/sub_graph_detector.cc
index 051bbb4c2d224..dcb55412feb1f 100644
--- a/paddle/fluid/pir/transforms/sub_graph_detector.cc
+++ b/paddle/fluid/pir/transforms/sub_graph_detector.cc
@@ -83,7 +83,7 @@ std::vector<pir::Operation*> InverselyTopologicalSort(pir::Block* block) {
       }
       auto* defined_op = operand.source().defining_op();
       --pending_count[defined_op];
-      if (pending_count[defined_op] == 0) {
+      if (defined_op && pending_count[defined_op]
== 0) {
        queue.push(defined_op);
      }
    }
@@ -109,7 +109,7 @@ std::vector<pir::Operation*> GetProducerOpsReverseSort(
       continue;
     }
     auto* source_op = operand.source().defining_op();
-    if (!producers.count(source_op)) {
+    if (source_op && !producers.count(source_op)) {
       producers.insert(source_op);
       PADDLE_ENFORCE(
           op2id.count(source_op),
@@ -134,8 +134,9 @@ std::unordered_set<pir::Operation*> GetProducerOps(pir::Operation* op) {
     if (!operand || !(operand.source())) {
       continue;
     }
-    auto* source_op = operand.source().defining_op();
-    producers.insert(source_op);
+    if (auto* source_op = operand.source().defining_op()) {
+      producers.insert(source_op);
+    }
   }
   return producers;
 }
diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc
index fe52599c88962..99e1b624edefa 100644
--- a/paddle/fluid/pybind/pir.cc
+++ b/paddle/fluid/pybind/pir.cc
@@ -198,7 +198,11 @@ std::string GetValueInfo(Value v) {
     ss << "define_op_name=" << op_result.owner()->name();
     ss << ", index=" << op_result.index();
   } else if (auto arg = v.dyn_cast<BlockArgument>()) {
-    ss << "block_arg, index = " << arg.index();
+    if (arg.is_kwarg()) {
+      ss << "keyword block_arg, keyword = " << arg.keyword();
+    } else {
+      ss << "position block_arg, index = " << arg.index();
+    }
   }
   if (!v.type()) {
     ss << ", dtype=<>";
@@ -408,6 +412,7 @@ void BindBlock(py::module *m) {
       })
      .def("__len__", [](Block &self) { return self.size(); })
      .def("args", &Block::args, return_value_policy::reference)
+     .def("kwargs", &Block::kwargs, return_value_policy::reference)
      .def(
          "remove_op",
          [](Block &self, Operation *op) {
@@ -1116,7 +1121,7 @@ SplitedResult SplitForwardBackward(
   std::unordered_set<pir::Value> backward_inputs;
   std::tie(middle_values, backward_inputs) = AnalysisMiddleVariable(
       program, forward_in_out_values, forward_range, backward_range);
-  pir::Builder backward_builder = pir::Builder(ctx, backward_program->block());
+  pir::Block &backward_block = *backward_program->block();
   bool has_backward = (backward_range[1] > backward_range[0]);

   // forward program construct.
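Replacing the pd_op.data construction with AddKwarg (next hunk) means backward inputs become keyword block arguments; on the Python side they are reachable through the Block.kwargs() binding added above, which is exactly how the dy2static partial program recovers their names (see the pir_partial_program.py hunk at the end of this patch). A hedged sketch of that consumption, assuming `program` is a pir.Program whose backward block was built by SplitForwardBackward:

    # map each keyword block argument (a pir.Value) back to its input name
    value_to_name = {}
    for keyword, arg in program.global_block().kwargs().items():
        value_to_name[arg] = keyword  # e.g. "input_0", "input_1", ...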
@@ -1137,28 +1142,14 @@
   pir::IrMapping backward_mapper;
   auto &backward_value_map = backward_mapper.GetMutableMap();
   int counter = 0;
-  auto create_data_fn = [&backward_builder,
-                         &backward_inputs,
-                         &backward_value_map,
-                         &counter](const pir::Value &v) {
-    if (v.impl() == nullptr || !backward_inputs.count(v)) {
-      return;
+  auto create_kwarg_fn = [&backward_block,
+                          &backward_inputs,
+                          &backward_value_map,
+                          &counter](const pir::Value &v) {
+    if (v && backward_inputs.count(v)) {
+      backward_value_map[v] = backward_block.AddKwarg(
+          "input_" + std::to_string(counter++), v.type());
     }
-    auto value_type = v.type().dyn_cast<pir::DenseTensorType>();
-    auto dtype = paddle::dialect::TransToPhiDataType(value_type.dtype());
-    auto shape = common::vectorize(value_type.dims());
-    auto place = phi::Place();
-
-    paddle::dialect::DataOp op =
-        backward_builder.Build<paddle::dialect::DataOp>(
-            std::string("input_") + std::to_string(counter),
-            shape,
-            dtype,
-            place);
-    counter += 1;
-    pir::Value target = op->results()[0].Value::impl();
-    target.set_type(v.type());
-    backward_value_map[v] = target;
   };

   auto create_output_fn_forward = [&ctx,
@@ -1227,21 +1218,23 @@
     VLOG(4) << "Create pd.data for backward program: fo, start with input_"
             << counter;
     std::for_each(
-        forward_outputs.begin(), forward_outputs.end(), create_data_fn);
+        forward_outputs.begin(), forward_outputs.end(), create_kwarg_fn);
     VLOG(4) << "Create pd.data for backward program: fx, start with input_"
             << counter;
-    std::for_each(forward_inputs.begin(), forward_inputs.end(), create_data_fn);
+    std::for_each(
+        forward_inputs.begin(), forward_inputs.end(), create_kwarg_fn);
     VLOG(4) << "Create pd.data for backward program: fp, start with input_"
             << counter;
-    std::for_each(forward_params.begin(), forward_params.end(), create_data_fn);
+    std::for_each(
+        forward_params.begin(), forward_params.end(), create_kwarg_fn);
     VLOG(4) << "Create pd.data for backward program: fm, start with input_"
             << counter;
-    std::for_each(middle_values.begin(), middle_values.end(), create_data_fn);
+    std::for_each(middle_values.begin(), middle_values.end(), create_kwarg_fn);
     VLOG(4) << "Create pd.data for backward program: fo_g, start with input_"
             << counter;
     std::for_each(forward_outputs_grads.begin(),
                   forward_outputs_grads.end(),
-                  create_data_fn);
+                  create_kwarg_fn);
     VLOG(4) << "Create pd.data for backward program end. input_" << counter;
   }

diff --git a/paddle/pir/include/core/block_argument.h b/paddle/pir/include/core/block_argument.h
index 017c0a6544f72..3ddf7847fd8a2 100644
--- a/paddle/pir/include/core/block_argument.h
+++ b/paddle/pir/include/core/block_argument.h
@@ -24,8 +24,8 @@ class BlockArgumentImpl;
 }  // namespace detail

 ///
-/// \brief BlockArgument class represents the value defined by a result of
-/// operation. This class only provides interfaces, for specific implementation,
+/// \brief BlockArgument class represents the value defined by an argument of
+/// a block. This class only provides interfaces, for specific implementation,
 /// see Impl class.
 ///
 class IR_API BlockArgument : public Value {
@@ -33,6 +33,8 @@ class IR_API BlockArgument : public Value {
   BlockArgument() = default;
   Block *owner() const;
   uint32_t index() const;
+  const std::string &keyword() const;
+  bool is_kwarg() const;

   const AttributeMap &attributes() const;
   Attribute attribute(const std::string &key) const;
@@ -44,6 +46,9 @@ class IR_API BlockArgument : public Value {

   /// create a new argument with the given type and owner.
static BlockArgument Create(Type type, Block *owner, uint32_t index); + static BlockArgument Create(Type type, + Block *owner, + const std::string &keyword); /// Destroy the argument. void Destroy(); /// set the position in the block argument list. @@ -56,5 +61,4 @@ class IR_API BlockArgument : public Value { static bool classof(Value value); static BlockArgument dyn_cast_from(Value value); }; - } // namespace pir diff --git a/paddle/pir/src/core/block.cc b/paddle/pir/src/core/block.cc index 9e4c6179e5af2..258f681b303cb 100644 --- a/paddle/pir/src/core/block.cc +++ b/paddle/pir/src/core/block.cc @@ -126,7 +126,7 @@ Value Block::AddKwarg(const std::string &keyword, Type type) { IR_ENFORCE(kwargs_.find(keyword) == kwargs_.end(), "Add keyword (%s) argument which has been existed.", keyword.c_str()); - auto arg = BlockArgument::Create(type, this, 0); + auto arg = BlockArgument::Create(type, this, keyword); kwargs_[keyword] = arg; return arg; } diff --git a/paddle/pir/src/core/block_argument.cc b/paddle/pir/src/core/block_argument.cc index 19c5e2b0ef917..99a799e9f592e 100644 --- a/paddle/pir/src/core/block_argument.cc +++ b/paddle/pir/src/core/block_argument.cc @@ -50,7 +50,15 @@ class BlockArgumentImpl : public ValueImpl { private: BlockArgumentImpl(Type type, Block *owner, uint32_t index) - : ValueImpl(type, BLOCK_ARG_IDX), owner_(owner), index_(index) {} + : ValueImpl(type, BLOCK_ARG_IDX), + owner_(owner), + index_(index), + is_kwarg_(false) {} + BlockArgumentImpl(Type type, Block *owner, const std::string &keyword) + : ValueImpl(type, BLOCK_ARG_IDX), + owner_(owner), + is_kwarg_(true), + keyword_(keyword) {} ~BlockArgumentImpl(); // access construction and owner @@ -58,7 +66,9 @@ class BlockArgumentImpl : public ValueImpl { AttributeMap attributes_; Block *owner_; - uint32_t index_; + uint32_t index_ = 0xFFFFFFFF; + bool is_kwarg_; + std::string keyword_ = "uninitialized_keyword"; }; BlockArgumentImpl::~BlockArgumentImpl() { @@ -85,6 +95,16 @@ uint32_t BlockArgument::index() const { return IMPL_->index_; } +const std::string &BlockArgument::keyword() const { + CHECK_NULL_IMPL(keyword); + return IMPL_->keyword_; +} + +bool BlockArgument::is_kwarg() const { + CHECK_NULL_IMPL(is_kwarg); + return IMPL_->is_kwarg_; +} + const AttributeMap &BlockArgument::attributes() const { CHECK_NULL_IMPL(attributes_); return IMPL_->attributes_; @@ -101,6 +121,12 @@ void BlockArgument::set_attribute(const std::string &key, Attribute value) { BlockArgument BlockArgument::Create(Type type, Block *owner, uint32_t index) { return new detail::BlockArgumentImpl(type, owner, index); } + +BlockArgument BlockArgument::Create(Type type, + Block *owner, + const std::string &keyword) { + return new detail::BlockArgumentImpl(type, owner, keyword); +} /// Destroy the argument. 
void BlockArgument::Destroy() { if (impl_) { diff --git a/python/paddle/jit/dy2static/pir_partial_program.py b/python/paddle/jit/dy2static/pir_partial_program.py index c8068a59a0f17..2a55277fd77b4 100644 --- a/python/paddle/jit/dy2static/pir_partial_program.py +++ b/python/paddle/jit/dy2static/pir_partial_program.py @@ -134,6 +134,8 @@ def get_value_name_map(self): def _get_value_name_map_from_program(cls, program): ret = ValueDict() ret[fake_value()] = "FakeVar" + for keyword, arg in program.global_block().kwargs().items(): + ret[arg] = keyword for op in program.global_block().ops: if op.name() == "builtin.set_parameter": ret[op.operand(0).source()] = op.attrs()["parameter_name"] From 8ee6609278a91e808b7d0140bc84f327f8d2c445 Mon Sep 17 00:00:00 2001 From: diadestiny <44188454+diadestiny@users.noreply.github.com> Date: Fri, 23 Feb 2024 10:22:11 +0800 Subject: [PATCH 10/82] [SOT][3.12] Support `RETURN_CONST` opcode in Python 3.12 (#61964) --- .../jit/sot/opcode_translator/executor/opcode_executor.py | 7 +++++++ .../opcode_translator/executor/opcode_inline_executor.py | 4 ++++ test/sot/skip_files_py312 | 7 ------- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py index 5f193cebc085d..ccfae0a888f02 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@ -2114,6 +2114,13 @@ def RETURN_VALUE(self, instr: Instruction): len(self.stack) == 1 ), f"Stack must have one element, but get {len(self.stack)} elements." ret_val = self.stack.pop() + return self.compile_return(ret_val) + + def RETURN_CONST(self, instr: Instruction): + ret_const = self._co_consts[instr.arg] + return self.compile_return(ret_const) + + def compile_return(self, ret_val): compile_fn = self._graph.get_compiled_fn(ret_val) if compile_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): self.new_code = None diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py index 4baa64e884107..3832d05f04448 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py @@ -283,6 +283,10 @@ def RETURN_VALUE(self, instr: Instruction): self.return_value = self.stack.pop() return Stop(state="Return") + def RETURN_CONST(self, instr: Instruction): + self.return_value = self._co_consts[instr.arg] + return Stop(state="Return") + def _break_graph_when_if(self, result, instr: Instruction): """ Helper method to raise a BreakGraphError when breaking the graph in a jump operation. 
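A quick way to see the opcode this patch targets (an illustrative aside, not part of the change; it assumes CPython 3.12, where `return <constant>` compiles to a single RETURN_CONST instead of LOAD_CONST plus RETURN_VALUE, which is why the handlers above resolve instr.arg directly against co_consts):

    import dis
    import sys

    def f():
        return 1  # a constant return value

    # RETURN_CONST was added in CPython 3.12; earlier interpreters emit
    # LOAD_CONST followed by RETURN_VALUE for the same source line.
    assert sys.version_info[:2] == (3, 12)
    for ins in dis.get_instructions(f):
        print(ins.opname, ins.argval)  # final line prints: RETURN_CONST 1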
diff --git a/test/sot/skip_files_py312 b/test/sot/skip_files_py312
index 3cc5b8d4439e0..d79956533e2d3 100644
--- a/test/sot/skip_files_py312
+++ b/test/sot/skip_files_py312
@@ -14,14 +14,7 @@
 ./test_guard_user_defined_fn.py
 ./test_inplace_api.py
 ./test_min_graph_size.py
-./test_output_restoration.py
 ./test_side_effects.py
-./test_simulate_initialize.py
-./test_sir_rollback.py
 ./test_sot_cost_model.py
-./test_sot_export.py
 ./test_sot_resnet.py
 ./test_sot_resnet50_backward.py
-./test_specialization.py
-./test_str_format.py
-./test_builtin_bool.py

From e6510e8f81858e39aee6448182a0d5ef5cda47c1 Mon Sep 17 00:00:00 2001
From: Difer <707065510@qq.com>
Date: Fri, 23 Feb 2024 11:00:37 +0800
Subject: =?UTF-8?q?=E3=80=90PIR=20Dist=20Op=20Reg=20No.13?=
 =?UTF-8?q?=E3=80=91=20reg=20partial=5Fsend=20(#60484)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* reg partial_send

* fix cmake

* fix import

* fix cmake

* try to remove out

* add partial_send to prim gen blacklist

* fix typo
---
 .../pir/dialect/op_generator/ops_api_gen.py   |  1 +
 paddle/fluid/pir/dialect/operator/ir/ops.yaml | 10 +++++
 .../fluid/pir/dialect/operator/utils/utils.cc |  3 +-
 paddle/fluid/primitive/codegen/gen.py         |  1 +
 paddle/phi/api/yaml/op_compat.yaml            |  4 ++
 paddle/phi/infermeta/unary.cc                 | 28 +++++++++++++
 paddle/phi/infermeta/unary.h                  |  7 ++++
 test/ir/pir/translator/CMakeLists.txt         |  1 +
 .../test_partial_send_translator.py           | 40 +++++++++++++++++++
 9 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 test/ir/pir/translator/test_partial_send_translator.py

diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
index 13c656207f1b8..8328e406ae0e6 100644
--- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -156,6 +156,7 @@
     'c_reduce_min_',
     'push_sparse_v2',
     'push_sparse_v2_',
+    'partial_send',
 ]

diff --git a/paddle/fluid/pir/dialect/operator/ir/ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
index d32c1d8b7a6bd..4fcd90c99fe0a 100644
--- a/paddle/fluid/pir/dialect/operator/ir/ops.yaml
+++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml
@@ -1654,6 +1654,16 @@
   kernel:
     func: onednn_to_paddle_layout

+- op: partial_send
+  args: (Tensor x, int ring_id = 0, int peer = 0, bool use_calc_stream = false, int num = 1, int id = 0)
+  output :
+  infer_meta:
+    func: PartialSendInferMeta
+    param: [x, ring_id, peer, use_calc_stream, num, id]
+  kernel:
+    func: partial_send
+    param: [x, ring_id, peer, use_calc_stream, num, id]
+
 - op: sparse_momentum
   args: (Tensor param, Tensor grad, Tensor velocity, Tensor index, Tensor learning_rate, Tensor master_param,float mu, Scalar axis=0, bool use_nesterov=false,str regularization_method="", float regularization_coeff=0.0f, bool multi_precision=false, float rescale_grad=1.0f)
   output: Tensor(param_out), Tensor(velocity_out), Tensor(master_param_out)

diff --git a/paddle/fluid/pir/dialect/operator/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc
index ea8002c1c842f..b4bad427567b7 100644
--- a/paddle/fluid/pir/dialect/operator/utils/utils.cc
+++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc
@@ -80,7 +80,8 @@ const std::unordered_set LegacyOpList = {
     paddle::onednn::dialect::MultiGruOp::name(),
 #endif
     CReduceMinOp::name(),
-    PushSparseV2Op::name()};
+    PushSparseV2Op::name(),
+    PartialSendOp::name()};

 enum class AttrType {
   UNDEFINED = 0,

diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py
index 3c6791a344a8b..fb1579968423a 100644
--- a/paddle/fluid/primitive/codegen/gen.py
+++ b/paddle/fluid/primitive/codegen/gen.py
@@ -52,6 +52,7 @@
     "embedding_sparse_grad",
     "embedding_grad",
     "full",
+    "partial_send",
 ]

 # prim op with one input and one output, with no attribute

diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 237724dabe69f..53e0cea953b87 100755
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -3653,6 +3653,10 @@
   outputs :
     out : Out

+- op: partial_send
+  inputs :
+    x : X
+
 - op: read_from_array
   inputs:
     array : X

diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index 5092072f5a87c..3b47085eee9b1 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -2915,6 +2915,34 @@ void Pad3dInferMeta(const MetaTensor& x,
   out->share_lod(x);
 }

+void PartialSendInferMeta(const MetaTensor& x,
+                          int ring_id,
+                          int peer,
+                          bool use_calc_stream,
+                          int num,
+                          int id) {
+  PADDLE_ENFORCE_GE(
+      peer,
+      0,
+      phi::errors::InvalidArgument(
+          "The peer (%d) for partial_send op must be non-negative.", peer));
+  PADDLE_ENFORCE_GE(
+      ring_id,
+      0,
+      phi::errors::InvalidArgument(
+          "The ring_id (%d) for partial_send op must be non-negative.",
+          ring_id));
+  PADDLE_ENFORCE_GE(num,
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The num (%d) for partial_send op must be >= 1.", num));
+  PADDLE_ENFORCE_EQ(
+      (id >= 0 && id < num),
+      true,
+      phi::errors::InvalidArgument(
+          "The id (%d) for partial_send op must be >= 0 and less than num.",
+          id));
+}

Date: Fri, 23 Feb 2024 11:01:07 +0800
Subject: [PATCH 12/82] [PIR+CINN]Fix cluster node bug (#61890)

* fix cluster node bug

* polish cinn group cluster pass

* update

* polish code

* polish cluster node pass
---
 .../transforms/cinn_group_cluster_pass.cc     | 485 +++++++++++-------
 .../transforms/cinn_group_cluster_pass.h      |   3 +
 paddle/cinn/hlir/framework/pir/utils.h        |  10 +-
 test/cpp/pir/cinn/ir_op_cluster_test.cc       |   2 +-
 4 files changed, 305 insertions(+), 195 deletions(-)

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
index 046eb7442e8a4..b36afc9bd056f 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
+++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc
@@ -73,7 +73,7 @@ std::unordered_set<::pir::Value> GetListOutsideInput(
   std::unordered_set outside_ops;
   auto block_inner_output = GetInnerGeneValue(ops);

-  for (auto& op : ops) {
+  for (const auto& op : ops) {
     for (size_t i = 0; i < op->num_operands(); ++i) {
       if (!block_inner_output.count(op->operand_source(i)) &&
           !outside_ops.count(op->operand_source(i))) {
@@ -95,17 +95,33 @@ bool IsLastReshape(::pir::Operation* input_op) {
   return false;
 }

+std::string BuildGroupId(const ::pir::GroupOpsVec& ops_list) {
+  std::string group_id;
+  for (const auto& op : ops_list) {
+    if (group_id != "") {
+      group_id += "_";
+    }
+    group_id += op->name();
+  }
+
+  return group_id;
+}
 struct GroupClusterNode {
+  // all the ops in each Node
   std::vector<::pir::Operation*> ops;
+  // group kind
   cinn::hlir::framework::OpPatternKind group_kind{
       cinn::hlir::framework::kElementWise};
+  // reduce_axis if kind is Reduce else empty
  std::vector reduce_axis;
+  // if kind is reduce, loop ranges equal input dim
+  // if kind is elementwise or broadcast, loop ranges equal output dim
  std::vector loop_ranges;

  std::unordered_map<::pir::Operation*, std::vector>
alignment_schedule_info; - std::unordered_set<::pir::Value> GetOutsideInput() { + std::unordered_set<::pir::Value> GetOutsideInput() const { return GetListOutsideInput(ops); } @@ -126,7 +142,7 @@ struct GroupClusterNode { } ss << "\n"; - for (auto op : ops) { + for (const auto& op : ops) { printer.PrintOperation(op); if (alignment_schedule_info.count(op)) { for (auto& node : alignment_schedule_info.at(op)) { @@ -139,100 +155,134 @@ struct GroupClusterNode { return ss.str(); } - void GenerateOutputValue( - const std::unordered_set<::pir::Value>& outside_need_value) { - output_value.clear(); - for (auto& op : ops) { - if (op->name() == "cf.yield") { - continue; - } - - std::unordered_set<::pir::Value> inserted_val; - for (size_t i = 0; i < op->num_results(); ++i) { - if (outside_need_value.count(op->result(i))) { - if (!inserted_val.count(op->result(i))) { - output_value.push_back(op->result(i)); - - inserted_val.insert(op->result(i)); - } - } - } - } - } - void MergeNode(const GroupClusterNode& node, - const ScheduleInfoNode& sch_node) { + const ScheduleInfoNode& inner_sch_node) { std::unordered_set<::pir::Operation*> inner_ops(ops.begin(), ops.end()); - if (sch_node.type != "") { - // all the data need add sch node - for (auto op : ops) { - alignment_schedule_info[op].push_back(sch_node); + if (inner_sch_node.type != hlir::framework::pir::ScheduleAlignType::kNone) { + for (const auto& op : ops) { + this->alignment_schedule_info[op].push_back(inner_sch_node); } } - for (auto op : node.ops) { + for (const auto& op : node.ops) { if (!inner_ops.count(op)) { - ops.push_back(op); + this->ops.push_back(op); // copy align info if (node.alignment_schedule_info.count(op)) { - alignment_schedule_info[op] = node.alignment_schedule_info.at(op); + this->alignment_schedule_info[op] = + node.alignment_schedule_info.at(op); } - - // if( sch_node.type != "" ) - // { - // alignment_schedule_info[op].push_back( sch_node); - // } } } - if (group_kind < node.group_kind) { - group_kind = node.group_kind; + if (this->group_kind < node.group_kind) { + this->group_kind = node.group_kind; } if ((node.group_kind == cinn::hlir::framework::kReduction) || (node.group_kind == cinn::hlir::framework::kBroadcast)) { - loop_ranges = node.loop_ranges; + this->loop_ranges = node.loop_ranges; } if (node.group_kind == cinn::hlir::framework::kReduction) { - reduce_axis = node.reduce_axis; + this->reduce_axis = node.reduce_axis; } if ((ops.size() == 1) && (ops.front()->name() == "cinn_op.reshape")) { - loop_ranges = node.loop_ranges; + this->loop_ranges = node.loop_ranges; } } - std::vector<::pir::Value> output_value; + void MergePreNode(const GroupClusterNode& node, + const ScheduleInfoNode& pre_sch_node) { + std::unordered_set<::pir::Operation*> inner_ops(ops.begin(), ops.end()); + + for (const auto& op : node.ops) { + if (!inner_ops.count(op)) { + this->ops.push_back(op); + // copy align info + if (node.alignment_schedule_info.count(op)) { + this->alignment_schedule_info[op] = + node.alignment_schedule_info.at(op); + } + + if (pre_sch_node.type != + hlir::framework::pir::ScheduleAlignType::kNone) { + this->alignment_schedule_info[op].push_back(pre_sch_node); + } + } + } + + if (group_kind < node.group_kind) { + this->group_kind = node.group_kind; + } + } }; -::pir::Operation* ReplaceWithGroupOp(pir::PatternRewriter* rewriter, - const ::pir::GroupOpsVec& group_ops, - const GroupClusterNode& node, - ::pir::IrMapping* ir_mapping) { - ::pir::IrContext* ctx = ::pir::IrContext::Instance(); - ctx->GetOrRegisterDialect(); - 
ctx->GetOrRegisterDialect<::pir::ControlFlowDialect>(); +std::vector<::pir::Value> GenerateOutputValue( + const std::vector<::pir::Operation*>& ops, + const std::unordered_map<::pir::Value, size_t>& outside_need_value) { + std::vector<::pir::Value> temp_out; + for (const auto& op : ops) { + if (op->isa()) { + continue; + } + + std::unordered_set<::pir::Value> inserted_val; + for (size_t i = 0; i < op->num_results(); ++i) { + if (outside_need_value.count(op->result(i))) { + if (!inserted_val.count(op->result(i))) { + temp_out.push_back(op->result(i)); + + inserted_val.insert(op->result(i)); + } + } + } + } + std::sort(temp_out.begin(), + temp_out.end(), + [&outside_need_value](::pir::Value a, ::pir::Value b) { + return outside_need_value.at(a) < outside_need_value.at(b); + }); + + return temp_out; +} + +cinn::dialect::GroupInfo BuildGroupInfo( + const ::pir::GroupOpsVec& vec_new_op_list, + const GroupClusterNode& node, + const std::unordered_map<::pir::Operation*, std::vector>& + new_align_info) { + cinn::dialect::GroupInfo group_info({}); + group_info.group_id = BuildGroupId(vec_new_op_list); + group_info.loop_ranges = node.loop_ranges; + group_info.reduce_axis = node.reduce_axis; + group_info.op_pattern_kind = node.group_kind; + group_info.alignment_schedule_info = new_align_info; - // step 1: Ensure the insert point and create GroupOp here. - auto* last_op = group_ops.back(); + return group_info; +} - auto output_value = node.output_value; - auto alignment_schedule_info = node.alignment_schedule_info; +std::vector BuildOutType( + const std::vector<::pir::Value>& output_value) { std::vector output_types; - // std::vector outputs = ::pir::AnalysisOutputs(group_ops); - // ::pir::IrMapping ir_mapping; - for (auto& value : output_value) { + for (const auto& value : output_value) { output_types.emplace_back(value.type()); } - ::pir::CloneOptions clone_options(false, true, false); + return output_types; +} +::pir::GroupOpsVec CloneOps( + const ::pir::GroupOpsVec& group_ops, + const GroupClusterNode& node, + ::pir::IrMapping* ir_mapping, + std::unordered_map<::pir::Operation*, std::vector>* + align_info) { std::vector<::pir::Operation*> vec_new_op_list; - std::unordered_map<::pir::Operation*, std::vector> - new_align_info; + ::pir::CloneOptions clone_options(false, true, false); - std::string group_id; + auto& alignment_schedule_info = node.alignment_schedule_info; for (auto op : group_ops) { auto new_op = op->Clone(*ir_mapping, clone_options); auto& shape_analysis = @@ -244,24 +294,34 @@ ::pir::Operation* ReplaceWithGroupOp(pir::PatternRewriter* rewriter, } vec_new_op_list.push_back(new_op); - if (group_id != "") { - group_id += "_"; - } - group_id += new_op->name(); if (alignment_schedule_info.count(op)) { - new_align_info[new_op] = alignment_schedule_info.at(op); + align_info->emplace(new_op, alignment_schedule_info.at(op)); } } - cinn::dialect::GroupInfo group_info({}); - group_info.group_id = group_id; - group_info.loop_ranges = node.loop_ranges; - group_info.reduce_axis = node.reduce_axis; - group_info.op_pattern_kind = node.group_kind; - group_info.alignment_schedule_info = new_align_info; + return vec_new_op_list; +} + +::pir::Operation* ReplaceWithGroupOp( + pir::PatternRewriter* rewriter, + const ::pir::GroupOpsVec& group_ops, + const GroupClusterNode& node, + const std::vector<::pir::Value> output_value, + ::pir::IrMapping* ir_mapping) { + ::pir::IrContext* ctx = ::pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect<::pir::ControlFlowDialect>(); + 
  std::unordered_map<::pir::Operation*, std::vector>
+      new_align_info;
+
+  auto vec_new_op_list = CloneOps(group_ops, node, ir_mapping, &new_align_info);
+
+  auto group_info = BuildGroupInfo(vec_new_op_list, node, new_align_info);
   // step 2: Replace the old op with GroupOp.
+
+  auto output_types = BuildOutType(output_value);
   auto new_fusion_op = rewriter->Build(output_types, group_info);
   pir::Block* fusion_block = new_fusion_op.block();
@@ -337,7 +397,7 @@ bool CanFuse(const GroupClusterNode& first,
   }

   if (first.loop_ranges != second.loop_ranges) {
-    sch_node->type = "broadcast";
+    sch_node->type = hlir::framework::pir::ScheduleAlignType::kBroadcast;
     sch_node->axis_info = first.reduce_axis;
     sch_node->factor_info = first.loop_ranges;
   }
@@ -350,17 +410,22 @@ std::vector SortNodeList(std::vector* node_list_ptr,
                          std::vector>* pre_ids_ptr) {
+  // sort node list by topological sort
+  // TODO(phlrain): One node may have two pre nodes, need update here
   auto& node_list = *node_list_ptr;
   auto& pre_ids = *pre_ids_ptr;

-  std::unordered_set<::pir::Value> all_ouput_values;
-  for (auto& node : node_list) {
+  std::unordered_map<::pir::Value, size_t> in_out_values;
+  for (const auto& node : node_list) {
     auto node_outside_input = node.GetOutsideInput();
-    all_ouput_values.insert(node_outside_input.begin(),
-                            node_outside_input.end());
+    for (const auto& val : node_outside_input) {
+      size_t id = in_out_values.size();
+      in_out_values.emplace(val, id);
+    }
   }

-  for (auto& node : node_list) {
-    node.GenerateOutputValue(all_ouput_values);
+  std::vector> output_values_list;
+  for (const auto& node : node_list) {
+    output_values_list.push_back(GenerateOutputValue(node.ops, in_out_values));
   }

   std::vector> next_ids;
@@ -371,7 +436,7 @@ std::vector SortNodeList(std::vector* node_list_ptr,
         continue;
       }

-      auto pre_out_list = node_list[i].output_value;
+      const auto& pre_out_list = output_values_list[i];
       auto next_in_set = node_list[j].GetOutsideInput();

       for (auto val : pre_out_list) {
@@ -462,7 +527,7 @@ void GetClusterNodeBasicInfo(::pir::Operation* op,
                      .dyn_cast()
                      .dims());

-    sch_node->type = "broadcast";
+    sch_node->type = hlir::framework::pir::ScheduleAlignType::kBroadcast;
     sch_node->axis_info =
         cinn::dialect::ir::GetVectorAttr(op, "broadcast_axes");
     sch_node->factor_info = cinn::dialect::ir::GetVectorAttr(op, "out_shape");
@@ -489,6 +554,8 @@ bool CanOpMergeNode(
     const std::unordered_map<::pir::Operation*, GroupClusterNode>& op_path_info,
     ::pir::Operation* pre_op,
     ::pir::Operation* cur_op) {
+  const auto& node1 = op_path_info.at(pre_op);
+  const auto& node2 = op_path_info.at(cur_op);
   // reduce can not fuse with any op in first stage
   if (cinn::hlir::framework::pir::CompatibleInfo::OpKind(*pre_op) ==
       cinn::hlir::framework::kReduction) {
@@ -528,9 +595,71 @@ bool ShouldOutputPreNode(
   return false;
 }

-std::vector GroupSplit(cinn::dialect::GroupOp group_op) {
-  ::pir::IrContext* ctx = ::pir::IrContext::Instance();
+std::vector NodeMergeWithNode(
+    const std::vector& first_stage_output) {
+  // stage 2 merge
+  // for now we merge nodes in the same pass
+  // only for vertical fusion
+  std::vector second_stage_output = first_stage_output;
+  while (true) {
+    bool fused = false;
+    std::vector temp_out;
+
+    std::set fused_index;
+
+    std::vector> pre_ids_info;
+    auto sort_list = SortNodeList(&second_stage_output, &pre_ids_info);
+
+    std::reverse(sort_list.begin(), sort_list.end());
+    for (auto node_index : sort_list) {
+      if (fused_index.count(node_index)) {
+        continue;
+      }
+      const auto& node = second_stage_output[node_index];
+      const auto& pre_ids = pre_ids_info[node_index];
+
+      GroupClusterNode new_node = node;
+
+      for (auto pre_id : pre_ids) {
+        // get pre id
+
+        if (fused_index.count(pre_id)) {
+          continue;
+        }
+
+        // can new_node merge with pre_id node
+        const auto& pre_node = second_stage_output[pre_id];
+
+        ScheduleInfoNode sch_node;
+        auto can_fuse = CanFuse(pre_node, new_node, &sch_node);
+        if (can_fuse) {
+          // merge pre node to new_node
+          new_node.MergeNode(pre_node, sch_node);
+
+          fused_index.insert(pre_id);
+          fused = true;
+        } else {
+          temp_out.insert(temp_out.begin(), pre_node);
+        }
+      }
+      temp_out.insert(temp_out.end(), new_node);
+    }
+
+    if (temp_out.size() >= second_stage_output.size()) {
+      break;
+    }
+    second_stage_output.swap(temp_out);
+    if (fused == false) {
+      break;
+    }
+  }
+
+  return second_stage_output;
+}
+
+std::vector OpMergeWithOp(cinn::dialect::GroupOp group_op) {
+  // op merges with op
   auto inner_values = GetInnerGeneValue(group_op.GetOperators());

   std::unordered_map<::pir::Operation*, GroupClusterNode> op_path;
@@ -540,6 +669,7 @@ std::vector GroupSplit(cinn::dialect::GroupOp group_op) {
   std::vector first_stage_output;

   std::unordered_set<::pir::Operation*> yield_output_ops;
+  std::unordered_set<::pir::Operation*> first_output_ops;
   auto yield_op = op_list.back();
   for (size_t i = 0; i < yield_op->num_operands(); ++i) {
     if (yield_op->operand_source(i).defining_op()->result(0).use_count() == 1) {
     }

+  // first stage: fuse op with op
   for (auto* op : op_list) {
     if (op->isa<::pir::YieldOp>()) {
       continue;
     }

     if (CanOpMergeNode(op_path, pre_op, op)) {
-      cluster_node.MergeNode(op_path.at(pre_op), sch_node);
+      cluster_node.MergePreNode(op_path.at(pre_op), sch_node);
     }

     // TODO(phlrain): should remove this strategy
     if (ShouldOutputPreNode(op_path, pre_op, op)) {
       // Can not merge here, should output pre_op cluster Node
-      first_stage_output.push_back(op_path[pre_op]);
+      if (!first_output_ops.count(pre_op)) {
+        first_stage_output.push_back(op_path[pre_op]);
+        first_output_ops.insert(pre_op);
+      }
       continue;
     }
   }

   if (yield_output_ops.count(op) ||
       cinn::hlir::framework::pir::CompatibleInfo::OpKind(*op) ==
           cinn::hlir::framework::kReduction) {
-    first_stage_output.push_back(op_path[op]);
+    // TODO(phlrain): yield output does not need to be pushed into first
+    // stage output. Update here
+    if (!first_output_ops.count(op)) {
+      first_stage_output.push_back(op_path[op]);
+      first_output_ops.insert(op);
+    }
   }
 }

+  return first_stage_output;
+}
+
+std::vector GroupSplit(cinn::dialect::GroupOp group_op) {
+  // stage 1
+  auto first_stage_output = OpMergeWithOp(group_op);
+  if (first_stage_output.size() <= 1) {
     return first_stage_output;
   }
-  // stage 2 merge
-  // for now we merge node in same pass
-  // only for vertial fuse
-  std::vector second_stage_output = first_stage_output;
-  while (true) {
-    bool fused = false;
-    std::vector temp_out;
-
-    std::set fused_index;
-
-    std::vector> pre_ids_info;
-    auto sort_list = SortNodeList(&second_stage_output, &pre_ids_info);
-
-    std::reverse(sort_list.begin(), sort_list.end());
-    for (auto node_index : sort_list) {
-      if (fused_index.count(node_index)) {
-        continue;
-      }
-      auto& node = second_stage_output[node_index];
-      auto& pre_ids = pre_ids_info[node_index];
-
-      GroupClusterNode new_node = node;
-
-      for (auto pre_id : pre_ids)
{ - // get pre id - - if (fused_index.count(pre_id)) { - continue; - } - - // can new_node merge with pre_id node - auto& pre_node = second_stage_output[pre_id]; - - ScheduleInfoNode sch_node; - auto can_fuse = CanFuse(pre_node, new_node, &sch_node); - - if (can_fuse) { - // merge pre node to new_node - new_node.MergeNode(pre_node, sch_node); - - fused_index.insert(pre_id); - fused = true; - } else { - temp_out.insert(temp_out.begin(), pre_node); - } - } - temp_out.insert(temp_out.end(), new_node); - } - if (temp_out.size() >= second_stage_output.size()) { - break; - } - second_stage_output.swap(temp_out); - if (fused == false) { - break; - } - } + // stage 2 + auto second_stage_output = NodeMergeWithNode(first_stage_output); if (second_stage_output.size() == 1) { return second_stage_output; @@ -663,6 +755,49 @@ std::vector GroupSplit(cinn::dialect::GroupOp group_op) { return sorted_out; } +std::vector<::pir::Operation*> SortByOriginalOrderAndUniq( + cinn::dialect::GroupOp group_op, + const std::vector<::pir::Operation*>& ops) { + size_t index = 0; + std::unordered_map op2order_value; + + for (auto op : group_op.GetOperators()) { + op2order_value[op] = index++; + } + + std::vector tmp_ops(ops); + std::sort(tmp_ops.begin(), + tmp_ops.end(), + [&op2order_value](pir::Operation* a, pir::Operation* b) { + return op2order_value.at(a) < op2order_value.at(b); + }); + + std::unique(tmp_ops.begin(), tmp_ops.end()); + + return tmp_ops; +} + +std::unordered_map<::pir::Value, size_t> BuildValueOrderByYieldOp( + const std::vector& node_list, + cinn::dialect::GroupOp group_op) { + std::unordered_map<::pir::Value, size_t> all_output_values; + auto yield_op = group_op.GetOperators().back(); + for (size_t i = 0; i < yield_op->num_operands(); ++i) { + size_t id = all_output_values.size(); + all_output_values.emplace(yield_op->operand_source(i), id); + } + + for (size_t i = 0; i < node_list.size(); ++i) { + auto node_outside_input = node_list[i].GetOutsideInput(); + for (const auto& val : node_outside_input) { + size_t id = all_output_values.size(); + all_output_values.emplace(val, id); + } + } + + return all_output_values; +} + } // namespace class CinnGroupClusterPattern @@ -675,68 +810,34 @@ class CinnGroupClusterPattern ::pir::IrMapping ir_mapping; auto group_outside_input = GetListOutsideInput(group_op.GetOperators()); + // insert initial input to ir mapping for (auto val : group_outside_input) { ir_mapping.Add(val, val); } auto split_res = GroupSplit(group_op); - // need sort split res - - std::unordered_set<::pir::Value> all_ouput_values; - for (auto& node : split_res) { - auto node_outside_input = node.GetOutsideInput(); - all_ouput_values.insert(node_outside_input.begin(), - node_outside_input.end()); - } - - size_t index = 0; - std::unordered_map op2id; - - for (auto op1 : group_op.GetOperators()) { - op2id[op1] = index++; - } - auto yield_op = group_op.GetOperators().back(); - for (size_t i = 0; i < yield_op->num_operands(); ++i) { - all_ouput_values.insert(yield_op->operand_source(i)); - } + auto all_output_values = BuildValueOrderByYieldOp(split_res, group_op); for (auto& node : split_res) { - node.GenerateOutputValue(all_ouput_values); - std::vector tmp_ops(node.ops.begin(), node.ops.end()); - std::sort(tmp_ops.begin(), - tmp_ops.end(), - [&op2id](pir::Operation* a, pir::Operation* b) { - return op2id.at(a) < op2id.at(b); - }); - - std::unique(tmp_ops.begin(), tmp_ops.end()); - - auto node_outside_input = node.GetOutsideInput(); - - auto insert_point = - ReplaceWithGroupOp(&rewriter, tmp_ops, 
          node, &ir_mapping);
-
-      for (size_t i = 0; i < node.output_value.size(); ++i) {
-        ir_mapping.Add(node.output_value[i], insert_point->result(i));
-      }
-
-      std::unordered_set<::pir::Value> local_outs(node.output_value.begin(),
-                                                  node.output_value.end());
+      auto output_values = GenerateOutputValue(node.ops, all_output_values);
+      auto uniq_ops = SortByOriginalOrderAndUniq(group_op, node.ops);

-      int local_index = 0;
+      auto new_group_op = ReplaceWithGroupOp(
+          &rewriter, uniq_ops, node, output_values, &ir_mapping);

-      std::unordered_map<::pir::Value, size_t> value_order;
-      for (size_t i = 0; i < yield_op->num_operands(); ++i) {
-        value_order[yield_op->operand_source(i)] = i;
+      // update ir mapping
+      for (size_t i = 0; i < output_values.size(); ++i) {
+        ir_mapping.Add(output_values[i], new_group_op->result(i));
       }

-      for (size_t i = 0; i < node.output_value.size(); ++i) {
-        if (value_order.count(node.output_value[i])) {
-          // replace
-          rewriter.ReplaceAllUsesWith(
-              group_op.result(value_order.at(node.output_value[i])),
-              insert_point->result(i));
+      for (size_t i = 0; i < output_values.size(); ++i) {
+        auto find_it = all_output_values.find(output_values[i]);
+        if ((find_it != all_output_values.end()) &&
+            (find_it->second < group_op->num_results())) {
+          // id < num_results means yield input
+          rewriter.ReplaceAllUsesWith(group_op.result(find_it->second),
+                                      new_group_op->result(i));
         }
       }
     }

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.h b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.h
index b4e6cf7d511cd..2350244fdfe38 100644
--- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.h
+++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.h
@@ -20,6 +20,9 @@ namespace cinn {
 namespace dialect {
 namespace ir {

+// Split one GroupOp into multiple small GroupOps.
+// Each small GroupOp can generate one kernel via the CINN backend.
+
 IR_API std::unique_ptr CreateCinnGroupClusterPass();

 }  // namespace ir

diff --git a/paddle/cinn/hlir/framework/pir/utils.h b/paddle/cinn/hlir/framework/pir/utils.h
index 56cbf2c539648..225f16f5caad2 100644
--- a/paddle/cinn/hlir/framework/pir/utils.h
+++ b/paddle/cinn/hlir/framework/pir/utils.h
@@ -113,8 +113,14 @@ class PrettyNamer {
   ::cinn::common::NameGenerator name_generator_;
 };

+enum class ScheduleAlignType : int {
+  kNone = 0,       //! No need to align
+  kBroadcast = 1,  //! Using Broadcast schedule to align
+};
+
 struct ScheduleInfoNode {
-  std::string type;
+  // TODO(phlrain): update align type by new loop alignment
+  ScheduleAlignType type{ScheduleAlignType::kNone};

   std::vector axis_info;
   std::vector factor_info;
@@ -122,7 +128,7 @@ struct ScheduleInfoNode {
   std::string DebugStr() {
     std::stringstream ss;

-    ss << "type " << type << "| axis info ";
+    ss << "type " << static_cast(type) << "| axis info ";
     for (auto d : axis_info) {
       ss << " " << d;
     }

diff --git a/test/cpp/pir/cinn/ir_op_cluster_test.cc b/test/cpp/pir/cinn/ir_op_cluster_test.cc
index dbca4fae66ebd..5fac91e0c2f48 100644
--- a/test/cpp/pir/cinn/ir_op_cluster_test.cc
+++ b/test/cpp/pir/cinn/ir_op_cluster_test.cc
@@ -623,5 +623,5 @@ TEST(IROpFusionPass, layer_norm2) {
   CHECK_EQ(pm.Run(&program), true);

   // TODO(phlrain): need update same as 4u
-  ASSERT_EQ(program.block()->size(), 11u);
+  ASSERT_EQ(program.block()->size(), 10u);
 }

From 84fa05349c09700bb66b7686e4ffb63bbda6dfdc Mon Sep 17 00:00:00 2001
From: Jia Wenxuan <64853160+JiaWenxuan@users.noreply.github.com>
Date: Fri, 23 Feb 2024 11:05:09 +0800
Subject: [PATCH 13/82] Substitute dim expr (#61888)

* add SubstituteDimExprBasedOnConstraintPass

* fixed the unittest file

* fix some bugs

* fix some bugs

* fix some bugs

* fix some bugs
---
 paddle/cinn/common/union_find.h               |  40 ++++
 ...tute_dim_expr_based_on_constraints_pass.cc | 183 ++++++++++++++++++
 ...itute_dim_expr_based_on_constraints_pass.h |  28 +++
 paddle/fluid/pybind/pir.cc                    |   3 +
 ...substitute_dim_expr_based_on_constraint.py |  85 ++++++++
 5 files changed, 339 insertions(+)
 create mode 100644 paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc
 create mode 100644 paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h
 create mode 100644 test/ir/pir/cinn/symbolic/test_substitute_dim_expr_based_on_constraint.py

diff --git a/paddle/cinn/common/union_find.h b/paddle/cinn/common/union_find.h
index 18a2ee2bf69ae..a88f52dafe515 100644
--- a/paddle/cinn/common/union_find.h
+++ b/paddle/cinn/common/union_find.h
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include

 #include "paddle/cinn/common/object.h"
@@ -97,5 +98,44 @@ struct UnionFind {
   std::vector> nodes;
 };

+template
+class UnionFindSet {
+ public:
+  T Find(const T& x) {
+    if (parent_.find(x) == parent_.end()) {
+      return x;
+    }
+    if (parent_[x] != x) {
+      parent_[x] = Find(parent_[x]);
+    }
+    return parent_[x];
+  }
+
+  void Union(const T& p, const T& q) {
+    if (parent_.find(p) == parent_.end()) {
+      parent_[p] = p;
+    }
+    if (parent_.find(q) == parent_.end()) {
+      parent_[q] = q;
+    }
+    parent_[Find(q)] = Find(p);
+  }
+
+  std::vector> Clusters() const {
+    std::unordered_map> clusters_map;
+    for (auto it = parent_.begin(); it != parent_.end(); it++) {
+      clusters_map[it->second].emplace_back(it->first);
+    }
+    std::vector> clusters;
+    for (auto it = clusters_map.begin(); it != clusters_map.end(); it++) {
+      clusters.emplace_back(it->second);
+    }
+    return clusters;
+  }
+
+ private:
+  std::unordered_map parent_;
+};
+
 }  // namespace common
 }  // namespace cinn

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc
new file mode 100644
index 0000000000000..68372afa3e9ca
--- /dev/null
+++
b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h" + +#include "paddle/cinn/common/dim_expr_util.h" +#include "paddle/cinn/common/union_find.h" + +namespace cinn { +namespace dialect { +namespace ir { + +namespace { + +template +void VisitEachOp(pir::ModuleOp module_op, const DoEachT& DoEach) { + for (uint32_t i = 0; i < module_op->num_regions(); i++) { + for (pir::Block& block : module_op->region(i)) { + for (pir::Operation& op : block) { + DoEach(op); + } + } + } +} + +template +void VisitEachValue(const pir::Operation& op, const DoEachT& DoEach) { + for (std::size_t i = 0; i < op.num_operands(); ++i) { + DoEach(op.operand_source(i)); + } + for (std::size_t i = 0; i < op.num_results(); ++i) { + DoEach(op.result(i)); + } +} + +symbol::TensorShapeOrDataDimExprs SubstituteTensorShapeOrData( + const symbol::TensorShapeOrDataDimExprs& shape_or_data, + const std::unordered_map& + substitution_pattern) { + auto SubstituteOneDimExpr = + [](const std::vector& original_dim_expr, + const std::unordered_map& + substitution_pattern) -> std::vector { + std::vector substituted_dim_expr{}; + for (const symbol::DimExpr& dim_expr : original_dim_expr) { + substituted_dim_expr.push_back( + cinn::common::SubstituteDimExpr(dim_expr, substitution_pattern)); + } + return substituted_dim_expr; + }; + + std::vector substituted_shape = + SubstituteOneDimExpr(shape_or_data.shape(), substitution_pattern); + if (!shape_or_data.data().has_value()) { + return symbol::ShapeOrData(substituted_shape); + } else { + std::vector substituted_data = SubstituteOneDimExpr( + shape_or_data.data().value(), substitution_pattern); + return symbol::ShapeOrData(substituted_shape, + substituted_data); + } +} + +symbol::ShapeOrDataDimExprs SubstituteShapeOrData( + const symbol::ShapeOrDataDimExprs& shape_or_data, + const std::unordered_map& + substitution_pattern) { + auto lambdas = symbol::Overloaded{ + [&](const symbol::TensorShapeOrDataDimExprs& tensor_shape_or_data) { + return symbol::ShapeOrDataDimExprs(SubstituteTensorShapeOrData( + tensor_shape_or_data, substitution_pattern)); + }, + [&](const symbol::TensorListShapeOrDataDimExprs& tensor_list) { + symbol::TensorListShapeOrDataDimExprs substituted_tensor_list; + for (symbol::TensorShapeOrDataDimExprs tensor_shape_or_data : + tensor_list) { + substituted_tensor_list.push_back(SubstituteTensorShapeOrData( + tensor_shape_or_data, substitution_pattern)); + } + return symbol::ShapeOrDataDimExprs(substituted_tensor_list); + }}; + return std::visit(lambdas, shape_or_data.variant()); +} + +std::unordered_map GetDimExprSubstitution( + pir::ShapeConstraintIRAnalysis* shape_analysis) { + const std::vector& dim_expr_constraints = + 
      shape_analysis->CreateDimExprBuilder().constraints();
+  const cinn::common::UnionFindSet& union_find_set = [&]() {
+    cinn::common::UnionFindSet union_find_set;
+    for (const auto& constraint : dim_expr_constraints) {
+      CHECK(std::holds_alternative>(constraint))
+          << "The DimExprConstraint type is not Equal; this part is to "
+             "be completed.";
+      const auto& data =
+          std::get>(constraint).data;
+      union_find_set.Union(data->lhs, data->rhs);
+    }
+    return union_find_set;
+  }();
+
+  const std::vector>& dim_expr_clusters =
+      union_find_set.Clusters();
+  std::unordered_map substitution_pattern;
+  for (const auto& dim_expr_cluster : dim_expr_clusters) {
+    CHECK(!dim_expr_cluster.empty());
+    auto dim_expr_root = dim_expr_cluster[0];
+    for (const auto& dim_expr : dim_expr_cluster) {
+      if (std::holds_alternative(dim_expr)) {
+        dim_expr_root = dim_expr;
+        break;
+      }
+    }
+    for (const auto& dim_expr : dim_expr_cluster) {
+      if (dim_expr != dim_expr_root) {
+        substitution_pattern[dim_expr] = dim_expr_root;
+      }
+    }
+  }
+  return substitution_pattern;
+}
+
+void SubstituteDimExprBasedOnConstraints(pir::ModuleOp module_op) {
+  VLOG(4) << "SubstituteDimExprBasedOnConstraints start";
+  pir::ShapeConstraintIRAnalysis shape_analysis =
+      pir::ShapeAnalysisManager::Instance().Get(module_op.program());
+  const std::unordered_map&
+      substitution_pattern = GetDimExprSubstitution(&shape_analysis);
+  VisitEachOp(module_op, [&](pir::Operation& op) {
+    VisitEachValue(op, [&](pir::Value value) {
+      if (!shape_analysis.HasShapeOrDataForValue(value)) {
+        VLOG(4) << "Can not find ShapeOrData for value of op(" << op.name()
+                << ") in shape_analysis";
+      } else {
+        const symbol::ShapeOrDataDimExprs& origin_shape_or_data =
+            shape_analysis.GetShapeOrDataForValue(value);
+        const symbol::ShapeOrDataDimExprs& substituted_shape_or_data =
+            SubstituteShapeOrData(origin_shape_or_data, substitution_pattern);
+        shape_analysis.SetShapeOrDataForValue(value, substituted_shape_or_data);
+      }
+    });
+    // TODO(JiaWenxuan): substitute the attribute "sym_shape_str" of the op
+  });
+  VLOG(4) << "SubstituteDimExprBasedOnConstraints end";
+}
+
+class SubstituteDimExprBasedOnConstraintsPass : public pir::Pass {
+ public:
+  SubstituteDimExprBasedOnConstraintsPass()
+      : pir::Pass("substitute_dim_expr_based_on_constraints_pass", 1) {}
+
+  void Run(pir::Operation* op) override {
+    pir::ModuleOp module_op = op->dyn_cast();
+    SubstituteDimExprBasedOnConstraints(module_op);
+  }
+
+  bool CanApplyOn(pir::Operation* op) const override {
+    return op->isa() && op->num_regions() > 0;
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<::pir::Pass> CreateSubstituteDimExprBasedOnConstraintsPass() {
+  return std::make_unique();
+}
+
+}  // namespace ir
+}  // namespace dialect
+}  // namespace cinn

diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h
new file mode 100644
index 0000000000000..30c0dd7b6a7b6
--- /dev/null
+++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pir/include/pass/pass.h" + +namespace cinn { +namespace dialect { +namespace ir { + +// This is a helper pass for substituting DimExpr based on the +// constraints symbol::Equal. +std::unique_ptr<::pir::Pass> CreateSubstituteDimExprBasedOnConstraintsPass(); +} // namespace ir +} // namespace dialect +} // namespace cinn diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 99e1b624edefa..3a0de137173a7 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -95,6 +95,7 @@ #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/divide_group_op_to_fusion_op_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/move_generate_shape_ops_to_prologue_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/simplify_dim_expr_pass.h" +#include "paddle/cinn/hlir/dialect/operator/transforms/group_merge/substitute_dim_expr_based_on_constraints_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/insert_broadcast_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/lower_cinn_fusion_op_pass.h" #include "paddle/cinn/hlir/dialect/operator/transforms/pd_to_cinn_pass.h" @@ -1582,6 +1583,8 @@ void AddCinnPass(std::shared_ptr &pass_manager, // NOLINT if (has_dynamic_shape) { pass_manager->AddPass(pir::CreateShapeOptimizationPass()); pass_manager->AddPass(cinn::dialect::ir::CreateSimplifyDimExprPass()); + pass_manager->AddPass( + cinn::dialect::ir::CreateSubstituteDimExprBasedOnConstraintsPass()); pass_manager->AddPass(cinn::dialect::ir::CreateInsertBroadcastPass()); pass_manager->AddPass(pir::CreateShapeOptimizationPass()); pass_manager->AddPass( diff --git a/test/ir/pir/cinn/symbolic/test_substitute_dim_expr_based_on_constraint.py b/test/ir/pir/cinn/symbolic/test_substitute_dim_expr_based_on_constraint.py new file mode 100644 index 0000000000000..a9119455e94fd --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_substitute_dim_expr_based_on_constraint.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
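# --------------------------------------------------------------------------
# Illustrative aside (not part of the patch): a minimal Python sketch of the
# substitution scheme GetDimExprSubstitution implements above -- union every
# pair of dim exprs related by an Equal constraint, then map each member of
# a cluster onto one representative, preferring a named symbol as the root,
# mirroring the C++ UnionFindSet. The constraint values below are made up.

class UnionFindSet:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        self.parent.setdefault(x, x)
        if self.parent[x] != x:
            self.parent[x] = self.find(self.parent[x])  # path compression
        return self.parent[x]

    def union(self, p, q):
        self.parent[self.find(q)] = self.find(p)

    def clusters(self):
        groups = {}
        for x in list(self.parent):
            groups.setdefault(self.find(x), []).append(x)
        return list(groups.values())

uf = UnionFindSet()
for lhs, rhs in [("S0", "S1"), ("S1", "S2")]:  # e.g. S0 == S1 and S1 == S2
    uf.union(lhs, rhs)

substitution = {}
for cluster in uf.clusters():
    root = next((e for e in cluster if isinstance(e, str)), cluster[0])
    substitution.update({e: root for e in cluster if e != root})
print(substitution)  # {'S1': 'S0', 'S2': 'S0'}: all three collapse to S0
# --------------------------------------------------------------------------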
+ +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) +import utils + + +class TestSubstituteDimExprNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, y1, y2): + z1 = paddle.concat([y1, x], 0) + z2 = paddle.concat([y1, y2], 0) + out = z1 + z2 + return out + + +class TestSubstituteDimExprBasedOnConstraint(unittest.TestCase): + """ + Test Pir API + @to_static + CINN. + """ + + def setUp(self): + paddle.seed(2022) + self.prepare_data() + + def prepare_data(self): + self.shapex = [32, 128] + self.x = paddle.randn(self.shapex, dtype="float32") + self.x.stop_gradient = False + self.shapey = [32, 128] + self.y1 = paddle.randn(self.shapey, dtype="float32") + self.y1.stop_gradient = False + self.y2 = paddle.randn(self.shapey, dtype="float32") + self.y2.stop_gradient = False + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = TestSubstituteDimExprNet() + input_spec = [ + InputSpec(shape=[32, 128], dtype="float32"), + InputSpec(shape=[32, None], dtype="float32"), + InputSpec(shape=[32, None], dtype="float32"), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x, self.y1, self.y2) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_out = self.eval(use_cinn=False) + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() From 7b756babae75c51f6d3b479b53c1f0e4169e11c6 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 12:03:49 +0800 Subject: [PATCH 14/82] Update operator.cc (#61999) --- paddle/fluid/framework/operator.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 22c9532880e48..c9d7af6a44cea 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2936,9 +2936,9 @@ void OperatorWithKernel::ParseMultiInputDataType( proto::VarType::Type OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { - proto::VarType::Type dafault_data_type = + proto::VarType::Type default_data_type = static_cast(-1); - proto::VarType::Type data_type = dafault_data_type; + proto::VarType::Type data_type = default_data_type; for (auto* name : ctx.InNameList()) { if (ctx.InputSize(*name) == 1UL) { @@ -2949,7 +2949,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } PADDLE_ENFORCE_NE( data_type, - dafault_data_type, + default_data_type, platform::errors::NotFound( "DataType should be indicated by input Variable at %s.", Type())); return data_type; @@ -2957,9 +2957,9 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( proto::VarType::Type OperatorWithKernel::IndicateVarDataType( const ExecutionContext& ctx, const std::string& name) const { - proto::VarType::Type dafault_data_type = + proto::VarType::Type default_data_type = static_cast(-1); - proto::VarType::Type data_type = dafault_data_type; + proto::VarType::Type data_type = default_data_type; if (ctx.InputSize(name) == 1UL) { ParseInputDataType(ctx.InputVar(name), name, &data_type); } else { @@ -2967,7 +2967,7 @@ proto::VarType::Type 
OperatorWithKernel::IndicateVarDataType( } PADDLE_ENFORCE_NE( data_type, - dafault_data_type, + default_data_type, platform::errors::InvalidArgument( "The Input Variable(%s) of (%s) Operator used to determine kernel " "data type is empty or not phi::DenseTensor or SelectedRows or " From 9d10d5ce2b07791a33b79585c4f593fd6814e67e Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 12:07:40 +0800 Subject: [PATCH 15/82] Fix EmbeddingInferSpmdUnspportVocabParallel(#61928) --- paddle/phi/api/yaml/legacy_ops.yaml | 2 +- paddle/phi/infermeta/spmd_rules/embedding.cc | 8 ++++---- paddle/phi/infermeta/spmd_rules/embedding.h | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 39d1fda93c48b..9b1d862180903 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -384,7 +384,7 @@ infer_meta : func : EmbeddingInferMeta param : [x, weight, padding_idx] - spmd_rule: EmbeddingInferSpmdUnspportVocabParallel + spmd_rule: EmbeddingInferSpmdUnsupportVocabParallel kernel : func : embedding {dense, dense -> dense} sparse_weight_embedding {dense, selected_rows -> dense} diff --git a/paddle/phi/infermeta/spmd_rules/embedding.cc b/paddle/phi/infermeta/spmd_rules/embedding.cc index 8b8a309f66292..27284ae1a82e0 100644 --- a/paddle/phi/infermeta/spmd_rules/embedding.cc +++ b/paddle/phi/infermeta/spmd_rules/embedding.cc @@ -28,10 +28,10 @@ namespace distributed { using phi::distributed::auto_parallel::str_join; -SpmdInfo EmbeddingInferSpmdUnspportVocabParallel(const DistMetaTensor& x, - const DistMetaTensor& weight, - int padding_idx, - bool sparse) { +SpmdInfo EmbeddingInferSpmdUnsupportVocabParallel(const DistMetaTensor& x, + const DistMetaTensor& weight, + int padding_idx, + bool sparse) { DistMetaTensor w(weight.dims(), weight.dist_attr()); if (weight.dist_attr().dims_mapping()[0] >= 0) { auto w_dims_mapping = weight.dist_attr().dims_mapping(); diff --git a/paddle/phi/infermeta/spmd_rules/embedding.h b/paddle/phi/infermeta/spmd_rules/embedding.h index cc168c6fca86e..47b56048892f0 100644 --- a/paddle/phi/infermeta/spmd_rules/embedding.h +++ b/paddle/phi/infermeta/spmd_rules/embedding.h @@ -39,10 +39,10 @@ SpmdInfo EmbeddingInferSpmd(const DistMetaTensor& x, /// is used in static graph, but `embedding` used in egaer graph is not /// supported. So we need two propagation rules for `c_embedding` and /// `embedding`. 
-SpmdInfo EmbeddingInferSpmdUnspportVocabParallel(const DistMetaTensor& x, - const DistMetaTensor& weight, - int padding_idx, - bool sparse = false); +SpmdInfo EmbeddingInferSpmdUnsupportVocabParallel(const DistMetaTensor& x, + const DistMetaTensor& weight, + int padding_idx, + bool sparse = false); SpmdInfo EmbeddingInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& weight, From c4dbcc81fc44b6d8169676c527e8984ec7dbcdc6 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 12:08:58 +0800 Subject: [PATCH 16/82] Fix grad node interface name (#61958) * Fix * ci * ci --- .../eager_manual/forwards/add_n_fwd_func.cc | 2 +- .../forwards/conv2d_fwd_function.cc | 16 +- .../forwards/multiply_fwd_func.cc | 28 ++-- .../eager_manual/forwards/reshard_fwd_func.cc | 2 +- .../forwards/sync_batch_norm_fwd_func.cc | 48 +++--- .../manual/eager_manual/nodes/conv2d_nodes.cc | 18 +- .../eager_manual/nodes/multiply_node.cc | 8 +- .../api/manual/eager_manual/nodes/nodes.h | 108 ++++++------ .../forwards/fused_attention_fwd_func.cc | 56 +++---- ...as_dropout_residual_layer_norm_fwd_func.cc | 19 ++- .../forwards/fused_feedforward_fwd_func.cc | 38 ++--- .../forwards/fused_gate_attention_fwd_func.cc | 40 ++--- .../forwards/fused_gemm_epilogue_fwd_func.cc | 4 +- .../api/manual/fluid_manual/nodes/nodes.h | 156 +++++++++--------- .../auto_code_generator/eager_generator.cc | 4 +- .../generator/eager_gen.py | 26 +-- paddle/fluid/pybind/eager_method.cc | 14 +- 17 files changed, 294 insertions(+), 293 deletions(-) diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc index add0359ccf25d..d27ca1d242953 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc @@ -94,7 +94,7 @@ paddle::Tensor add_n_ad_func(const std::vector& x) { // SetAttributes if needed // Set TensorWrappers for Forward Inputs if needed - grad_node->SetTensorWrapperx(x); + grad_node->SetTensorWrapper_x(x); // SetGradOutMeta & SetEdges grad_node->SetGradOutMeta(x, 0); // SetOutRank & SetHistory & SetGradInMeta & RetainGrad diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index 33e9393a615bc..7cf3ee807b685 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -146,15 +146,15 @@ paddle::Tensor conv2d_ad_func(const paddle::Tensor& input, } // SetAttributes if needed - grad_node->SetAttributestrides(strides); - grad_node->SetAttributepaddings(paddings); - grad_node->SetAttributepadding_algorithm(padding_algorithm); - grad_node->SetAttributegroups(groups); - grad_node->SetAttributedilations(dilations); - grad_node->SetAttributedata_format(data_format); + grad_node->SetAttribute_strides(strides); + grad_node->SetAttribute_paddings(paddings); + grad_node->SetAttribute_padding_algorithm(padding_algorithm); + grad_node->SetAttribute_groups(groups); + grad_node->SetAttribute_dilations(dilations); + grad_node->SetAttribute_data_format(data_format); // Set TensorWrappers for Forward Inputs if needed - grad_node->SetTensorWrapperinput(input); - grad_node->SetTensorWrapperfilter(filter); + grad_node->SetTensorWrapper_input(input); + grad_node->SetTensorWrapper_filter(filter); // SetGradOutMeta & SetEdges 
        grad_node->SetGradOutMeta(input, 0);
        grad_node->SetGradOutMeta(filter, 1);
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc
index 18e36264ebe6b..856407c58e96c 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc
@@ -157,25 +157,25 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x,
       grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack());
     }
     // SetAttributes if needed
-    grad_node->SetAttributeaxis(-1);
+    grad_node->SetAttribute_axis(-1);
     if (paddle::platform::is_gpu_place(x.place())) {
       if (x_autograd_meta != nullptr && x_autograd_meta->StopGradient() &&
           y_autograd_meta != nullptr && !y_autograd_meta->StopGradient()) {
-        grad_node->SetTensorWrapperx(x);
-        grad_node->SetTensorWrapperNoNeedBuffery(y);
+        grad_node->SetTensorWrapper_x(x);
+        grad_node->SetTensorWrapperNoNeedBuffer_y(y);
       } else if (x_autograd_meta != nullptr &&
                  !x_autograd_meta->StopGradient() &&
                  y_autograd_meta != nullptr &&
                  y_autograd_meta->StopGradient()) {
-        grad_node->SetTensorWrapperNoNeedBufferx(x);
-        grad_node->SetTensorWrappery(y);
+        grad_node->SetTensorWrapperNoNeedBuffer_x(x);
+        grad_node->SetTensorWrapper_y(y);
       } else {
-        grad_node->SetTensorWrapperx(x);
-        grad_node->SetTensorWrappery(y);
+        grad_node->SetTensorWrapper_x(x);
+        grad_node->SetTensorWrapper_y(y);
       }
     } else {
-      grad_node->SetTensorWrapperx(x);
-      grad_node->SetTensorWrappery(y);
+      grad_node->SetTensorWrapper_x(x);
+      grad_node->SetTensorWrapper_y(y);
     }
     // SetGradOutMeta & SetEdges
     grad_node->SetGradOutMeta(x, 0);
@@ -300,11 +300,11 @@ paddle::Tensor& multiply__ad_func(paddle::Tensor& x,  // NOLINT
       grad_node->SetForwardTrace(egr::Controller::Instance().GetPythonStack());
     }
     // SetAttributes if needed
-    grad_node->SetAttributeaxis(-1);
+    grad_node->SetAttribute_axis(-1);
     // Set TensorWrappers for Forward Inputs if needed
     auto x_clone = paddle::experimental::assign(x);
-    grad_node->SetTensorWrapperx(x_clone);
-    grad_node->SetTensorWrappery(y);
+    grad_node->SetTensorWrapper_x(x_clone);
+    grad_node->SetTensorWrapper_y(y);
   }
   // Forward API Call
@@ -505,8 +505,8 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x,
     // SetAttributes if needed
     // Set TensorWrappers for Forward Inputs if needed
-    grad_node->SetTensorWrapperx(x);
-    grad_node->SetTensorWrappery(y);
+    grad_node->SetTensorWrapper_x(x);
+    grad_node->SetTensorWrapper_y(y);
     // SetGradOutMeta & SetEdges
     grad_node->SetGradOutMeta(x, 0);
     grad_node->SetGradOutMeta(y, 1);
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc
index 5ee5d74094538..b227e2a06e68d 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/reshard_fwd_func.cc
@@ -50,7 +50,7 @@ paddle::Tensor reshard_ad_function(
         std::shared_ptr<ReshardGradNode>(new ReshardGradNode(1, 1));  // NOLINT
     // Set TensorWrappers for Forward Inputs if needed
-    grad_node->SetTensorWrapperNoNeedBufferInput(input);
+    grad_node->SetTensorWrapperNoNeedBuffer_Input(input);
   }
   // Forward API Call
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc
index 654ab2bfd73db..c4e007801c66c 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc
@@ -235,16 +235,16 @@ sync_batch_norm__ad_func(const paddle::Tensor& x,
     egr::Controller::Instance().PushBackForceSequentialNodes(grad_node.get());
     // SetAttributes if needed
-    grad_node->SetAttributemomentum(momentum);
-    grad_node->SetAttributeepsilon(epsilon);
-    grad_node->SetAttributedata_layout(data_layout);
-    grad_node->SetAttributeis_test(is_test);
-    grad_node->SetAttributeuse_global_stats(use_global_stats);
-    grad_node->SetAttributetrainable_statistics(trainable_statistics);
+    grad_node->SetAttribute_momentum(momentum);
+    grad_node->SetAttribute_epsilon(epsilon);
+    grad_node->SetAttribute_data_layout(data_layout);
+    grad_node->SetAttribute_is_test(is_test);
+    grad_node->SetAttribute_use_global_stats(use_global_stats);
+    grad_node->SetAttribute_trainable_statistics(trainable_statistics);
     // Set TensorWrappers for Forward Inputs if needed
-    grad_node->SetTensorWrapperx(x);
-    grad_node->SetTensorWrapperscale(scale);
-    grad_node->SetTensorWrapperbias(bias);
+    grad_node->SetTensorWrapper_x(x);
+    grad_node->SetTensorWrapper_scale(scale);
+    grad_node->SetTensorWrapper_bias(bias);
     // SetGradOutMeta & SetEdges
     grad_node->SetGradOutMeta(x, 0);
     grad_node->SetGradOutMeta(scale, 3);
@@ -293,9 +293,9 @@ sync_batch_norm__ad_func(const paddle::Tensor& x,
     grad_node->SetGradInMeta(saved_variance, 4);
     grad_node->SetGradInMeta(reserve_space, 5);
     // Set TensorWrappers for Forward Outputs if needed
-    grad_node->SetTensorWrappersaved_mean(saved_mean);
-    grad_node->SetTensorWrappersaved_variance(saved_variance);
-    grad_node->SetTensorWrapperreserve_space(reserve_space);
+    grad_node->SetTensorWrapper_saved_mean(saved_mean);
+    grad_node->SetTensorWrapper_saved_variance(saved_variance);
+    grad_node->SetTensorWrapper_reserve_space(reserve_space);
   }
   VLOG(4) << "Finish AD API: sync_batch_norm_";
@@ -571,16 +571,16 @@ sync_batch_norm__ad_func(const paddle::Tensor& x,
         new SyncBatchNormGradNode(6, 5));
     egr::Controller::Instance().PushBackForceSequentialNodes(grad_node.get());
     // SetAttributes if needed
-    grad_node->SetAttributemomentum(momentum);
-    grad_node->SetAttributeepsilon(epsilon);
-    grad_node->SetAttributedata_layout(data_layout);
-    grad_node->SetAttributeis_test(is_test);
-    grad_node->SetAttributeuse_global_stats(use_global_stats);
-    grad_node->SetAttributetrainable_statistics(trainable_statistics);
+    grad_node->SetAttribute_momentum(momentum);
+    grad_node->SetAttribute_epsilon(epsilon);
+    grad_node->SetAttribute_data_layout(data_layout);
+    grad_node->SetAttribute_is_test(is_test);
+    grad_node->SetAttribute_use_global_stats(use_global_stats);
+    grad_node->SetAttribute_trainable_statistics(trainable_statistics);
     // Set TensorWrappers for Forward Inputs if needed
-    grad_node->SetTensorWrapperx(x);
-    grad_node->SetTensorWrapperscale(scale);
-    grad_node->SetTensorWrapperbias(bias);
+    grad_node->SetTensorWrapper_x(x);
+    grad_node->SetTensorWrapper_scale(scale);
+    grad_node->SetTensorWrapper_bias(bias);
     // SetGradOutMeta & SetEdges
     grad_node->SetGradOutMeta(x, 0);
     grad_node->SetGradOutMeta(scale, 3);
@@ -629,9 +629,9 @@ sync_batch_norm__ad_func(const paddle::Tensor& x,
     grad_node->SetGradInMeta(saved_variance, 4);
     grad_node->SetGradInMeta(reserve_space, 5);
     // Set TensorWrappers for Forward Outputs if needed
-    grad_node->SetTensorWrappersaved_mean(saved_mean);
-    grad_node->SetTensorWrappersaved_variance(saved_variance);
-    grad_node->SetTensorWrapperreserve_space(reserve_space);
+    grad_node->SetTensorWrapper_saved_mean(saved_mean);
+    grad_node->SetTensorWrapper_saved_variance(saved_variance);
+    grad_node->SetTensorWrapper_reserve_space(reserve_space);
   }
   VLOG(4) << "Finish AD API: sync_batch_norm_";
diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
index e1bcc3bc73731..437cce80c919b 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc
@@ -126,16 +126,16 @@ Conv2dGradNodeFinal::operator()(
     auto grad_node = std::shared_ptr<Conv2dDoubleGradNodeFinal>(  // NOLINT
         new Conv2dDoubleGradNodeFinal(2, 3));
     // SetAttributes if needed
-    grad_node->SetAttributestrides(strides);
-    grad_node->SetAttributepaddings(paddings);
-    grad_node->SetAttributepadding_algorithm(padding_algorithm);
-    grad_node->SetAttributegroups(groups);
-    grad_node->SetAttributedilations(dilations);
-    grad_node->SetAttributedata_format(data_format);
+    grad_node->SetAttribute_strides(strides);
+    grad_node->SetAttribute_paddings(paddings);
+    grad_node->SetAttribute_padding_algorithm(padding_algorithm);
+    grad_node->SetAttribute_groups(groups);
+    grad_node->SetAttribute_dilations(dilations);
+    grad_node->SetAttribute_data_format(data_format);
     // Set TensorWrappers for Forward Inputs if needed
-    grad_node->SetTensorWrapperinput(input);
-    grad_node->SetTensorWrapperfilter(filter);
-    grad_node->SetTensorWrappergrad_out(grad_out);
+    grad_node->SetTensorWrapper_input(input);
+    grad_node->SetTensorWrapper_filter(filter);
+    grad_node->SetTensorWrapper_grad_out(grad_out);
     // SetGradOutMeta & SetEdges
     if (grad_filter_autograd_meta) {
       grad_node->SetGradOutMeta(input, 0);
diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc
index b3e38e066300d..56c1f1e61a7fc 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/multiply_node.cc
@@ -167,11 +167,11 @@ MultiplyGradNode::operator()(
     auto grad_node = std::shared_ptr<MultiplyDoubleGradNode>(  // NOLINT
         new MultiplyDoubleGradNode(2, 3));
     // SetAttributes if needed
-    grad_node->SetAttributeaxis(axis);
+    grad_node->SetAttribute_axis(axis);
     // Set TensorWrappers for Forward Inputs if needed
-    grad_node->SetTensorWrapperx(x);
-    grad_node->SetTensorWrappery(y);
-    grad_node->SetTensorWrappergrad_out(grad_out);
+    grad_node->SetTensorWrapper_x(x);
+    grad_node->SetTensorWrapper_y(y);
+    grad_node->SetTensorWrapper_grad_out(grad_out);
     // SetGradOutMeta & SetEdges
     grad_node->SetGradOutMeta(x, 0);
     grad_node->SetGradOutMeta(y, 1);
diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h
index bc6d1d9f1a1b6..12274670827f6 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h
@@ -48,28 +48,28 @@ class Conv2dGradNodeFinal : public egr::GradNodeBase {
   }
   // SetTensorWrapperX, SetTensorWrapperY, ...
-  void SetTensorWrapperinput(const paddle::Tensor& input) {
+  void SetTensorWrapper_input(const paddle::Tensor& input) {
     input_ = egr::TensorWrapper(input, false);
   }
-  void SetTensorWrapperfilter(const paddle::Tensor& filter) {
+  void SetTensorWrapper_filter(const paddle::Tensor& filter) {
     filter_ = egr::TensorWrapper(filter, false);
   }
   // SetAttributes
-  void SetAttributestrides(const std::vector<int>& strides) {
+  void SetAttribute_strides(const std::vector<int>& strides) {
     strides_ = strides;
   }
-  void SetAttributepaddings(const std::vector<int>& paddings) {
+  void SetAttribute_paddings(const std::vector<int>& paddings) {
     paddings_ = paddings;
   }
-  void SetAttributepadding_algorithm(const std::string& padding_algorithm) {
+  void SetAttribute_padding_algorithm(const std::string& padding_algorithm) {
     padding_algorithm_ = padding_algorithm;
   }
-  void SetAttributegroups(const int& groups) { groups_ = groups; }
-  void SetAttributedilations(const std::vector<int>& dilations) {
+  void SetAttribute_groups(const int& groups) { groups_ = groups; }
+  void SetAttribute_dilations(const std::vector<int>& dilations) {
     dilations_ = dilations;
   }
-  void SetAttributedata_format(const std::string& data_format) {
+  void SetAttribute_data_format(const std::string& data_format) {
     data_format_ = data_format;
   }
@@ -117,31 +117,31 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase {
   }
   // SetTensorWrapperX, SetTensorWrapperY, ...
-  void SetTensorWrapperinput(const paddle::Tensor& input) {
+  void SetTensorWrapper_input(const paddle::Tensor& input) {
     input_ = egr::TensorWrapper(input, false);
   }
-  void SetTensorWrapperfilter(const paddle::Tensor& filter) {
+  void SetTensorWrapper_filter(const paddle::Tensor& filter) {
     filter_ = egr::TensorWrapper(filter, false);
   }
-  void SetTensorWrappergrad_out(const paddle::Tensor& grad_out) {
+  void SetTensorWrapper_grad_out(const paddle::Tensor& grad_out) {
    grad_out_ = egr::TensorWrapper(grad_out, false);
   }
   // SetAttributes
-  void SetAttributestrides(const std::vector<int>& strides) {
+  void SetAttribute_strides(const std::vector<int>& strides) {
     strides_ = strides;
   }
-  void SetAttributepaddings(const std::vector<int>& paddings) {
+  void SetAttribute_paddings(const std::vector<int>& paddings) {
     paddings_ = paddings;
   }
-  void SetAttributepadding_algorithm(const std::string& padding_algorithm) {
+  void SetAttribute_padding_algorithm(const std::string& padding_algorithm) {
     padding_algorithm_ = padding_algorithm;
   }
-  void SetAttributegroups(const int& groups) { groups_ = groups; }
-  void SetAttributedilations(const std::vector<int>& dilations) {
+  void SetAttribute_groups(const int& groups) { groups_ = groups; }
+  void SetAttribute_dilations(const std::vector<int>& dilations) {
     dilations_ = dilations;
   }
-  void SetAttributedata_format(const std::string& data_format) {
+  void SetAttribute_data_format(const std::string& data_format) {
     data_format_ = data_format;
   }
@@ -190,7 +190,7 @@ class AddNGradNodeFinal : public egr::GradNodeBase {
   }
   // SetTensorWrapperX, SetTensorWrapperY, ...
-  void SetTensorWrapperx(const std::vector<paddle::Tensor>& x) {
+  void SetTensorWrapper_x(const std::vector<paddle::Tensor>& x) {
     for (const auto& eager_tensor : x) {
       x_.emplace_back(egr::TensorWrapper(eager_tensor, true));
     }
@@ -233,22 +233,22 @@ class MultiplyGradNode : public egr::GradNodeBase {
   }
   // SetTensorWrapperX, SetTensorWrapperY, ...
- void SetTensorWrapperx(const paddle::Tensor& x) { + void SetTensorWrapper_x(const paddle::Tensor& x) { x_ = egr::TensorWrapper(x, false); } - void SetTensorWrappery(const paddle::Tensor& y) { + void SetTensorWrapper_y(const paddle::Tensor& y) { y_ = egr::TensorWrapper(y, false); } - void SetTensorWrapperNoNeedBufferx(const paddle::Tensor& x) { + void SetTensorWrapperNoNeedBuffer_x(const paddle::Tensor& x) { x_ = egr::TensorWrapper(x, true); } - void SetTensorWrapperNoNeedBuffery(const paddle::Tensor& y) { + void SetTensorWrapperNoNeedBuffer_y(const paddle::Tensor& y) { y_ = egr::TensorWrapper(y, true); } // SetAttributes - void SetAttributeaxis(const int& axis) { axis_ = axis; } + void SetAttribute_axis(const int& axis) { axis_ = axis; } private: // TensorWrappers @@ -289,18 +289,18 @@ class MultiplyDoubleGradNode : public egr::GradNodeBase { } // SetTensorWrapperX, SetTensorWrapperY, ... - void SetTensorWrapperx(const paddle::Tensor& x) { + void SetTensorWrapper_x(const paddle::Tensor& x) { x_ = egr::TensorWrapper(x, false); } - void SetTensorWrappery(const paddle::Tensor& y) { + void SetTensorWrapper_y(const paddle::Tensor& y) { y_ = egr::TensorWrapper(y, false); } - void SetTensorWrappergrad_out(const paddle::Tensor& grad_out) { + void SetTensorWrapper_grad_out(const paddle::Tensor& grad_out) { grad_out_ = egr::TensorWrapper(grad_out, false); } // SetAttributes - void SetAttributeaxis(const int& axis) { axis_ = axis; } + void SetAttribute_axis(const int& axis) { axis_ = axis; } private: // TensorWrappers @@ -345,36 +345,36 @@ class SyncBatchNormGradNode : public egr::GradNodeBase { } // SetTensorWrapperX, SetTensorWrapperY, ... - void SetTensorWrapperx(const paddle::Tensor& x) { + void SetTensorWrapper_x(const paddle::Tensor& x) { x_ = egr::TensorWrapper(x, false); } - void SetTensorWrapperscale(const paddle::Tensor& scale) { + void SetTensorWrapper_scale(const paddle::Tensor& scale) { scale_ = egr::TensorWrapper(scale, false); } - void SetTensorWrapperbias(const paddle::Tensor& bias) { + void SetTensorWrapper_bias(const paddle::Tensor& bias) { bias_ = egr::TensorWrapper(bias, false); } - void SetTensorWrappersaved_mean(const paddle::Tensor& saved_mean) { + void SetTensorWrapper_saved_mean(const paddle::Tensor& saved_mean) { saved_mean_ = egr::TensorWrapper(saved_mean, false); } - void SetTensorWrappersaved_variance(const paddle::Tensor& saved_variance) { + void SetTensorWrapper_saved_variance(const paddle::Tensor& saved_variance) { saved_variance_ = egr::TensorWrapper(saved_variance, false); } - void SetTensorWrapperreserve_space(const paddle::Tensor& reserve_space) { + void SetTensorWrapper_reserve_space(const paddle::Tensor& reserve_space) { reserve_space_ = egr::TensorWrapper(reserve_space, false); } // SetAttributes - void SetAttributemomentum(const float& momentum) { momentum_ = momentum; } - void SetAttributeepsilon(const float& epsilon) { epsilon_ = epsilon; } - void SetAttributedata_layout(const std::string& data_layout) { + void SetAttribute_momentum(const float& momentum) { momentum_ = momentum; } + void SetAttribute_epsilon(const float& epsilon) { epsilon_ = epsilon; } + void SetAttribute_data_layout(const std::string& data_layout) { data_layout_ = data_layout; } - void SetAttributeis_test(const bool& is_test) { is_test_ = is_test; } - void SetAttributeuse_global_stats(const bool& use_global_stats) { + void SetAttribute_is_test(const bool& is_test) { is_test_ = is_test; } + void SetAttribute_use_global_stats(const bool& use_global_stats) { use_global_stats_ = 
use_global_stats; } - void SetAttributetrainable_statistics(const bool& trainable_statistics) { + void SetAttribute_trainable_statistics(const bool& trainable_statistics) { trainable_statistics_ = trainable_statistics; } @@ -434,7 +434,7 @@ class ReshardGradNode : public egr::GradNodeBase { // SetTensorWrapperX // Only input's meta is needed. - void SetTensorWrapperNoNeedBufferInput(const paddle::Tensor& input) { + void SetTensorWrapperNoNeedBuffer_Input(const paddle::Tensor& input) { input_ = egr::TensorWrapper(input, true); } @@ -477,36 +477,36 @@ class SyncBatchNormGradNode : public egr::GradNodeBase { } // SetTensorWrapperX, SetTensorWrapperY, ... - void SetTensorWrapperx(const paddle::Tensor& x) { + void SetTensorWrapper_x(const paddle::Tensor& x) { x_ = egr::TensorWrapper(x, false); } - void SetTensorWrapperscale(const paddle::Tensor& scale) { + void SetTensorWrapper_scale(const paddle::Tensor& scale) { scale_ = egr::TensorWrapper(scale, false); } - void SetTensorWrapperbias(const paddle::Tensor& bias) { + void SetTensorWrapper_bias(const paddle::Tensor& bias) { bias_ = egr::TensorWrapper(bias, false); } - void SetTensorWrappersaved_mean(const paddle::Tensor& saved_mean) { + void SetTensorWrapper_saved_mean(const paddle::Tensor& saved_mean) { saved_mean_ = egr::TensorWrapper(saved_mean, false); } - void SetTensorWrappersaved_variance(const paddle::Tensor& saved_variance) { + void SetTensorWrapper_saved_variance(const paddle::Tensor& saved_variance) { saved_variance_ = egr::TensorWrapper(saved_variance, false); } - void SetTensorWrapperreserve_space(const paddle::Tensor& reserve_space) { + void SetTensorWrapper_reserve_space(const paddle::Tensor& reserve_space) { reserve_space_ = egr::TensorWrapper(reserve_space, false); } // SetAttributes - void SetAttributemomentum(const float& momentum) { momentum_ = momentum; } - void SetAttributeepsilon(const float& epsilon) { epsilon_ = epsilon; } - void SetAttributedata_layout(const std::string& data_layout) { + void SetAttribute_momentum(const float& momentum) { momentum_ = momentum; } + void SetAttribute_epsilon(const float& epsilon) { epsilon_ = epsilon; } + void SetAttribute_data_layout(const std::string& data_layout) { data_layout_ = data_layout; } - void SetAttributeis_test(const bool& is_test) { is_test_ = is_test; } - void SetAttributeuse_global_stats(const bool& use_global_stats) { + void SetAttribute_is_test(const bool& is_test) { is_test_ = is_test; } + void SetAttribute_use_global_stats(const bool& use_global_stats) { use_global_stats_ = use_global_stats; } - void SetAttributetrainable_statistics(const bool& trainable_statistics) { + void SetAttribute_trainable_statistics(const bool& trainable_statistics) { trainable_statistics_ = trainable_statistics; } @@ -557,10 +557,10 @@ class MultiplyGradNode : public egr::GradNodeBase { } // SetTensorWrapperX, SetTensorWrapperY, ... 
-  void SetTensorWrapperx(const paddle::Tensor& x) {
+  void SetTensorWrapper_x(const paddle::Tensor& x) {
     x_ = egr::TensorWrapper(x, false);
   }
-  void SetTensorWrappery(const paddle::Tensor& y) {
+  void SetTensorWrapper_y(const paddle::Tensor& y) {
     y_ = egr::TensorWrapper(y, false);
   }
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
index f3612c2830dd0..6130b79059f65 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
+++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
@@ -403,27 +403,27 @@ fused_attention_dygraph_function(
     grad_node->SetAttrMap(std::move(attrs));
     grad_node->SetDefaultAttrMap(std::move(default_attrs));
-    grad_node->SetTensorWrapperX(X);
-    grad_node->SetTensorWrapperQKVW(QKVW);
-    grad_node->SetTensorWrapperOutLinearW(OutLinearW);
-    grad_node->SetTensorWrapperQKVOut(QKVOut);
-    grad_node->SetTensorWrapperTransposeOut2(TransposeOut2);
-    grad_node->SetTensorWrapperQKOut(QKOut);
-    grad_node->SetTensorWrapperQKTVOut(QKTVOut);
-    grad_node->SetTensorWrapperSoftmaxOut(SoftmaxOut);
-    grad_node->SetTensorWrapperAttnDropoutMaskOut(AttnDropoutMaskOut);
-    grad_node->SetTensorWrapperAttnDropoutOut(AttnDropoutOut);
-    grad_node->SetTensorWrapperFMHAOut(FMHAOut);
-    grad_node->SetTensorWrapperOutLinearOut(OutLinearOut);
-    grad_node->SetTensorWrapperDropoutMaskOut(DropoutMaskOut);
+    grad_node->SetTensorWrapper_X(X);
+    grad_node->SetTensorWrapper_QKVW(QKVW);
+    grad_node->SetTensorWrapper_OutLinearW(OutLinearW);
+    grad_node->SetTensorWrapper_QKVOut(QKVOut);
+    grad_node->SetTensorWrapper_TransposeOut2(TransposeOut2);
+    grad_node->SetTensorWrapper_QKOut(QKOut);
+    grad_node->SetTensorWrapper_QKTVOut(QKTVOut);
+    grad_node->SetTensorWrapper_SoftmaxOut(SoftmaxOut);
+    grad_node->SetTensorWrapper_AttnDropoutMaskOut(AttnDropoutMaskOut);
+    grad_node->SetTensorWrapper_AttnDropoutOut(AttnDropoutOut);
+    grad_node->SetTensorWrapper_FMHAOut(FMHAOut);
+    grad_node->SetTensorWrapper_OutLinearOut(OutLinearOut);
+    grad_node->SetTensorWrapper_DropoutMaskOut(DropoutMaskOut);
     grad_node->SetGradOutMeta(X, 0);
     grad_node->SetGradOutMeta(QKVW, 3);
     grad_node->SetGradOutMeta(OutLinearW, 7);
     if (QKVBias.initialized()) {
-      grad_node->SetTensorWrapperQKVBias(QKVBias);
-      grad_node->SetTensorWrapperQKVBiasOut(QKVBiasOut);
+      grad_node->SetTensorWrapper_QKVBias(QKVBias);
+      grad_node->SetTensorWrapper_QKVBiasOut(QKVBiasOut);
       grad_node->SetGradOutMeta(QKVBias, 4);
       auto QKVBiasOut_accumulation_node =
           std::make_shared<egr::GradNodeAccumulation>(p_autograd_QKVBiasOut);
@@ -436,8 +436,8 @@ fused_attention_dygraph_function(
     }
     if (SrcMask.initialized()) {
-      grad_node->SetTensorWrapperSrcMask(SrcMask);
-      grad_node->SetTensorWrapperSrcMaskOut(SrcMaskOut);
+      grad_node->SetTensorWrapper_SrcMask(SrcMask);
+      grad_node->SetTensorWrapper_SrcMaskOut(SrcMaskOut);
       auto SrcMaskOut_accumulation_node =
           std::make_shared<egr::GradNodeAccumulation>(p_autograd_SrcMaskOut);
@@ -449,21 +449,21 @@ fused_attention_dygraph_function(
     }
     if (OutLinearBias.initialized()) {
-      grad_node->SetTensorWrapperOutLinearBias(OutLinearBias);
+      grad_node->SetTensorWrapper_OutLinearBias(OutLinearBias);
       grad_node->SetGradOutMeta(OutLinearBias, 8);
     }
     if (pre_layer_norm) {
       if (LnScale.initialized()) {
-        grad_node->SetTensorWrapperLnScale(LnScale);
+        grad_node->SetTensorWrapper_LnScale(LnScale);
         grad_node->SetGradOutMeta(LnScale, 1);
       }
       if (LnBias.initialized()) {
-        grad_node->SetTensorWrapperLnBias(LnBias);
+        grad_node->SetTensorWrapper_LnBias(LnBias);
        grad_node->SetGradOutMeta(LnBias, 2);
       }
       if (LnOut.initialized()) {
-        grad_node->SetTensorWrapperLnOut(LnOut);
+        grad_node->SetTensorWrapper_LnOut(LnOut);
         auto LnOut_accumulation_node =
             std::make_shared<egr::GradNodeAccumulation>(p_autograd_LnOut);
@@ -474,24 +474,24 @@
         grad_node->SetGradOutMeta(LnOut, 13);
       }
       if (LnMean.initialized()) {
-        grad_node->SetTensorWrapperLnMean(LnMean);
+        grad_node->SetTensorWrapper_LnMean(LnMean);
       }
       if (LnVariance.initialized()) {
-        grad_node->SetTensorWrapperLnVariance(LnVariance);
+        grad_node->SetTensorWrapper_LnVariance(LnVariance);
       }
     } else {
       if (Ln2Scale.initialized()) {
-        grad_node->SetTensorWrapperLn2Scale(Ln2Scale);
+        grad_node->SetTensorWrapper_Ln2Scale(Ln2Scale);
         grad_node->SetGradOutMeta(Ln2Scale, 9);
       }
       if (Ln2Bias.initialized()) {
-        grad_node->SetTensorWrapperLn2Bias(Ln2Bias);
+        grad_node->SetTensorWrapper_Ln2Bias(Ln2Bias);
         grad_node->SetGradOutMeta(Ln2Bias, 10);
       }
-      grad_node->SetTensorWrapperBiasDropoutResidualOut(
+      grad_node->SetTensorWrapper_BiasDropoutResidualOut(
           BiasDropoutResidualOut);
-      grad_node->SetTensorWrapperLn2Mean(Ln2Mean);
-      grad_node->SetTensorWrapperLn2Variance(Ln2Variance);
+      grad_node->SetTensorWrapper_Ln2Mean(Ln2Mean);
+      grad_node->SetTensorWrapper_Ln2Variance(Ln2Variance);
       auto BiasDropoutResidualOut_accumulation_node =
           std::make_shared<egr::GradNodeAccumulation>(
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc
index c76073ba0b574..b67d0b40b7d0d 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc
+++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_bias_dropout_residual_layer_norm_fwd_func.cc
@@ -193,15 +193,16 @@ fused_bias_dropout_residual_layer_norm_dygraph_function(
     grad_node->SetDefaultAttrMap(std::move(default_attrs));
     // Set Tensor Wrappers
-    grad_node->SetTensorWrapperBias(Bias);
-    grad_node->SetTensorWrapperBiasDropoutResidualOut(BiasDropoutResidualOut);
-    grad_node->SetTensorWrapperDropoutMaskOut(DropoutMaskOut);
-    grad_node->SetTensorWrapperLnBias(LnBias);
-    grad_node->SetTensorWrapperLnMean(LnMean);
-    grad_node->SetTensorWrapperLnScale(LnScale);
-    grad_node->SetTensorWrapperLnVariance(LnVariance);
-    grad_node->SetTensorWrapperResidual(Residual);
-    grad_node->SetTensorWrapperX(X);
+    grad_node->SetTensorWrapper_Bias(Bias);
+    grad_node->SetTensorWrapper_BiasDropoutResidualOut(
+        BiasDropoutResidualOut);
+    grad_node->SetTensorWrapper_DropoutMaskOut(DropoutMaskOut);
+    grad_node->SetTensorWrapper_LnBias(LnBias);
+    grad_node->SetTensorWrapper_LnMean(LnMean);
+    grad_node->SetTensorWrapper_LnScale(LnScale);
+    grad_node->SetTensorWrapper_LnVariance(LnVariance);
+    grad_node->SetTensorWrapper_Residual(Residual);
+    grad_node->SetTensorWrapper_X(X);
     grad_node->SetGradOutMeta(X, 0);
     grad_node->SetGradOutMeta(Residual, 1);
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc
index b2f5238c5be32..f3cfc39d17c7b 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc
+++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc
@@ -323,15 +323,15 @@ fused_feedforward_dygraph_function(
     grad_node->SetAttrMap(std::move(attrs));
     grad_node->SetDefaultAttrMap(std::move(default_attrs));
-    grad_node->SetTensorWrapperX(X);
-    grad_node->SetTensorWrapperLinear1Weight(Linear1Weight);
-    grad_node->SetTensorWrapperLinear1Bias(Linear1Bias);
-    grad_node->SetTensorWrapperLinear2Weight(Linear2Weight);
-    grad_node->SetTensorWrapperDropout1Mask(Dropout1Mask);
-    grad_node->SetTensorWrapperDropout2Mask(Dropout2Mask);
-    grad_node->SetTensorWrapperLinear1Out(Linear1Out);
-    grad_node->SetTensorWrapperDropout1Out(Dropout1Out);
-    grad_node->SetTensorWrapperDropout2Out(Dropout2Out);
+    grad_node->SetTensorWrapper_X(X);
+    grad_node->SetTensorWrapper_Linear1Weight(Linear1Weight);
+    grad_node->SetTensorWrapper_Linear1Bias(Linear1Bias);
+    grad_node->SetTensorWrapper_Linear2Weight(Linear2Weight);
+    grad_node->SetTensorWrapper_Dropout1Mask(Dropout1Mask);
+    grad_node->SetTensorWrapper_Dropout2Mask(Dropout2Mask);
+    grad_node->SetTensorWrapper_Linear1Out(Linear1Out);
+    grad_node->SetTensorWrapper_Dropout1Out(Dropout1Out);
+    grad_node->SetTensorWrapper_Dropout2Out(Dropout2Out);
     grad_node->SetGradOutMeta(X, 0);
     grad_node->SetGradOutMeta(Linear1Weight, 3);
@@ -339,24 +339,24 @@ fused_feedforward_dygraph_function(
     grad_node->SetGradOutMeta(Linear2Weight, 5);
     if (pre_layer_norm) {
-      grad_node->SetTensorWrapperLn1Scale(Ln1Scale);
-      grad_node->SetTensorWrapperLn1Bias(Ln1Bias);
-      grad_node->SetTensorWrapperLn1Out(Ln1Out);
-      grad_node->SetTensorWrapperLn1Mean(Ln1Mean);
-      grad_node->SetTensorWrapperLn1Variance(Ln1Variance);
+      grad_node->SetTensorWrapper_Ln1Scale(Ln1Scale);
+      grad_node->SetTensorWrapper_Ln1Bias(Ln1Bias);
+      grad_node->SetTensorWrapper_Ln1Out(Ln1Out);
+      grad_node->SetTensorWrapper_Ln1Mean(Ln1Mean);
+      grad_node->SetTensorWrapper_Ln1Variance(Ln1Variance);
       grad_node->SetGradOutMeta(Ln1Scale, 7);
       grad_node->SetGradOutMeta(Ln1Bias, 8);
     } else {
-      grad_node->SetTensorWrapperLn2Scale(Ln2Scale);
+      grad_node->SetTensorWrapper_Ln2Scale(Ln2Scale);
       grad_node->SetGradOutMeta(Ln2Scale, 9);
-      grad_node->SetTensorWrapperLn2Bias(Ln2Bias);
+      grad_node->SetTensorWrapper_Ln2Bias(Ln2Bias);
       grad_node->SetGradOutMeta(Ln2Bias, 10);
-      grad_node->SetTensorWrapperLn2Mean(Ln2Mean);
-      grad_node->SetTensorWrapperLn2Variance(Ln2Variance);
+      grad_node->SetTensorWrapper_Ln2Mean(Ln2Mean);
+      grad_node->SetTensorWrapper_Ln2Variance(Ln2Variance);
     }
     if (Linear2Bias.initialized()) {
-      grad_node->SetTensorWrapperLinear2Bias(Linear2Bias);
+      grad_node->SetTensorWrapper_Linear2Bias(Linear2Bias);
       grad_node->SetGradOutMeta(Linear2Bias, 6);
     }
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc
index c42a099cef4b0..b9e2a52228bcb 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc
+++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc
@@ -324,28 +324,28 @@ fused_gate_attention_dygraph_function(
     grad_node->SetAttrMap(std::move(attrs));
     grad_node->SetDefaultAttrMap(std::move(default_attrs));
-    grad_node->SetTensorWrapperFMHAOut(FMHAOut);
-    grad_node->SetTensorWrapperQuery(Query);
-    grad_node->SetTensorWrapperSoftmaxOut(SoftmaxOut);
-    grad_node->SetTensorWrapperOutLinearBias(OutLinearBias);
-    grad_node->SetTensorWrapperOutLinearWeight(OutLinearWeight);
+    grad_node->SetTensorWrapper_FMHAOut(FMHAOut);
+    grad_node->SetTensorWrapper_Query(Query);
+    grad_node->SetTensorWrapper_SoftmaxOut(SoftmaxOut);
+    grad_node->SetTensorWrapper_OutLinearBias(OutLinearBias);
+    grad_node->SetTensorWrapper_OutLinearWeight(OutLinearWeight);
     grad_node->SetGradOutMeta(Query, 0);
grad_node->SetGradOutMeta(OutLinearWeight, 10); grad_node->SetGradOutMeta(OutLinearBias, 11); if (merge_qkv) { - grad_node->SetTensorWrapperQKVTransposeOut(QKVTransposeOut); - grad_node->SetTensorWrapperQKVWeight(QKVWeight); + grad_node->SetTensorWrapper_QKVTransposeOut(QKVTransposeOut); + grad_node->SetTensorWrapper_QKVWeight(QKVWeight); grad_node->SetGradOutMeta(QKVWeight, 5); } else { - grad_node->SetTensorWrapperKey(Key); - grad_node->SetTensorWrapperQueryWeight(QueryWeight); - grad_node->SetTensorWrapperKeyWeight(KeyWeight); - grad_node->SetTensorWrapperValueWeight(ValueWeight); - grad_node->SetTensorWrapperQueryTransposeOut(QueryTransposeOut); - grad_node->SetTensorWrapperKeyTransposeOut(KeyTransposeOut); - grad_node->SetTensorWrapperValueTransposeOut(ValueTransposeOut); + grad_node->SetTensorWrapper_Key(Key); + grad_node->SetTensorWrapper_QueryWeight(QueryWeight); + grad_node->SetTensorWrapper_KeyWeight(KeyWeight); + grad_node->SetTensorWrapper_ValueWeight(ValueWeight); + grad_node->SetTensorWrapper_QueryTransposeOut(QueryTransposeOut); + grad_node->SetTensorWrapper_KeyTransposeOut(KeyTransposeOut); + grad_node->SetTensorWrapper_ValueTransposeOut(ValueTransposeOut); grad_node->SetGradOutMeta(Key, 1); grad_node->SetGradOutMeta(QueryWeight, 2); @@ -354,21 +354,21 @@ fused_gate_attention_dygraph_function( } if (has_gating) { - grad_node->SetTensorWrapperGateWeight(GateWeight); + grad_node->SetTensorWrapper_GateWeight(GateWeight); grad_node->SetGradOutMeta(GateWeight, 8); - grad_node->SetTensorWrapperGateBias(GateBias); + grad_node->SetTensorWrapper_GateBias(GateBias); grad_node->SetGradOutMeta(GateBias, 9); - grad_node->SetTensorWrapperGateOut(GateOut); + grad_node->SetTensorWrapper_GateOut(GateOut); } if (NonbatchedBias.initialized()) { - grad_node->SetTensorWrapperNonbatchedBias(NonbatchedBias); + grad_node->SetTensorWrapper_NonbatchedBias(NonbatchedBias); grad_node->SetGradOutMeta(NonbatchedBias, 6); } if (use_flash_attn) { - grad_node->SetTensorWrapperSoftmaxLse(SoftmaxLse); - grad_node->SetTensorWrapperSrcMask(SrcMask); + grad_node->SetTensorWrapper_SoftmaxLse(SoftmaxLse); + grad_node->SetTensorWrapper_SrcMask(SrcMask); grad_node->SetGradOutMeta(SrcMask, 7); } diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc index c4ae0840c294f..15c0fdfd0d1ff 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc @@ -111,8 +111,8 @@ paddle::Tensor fused_gemm_epilogue_dygraph_function( grad_node->SetDefaultAttrMap(std::move(default_attrs)); // Set Tensor Wrappers - grad_node->SetTensorWrapperX(X); - grad_node->SetTensorWrapperY(Y); + grad_node->SetTensorWrapper_X(X); + grad_node->SetTensorWrapper_Y(Y); grad_node->SetGradOutMeta(X, 0); grad_node->SetGradOutMeta(Y, 1); diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h index 212f9d9f1da19..e8c80e635b155 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h @@ -90,65 +90,65 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase { } // SetX, SetY, ... 
-  void SetTensorWrapperFMHAOut(const paddle::Tensor& FMHAOut) {
+  void SetTensorWrapper_FMHAOut(const paddle::Tensor& FMHAOut) {
     FMHAOut_ = egr::TensorWrapper(FMHAOut, false);
   }
-  void SetTensorWrapperGateBias(const paddle::Tensor& GateBias) {
+  void SetTensorWrapper_GateBias(const paddle::Tensor& GateBias) {
     GateBias_ = egr::TensorWrapper(GateBias, false);
   }
-  void SetTensorWrapperGateOut(const paddle::Tensor& GateOut) {
+  void SetTensorWrapper_GateOut(const paddle::Tensor& GateOut) {
     GateOut_ = egr::TensorWrapper(GateOut, false);
   }
-  void SetTensorWrapperGateWeight(const paddle::Tensor& GateWeight) {
+  void SetTensorWrapper_GateWeight(const paddle::Tensor& GateWeight) {
     GateWeight_ = egr::TensorWrapper(GateWeight, false);
   }
-  void SetTensorWrapperNonbatchedBias(const paddle::Tensor& NonbatchedBias) {
+  void SetTensorWrapper_NonbatchedBias(const paddle::Tensor& NonbatchedBias) {
     NonbatchedBias_ = egr::TensorWrapper(NonbatchedBias, false);
   }
-  void SetTensorWrapperSrcMask(const paddle::Tensor& SrcMask) {
+  void SetTensorWrapper_SrcMask(const paddle::Tensor& SrcMask) {
    SrcMask_ = egr::TensorWrapper(SrcMask, false);
   }
-  void SetTensorWrapperOutLinearBias(const paddle::Tensor& OutLinearBias) {
+  void SetTensorWrapper_OutLinearBias(const paddle::Tensor& OutLinearBias) {
     OutLinearBias_ = egr::TensorWrapper(OutLinearBias, false);
   }
-  void SetTensorWrapperOutLinearWeight(const paddle::Tensor& OutLinearWeight) {
+  void SetTensorWrapper_OutLinearWeight(const paddle::Tensor& OutLinearWeight) {
     OutLinearWeight_ = egr::TensorWrapper(OutLinearWeight, false);
   }
-  void SetTensorWrapperQKVTransposeOut(const paddle::Tensor& QKVTransposeOut) {
+  void SetTensorWrapper_QKVTransposeOut(const paddle::Tensor& QKVTransposeOut) {
     QKVTransposeOut_ = egr::TensorWrapper(QKVTransposeOut, false);
   }
-  void SetTensorWrapperQKVWeight(const paddle::Tensor& QKVWeight) {
+  void SetTensorWrapper_QKVWeight(const paddle::Tensor& QKVWeight) {
     QKVWeight_ = egr::TensorWrapper(QKVWeight, false);
   }
-  void SetTensorWrapperQuery(const paddle::Tensor& Query) {
+  void SetTensorWrapper_Query(const paddle::Tensor& Query) {
     Query_ = egr::TensorWrapper(Query, false);
   }
-  void SetTensorWrapperSoftmaxOut(const paddle::Tensor& SoftmaxOut) {
+  void SetTensorWrapper_SoftmaxOut(const paddle::Tensor& SoftmaxOut) {
     SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false);
   }
-  void SetTensorWrapperSoftmaxLse(const paddle::Tensor& SoftmaxLse) {
+  void SetTensorWrapper_SoftmaxLse(const paddle::Tensor& SoftmaxLse) {
     SoftmaxLse_ = egr::TensorWrapper(SoftmaxLse, false);
   }
-  void SetTensorWrapperKey(const paddle::Tensor& Key) {
+  void SetTensorWrapper_Key(const paddle::Tensor& Key) {
     Key_ = egr::TensorWrapper(Key, false);
   }
-  void SetTensorWrapperQueryWeight(const paddle::Tensor& QueryWeight) {
+  void SetTensorWrapper_QueryWeight(const paddle::Tensor& QueryWeight) {
     QueryWeight_ = egr::TensorWrapper(QueryWeight, false);
   }
-  void SetTensorWrapperKeyWeight(const paddle::Tensor& KeyWeight) {
+  void SetTensorWrapper_KeyWeight(const paddle::Tensor& KeyWeight) {
     KeyWeight_ = egr::TensorWrapper(KeyWeight, false);
   }
-  void SetTensorWrapperValueWeight(const paddle::Tensor& ValueWeight) {
+  void SetTensorWrapper_ValueWeight(const paddle::Tensor& ValueWeight) {
     ValueWeight_ = egr::TensorWrapper(ValueWeight, false);
   }
-  void SetTensorWrapperQueryTransposeOut(
+  void SetTensorWrapper_QueryTransposeOut(
       const paddle::Tensor& QueryTransposeOut) {
     QueryTransposeOut_ = egr::TensorWrapper(QueryTransposeOut, false);
   }
-  void SetTensorWrapperKeyTransposeOut(const paddle::Tensor& KeyTransposeOut) {
+  void SetTensorWrapper_KeyTransposeOut(const paddle::Tensor& KeyTransposeOut) {
     KeyTransposeOut_ = egr::TensorWrapper(KeyTransposeOut, false);
   }
-  void SetTensorWrapperValueTransposeOut(
+  void SetTensorWrapper_ValueTransposeOut(
      const paddle::Tensor& ValueTransposeOut) {
     ValueTransposeOut_ = egr::TensorWrapper(ValueTransposeOut, false);
   }
@@ -240,63 +240,63 @@ class fused_feedforwardGradNodeCompat : public egr::GradNodeBase {
   }
   // SetX, SetY, ...
-  void SetTensorWrapperDropout1Mask(const paddle::Tensor& Dropout1Mask) {
+  void SetTensorWrapper_Dropout1Mask(const paddle::Tensor& Dropout1Mask) {
     Dropout1Mask_ = egr::TensorWrapper(Dropout1Mask, false);
   }
-  void SetTensorWrapperDropout1Out(const paddle::Tensor& Dropout1Out) {
+  void SetTensorWrapper_Dropout1Out(const paddle::Tensor& Dropout1Out) {
     Dropout1Out_ = egr::TensorWrapper(Dropout1Out, false);
   }
-  void SetTensorWrapperDropout2Mask(const paddle::Tensor& Dropout2Mask) {
+  void SetTensorWrapper_Dropout2Mask(const paddle::Tensor& Dropout2Mask) {
     Dropout2Mask_ = egr::TensorWrapper(Dropout2Mask, false);
   }
-  void SetTensorWrapperDropout2Out(const paddle::Tensor& Dropout2Out) {
+  void SetTensorWrapper_Dropout2Out(const paddle::Tensor& Dropout2Out) {
     auto pre_layer_norm = GetAttrWithDefault<bool>(
         attr_map_, default_attr_map_, "pre_layer_norm");
     Dropout2Out_ = egr::TensorWrapper(Dropout2Out, pre_layer_norm);
   }
-  void SetTensorWrapperLinear1Bias(const paddle::Tensor& Linear1Bias) {
+  void SetTensorWrapper_Linear1Bias(const paddle::Tensor& Linear1Bias) {
     Linear1Bias_ = egr::TensorWrapper(Linear1Bias, false);
   }
-  void SetTensorWrapperLinear1Out(const paddle::Tensor& Linear1Out) {
+  void SetTensorWrapper_Linear1Out(const paddle::Tensor& Linear1Out) {
     Linear1Out_ = egr::TensorWrapper(Linear1Out, false);
   }
-  void SetTensorWrapperLinear1Weight(const paddle::Tensor& Linear1Weight) {
+  void SetTensorWrapper_Linear1Weight(const paddle::Tensor& Linear1Weight) {
     Linear1Weight_ = egr::TensorWrapper(Linear1Weight, false);
   }
-  void SetTensorWrapperLinear2Bias(const paddle::Tensor& Linear2Bias) {
+  void SetTensorWrapper_Linear2Bias(const paddle::Tensor& Linear2Bias) {
     Linear2Bias_ = egr::TensorWrapper(Linear2Bias, false);
   }
-  void SetTensorWrapperLinear2Weight(const paddle::Tensor& Linear2Weight) {
+  void SetTensorWrapper_Linear2Weight(const paddle::Tensor& Linear2Weight) {
     Linear2Weight_ = egr::TensorWrapper(Linear2Weight, false);
   }
-  void SetTensorWrapperLn2Bias(const paddle::Tensor& Ln2Bias) {
+  void SetTensorWrapper_Ln2Bias(const paddle::Tensor& Ln2Bias) {
     Ln2Bias_ = egr::TensorWrapper(Ln2Bias, false);
   }
-  void SetTensorWrapperLn2Mean(const paddle::Tensor& Ln2Mean) {
+  void SetTensorWrapper_Ln2Mean(const paddle::Tensor& Ln2Mean) {
     Ln2Mean_ = egr::TensorWrapper(Ln2Mean, false);
   }
-  void SetTensorWrapperLn2Scale(const paddle::Tensor& Ln2Scale) {
+  void SetTensorWrapper_Ln2Scale(const paddle::Tensor& Ln2Scale) {
     Ln2Scale_ = egr::TensorWrapper(Ln2Scale, false);
   }
-  void SetTensorWrapperLn2Variance(const paddle::Tensor& Ln2Variance) {
+  void SetTensorWrapper_Ln2Variance(const paddle::Tensor& Ln2Variance) {
     Ln2Variance_ = egr::TensorWrapper(Ln2Variance, false);
   }
-  void SetTensorWrapperX(const paddle::Tensor& X) {
+  void SetTensorWrapper_X(const paddle::Tensor& X) {
     X_ = egr::TensorWrapper(X, false);
   }
-  void SetTensorWrapperLn1Scale(const paddle::Tensor& Ln1Scale) {
+  void SetTensorWrapper_Ln1Scale(const paddle::Tensor& Ln1Scale) {
     Ln1Scale_ = egr::TensorWrapper(Ln1Scale, false);
   }
-  void SetTensorWrapperLn1Bias(const paddle::Tensor& Ln1Bias) {
+  void SetTensorWrapper_Ln1Bias(const paddle::Tensor& Ln1Bias) {
     Ln1Bias_ = egr::TensorWrapper(Ln1Bias, false);
   }
-  void SetTensorWrapperLn1Out(const paddle::Tensor& Ln1Out) {
+  void SetTensorWrapper_Ln1Out(const paddle::Tensor& Ln1Out) {
     Ln1Out_ = egr::TensorWrapper(Ln1Out, false);
   }
-  void SetTensorWrapperLn1Mean(const paddle::Tensor& Ln1Mean) {
+  void SetTensorWrapper_Ln1Mean(const paddle::Tensor& Ln1Mean) {
     Ln1Mean_ = egr::TensorWrapper(Ln1Mean, false);
   }
-  void SetTensorWrapperLn1Variance(const paddle::Tensor& Ln1Variance) {
+  void SetTensorWrapper_Ln1Variance(const paddle::Tensor& Ln1Variance) {
     Ln1Variance_ = egr::TensorWrapper(Ln1Variance, false);
   }
   // SetAttrMap
@@ -393,90 +393,90 @@ class fused_attentionGradNodeCompat : public egr::GradNodeBase {
   }
   // SetX, SetY, ...
-  void SetTensorWrapperAttnDropoutMaskOut(
+  void SetTensorWrapper_AttnDropoutMaskOut(
       const paddle::Tensor& AttnDropoutMaskOut) {
     AttnDropoutMaskOut_ = egr::TensorWrapper(AttnDropoutMaskOut, false);
   }
-  void SetTensorWrapperAttnDropoutOut(const paddle::Tensor& AttnDropoutOut) {
+  void SetTensorWrapper_AttnDropoutOut(const paddle::Tensor& AttnDropoutOut) {
     AttnDropoutOut_ = egr::TensorWrapper(AttnDropoutOut, false);
   }
-  void SetTensorWrapperBiasDropoutResidualOut(
+  void SetTensorWrapper_BiasDropoutResidualOut(
      const paddle::Tensor& BiasDropoutResidualOut) {
     BiasDropoutResidualOut_ =
         egr::TensorWrapper(BiasDropoutResidualOut, false);
   }
-  void SetTensorWrapperDropoutMaskOut(const paddle::Tensor& DropoutMaskOut) {
+  void SetTensorWrapper_DropoutMaskOut(const paddle::Tensor& DropoutMaskOut) {
     DropoutMaskOut_ = egr::TensorWrapper(DropoutMaskOut, false);
   }
-  void SetTensorWrapperFMHAOut(const paddle::Tensor& FMHAOut) {
+  void SetTensorWrapper_FMHAOut(const paddle::Tensor& FMHAOut) {
     FMHAOut_ = egr::TensorWrapper(FMHAOut, false);
   }
-  void SetTensorWrapperLn2Bias(const paddle::Tensor& Ln2Bias) {
+  void SetTensorWrapper_Ln2Bias(const paddle::Tensor& Ln2Bias) {
     Ln2Bias_ = egr::TensorWrapper(Ln2Bias, false);
   }
-  void SetTensorWrapperLn2Mean(const paddle::Tensor& Ln2Mean) {
+  void SetTensorWrapper_Ln2Mean(const paddle::Tensor& Ln2Mean) {
     Ln2Mean_ = egr::TensorWrapper(Ln2Mean, false);
   }
-  void SetTensorWrapperLn2Scale(const paddle::Tensor& Ln2Scale) {
+  void SetTensorWrapper_Ln2Scale(const paddle::Tensor& Ln2Scale) {
     Ln2Scale_ = egr::TensorWrapper(Ln2Scale, false);
   }
-  void SetTensorWrapperLn2Variance(const paddle::Tensor& Ln2Variance) {
+  void SetTensorWrapper_Ln2Variance(const paddle::Tensor& Ln2Variance) {
     Ln2Variance_ = egr::TensorWrapper(Ln2Variance, false);
   }
-  void SetTensorWrapperOutLinearBias(const paddle::Tensor& OutLinearBias) {
+  void SetTensorWrapper_OutLinearBias(const paddle::Tensor& OutLinearBias) {
     OutLinearBias_ = egr::TensorWrapper(OutLinearBias, false);
   }
-  void SetTensorWrapperOutLinearOut(const paddle::Tensor& OutLinearOut) {
+  void SetTensorWrapper_OutLinearOut(const paddle::Tensor& OutLinearOut) {
     OutLinearOut_ = egr::TensorWrapper(OutLinearOut, true);
   }
-  void SetTensorWrapperOutLinearW(const paddle::Tensor& OutLinearW) {
+  void SetTensorWrapper_OutLinearW(const paddle::Tensor& OutLinearW) {
     OutLinearW_ = egr::TensorWrapper(OutLinearW, false);
   }
-  void SetTensorWrapperQKOut(const paddle::Tensor& QKOut) {
+  void SetTensorWrapper_QKOut(const paddle::Tensor& QKOut) {
     QKOut_ = egr::TensorWrapper(QKOut, true);
   }
-  void SetTensorWrapperQKTVOut(const paddle::Tensor& QKTVOut) {
+  void SetTensorWrapper_QKTVOut(const paddle::Tensor& QKTVOut) {
     QKTVOut_ = egr::TensorWrapper(QKTVOut, true);
   }
-  void SetTensorWrapperQKVBias(const paddle::Tensor& QKVBias) {
+  void SetTensorWrapper_QKVBias(const paddle::Tensor& QKVBias) {
     QKVBias_ = egr::TensorWrapper(QKVBias, false);
   }
-  void SetTensorWrapperQKVBiasOut(const paddle::Tensor& QKVBiasOut) {
+  void SetTensorWrapper_QKVBiasOut(const paddle::Tensor& QKVBiasOut) {
     QKVBiasOut_ = egr::TensorWrapper(QKVBiasOut, true);
   }
-  void SetTensorWrapperQKVOut(const paddle::Tensor& QKVOut) {
+  void SetTensorWrapper_QKVOut(const paddle::Tensor& QKVOut) {
     QKVOut_ = egr::TensorWrapper(QKVOut, true);
   }
-  void SetTensorWrapperQKVW(const paddle::Tensor& QKVW) {
+  void SetTensorWrapper_QKVW(const paddle::Tensor& QKVW) {
     QKVW_ = egr::TensorWrapper(QKVW, false);
   }
-  void SetTensorWrapperSoftmaxOut(const paddle::Tensor& SoftmaxOut) {
+  void SetTensorWrapper_SoftmaxOut(const paddle::Tensor& SoftmaxOut) {
     SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false);
   }
-  void SetTensorWrapperSrcMask(const paddle::Tensor& SrcMask) {
+  void SetTensorWrapper_SrcMask(const paddle::Tensor& SrcMask) {
     SrcMask_ = egr::TensorWrapper(SrcMask, true);
   }
-  void SetTensorWrapperSrcMaskOut(const paddle::Tensor& SrcMaskOut) {
+  void SetTensorWrapper_SrcMaskOut(const paddle::Tensor& SrcMaskOut) {
     SrcMaskOut_ = egr::TensorWrapper(SrcMaskOut, false);
   }
-  void SetTensorWrapperTransposeOut2(const paddle::Tensor& TransposeOut2) {
+  void SetTensorWrapper_TransposeOut2(const paddle::Tensor& TransposeOut2) {
     TransposeOut2_ = egr::TensorWrapper(TransposeOut2, false);
   }
-  void SetTensorWrapperX(const paddle::Tensor& X) {
+  void SetTensorWrapper_X(const paddle::Tensor& X) {
     X_ = egr::TensorWrapper(X, false);
   }
-  void SetTensorWrapperLnScale(const paddle::Tensor& LnScale) {
+  void SetTensorWrapper_LnScale(const paddle::Tensor& LnScale) {
     LnScale_ = egr::TensorWrapper(LnScale, false);
   }
-  void SetTensorWrapperLnBias(const paddle::Tensor& LnBias) {
+  void SetTensorWrapper_LnBias(const paddle::Tensor& LnBias) {
     LnBias_ = egr::TensorWrapper(LnBias, false);
   }
-  void SetTensorWrapperLnOut(const paddle::Tensor& LnOut) {
+  void SetTensorWrapper_LnOut(const paddle::Tensor& LnOut) {
     LnOut_ = egr::TensorWrapper(LnOut, false);
   }
-  void SetTensorWrapperLnMean(const paddle::Tensor& LnMean) {
+  void SetTensorWrapper_LnMean(const paddle::Tensor& LnMean) {
     LnMean_ = egr::TensorWrapper(LnMean, false);
   }
-  void SetTensorWrapperLnVariance(const paddle::Tensor& LnVariance) {
+  void SetTensorWrapper_LnVariance(const paddle::Tensor& LnVariance) {
     LnVariance_ = egr::TensorWrapper(LnVariance, false);
   }
@@ -563,10 +563,10 @@ class fused_gemm_epilogueGradNodeCompat : public egr::GradNodeBase {
   }
   // SetX, SetY, ...
-  void SetTensorWrapperX(const paddle::Tensor& X) {
+  void SetTensorWrapper_X(const paddle::Tensor& X) {
     X_ = egr::TensorWrapper(X, false);
   }
-  void SetTensorWrapperY(const paddle::Tensor& Y) {
+  void SetTensorWrapper_Y(const paddle::Tensor& Y) {
     Y_ = egr::TensorWrapper(Y, false);
   }
@@ -640,32 +640,32 @@ class fused_bias_dropout_residual_layer_normGradNodeCompat
   }
   // SetX, SetY, ...
- void SetTensorWrapperBias(const paddle::Tensor& Bias) { + void SetTensorWrapper_Bias(const paddle::Tensor& Bias) { Bias_ = egr::TensorWrapper(Bias, false); } - void SetTensorWrapperBiasDropoutResidualOut( + void SetTensorWrapper_BiasDropoutResidualOut( const paddle::Tensor& BiasDropoutResidualOut) { BiasDropoutResidualOut_ = egr::TensorWrapper(BiasDropoutResidualOut, false); } - void SetTensorWrapperDropoutMaskOut(const paddle::Tensor& DropoutMaskOut) { + void SetTensorWrapper_DropoutMaskOut(const paddle::Tensor& DropoutMaskOut) { DropoutMaskOut_ = egr::TensorWrapper(DropoutMaskOut, false); } - void SetTensorWrapperLnBias(const paddle::Tensor& LnBias) { + void SetTensorWrapper_LnBias(const paddle::Tensor& LnBias) { LnBias_ = egr::TensorWrapper(LnBias, false); } - void SetTensorWrapperLnMean(const paddle::Tensor& LnMean) { + void SetTensorWrapper_LnMean(const paddle::Tensor& LnMean) { LnMean_ = egr::TensorWrapper(LnMean, false); } - void SetTensorWrapperLnScale(const paddle::Tensor& LnScale) { + void SetTensorWrapper_LnScale(const paddle::Tensor& LnScale) { LnScale_ = egr::TensorWrapper(LnScale, false); } - void SetTensorWrapperLnVariance(const paddle::Tensor& LnVariance) { + void SetTensorWrapper_LnVariance(const paddle::Tensor& LnVariance) { LnVariance_ = egr::TensorWrapper(LnVariance, false); } - void SetTensorWrapperResidual(const paddle::Tensor& Residual) { + void SetTensorWrapper_Residual(const paddle::Tensor& Residual) { Residual_ = egr::TensorWrapper(Residual, false); } - void SetTensorWrapperX(const paddle::Tensor& X) { + void SetTensorWrapper_X(const paddle::Tensor& X) { X_ = egr::TensorWrapper(X, false); } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index b9e04b3e318ac..66b4d05f68bf0 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1204,7 +1204,7 @@ static std::string GenerateGradNodeCreationContent( for (auto& kv : grad_ins_fwd_slotname_map) { const std::string& tensor_wrapper_name = kv.second; const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s);\n"; + " grad_node->SetTensorWrapper_%s(%s);\n"; // Replace output directly with input in inplace op. 
        if (!forward_inplace_map.empty() &&
            forward_inplace_map.count(tensor_wrapper_name)) {
@@ -2941,7 +2941,7 @@ static std::string GenerateGradNodeHeaderContents(
          CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name);
     }
     const char* SET_TENSOR_WRAPPER_TEMPLATE =
-        "  void SetTensorWrapper%s(%s) {\n    %s\n  }\n";
+        "  void SetTensorWrapper_%s(%s) {\n    %s\n  }\n";
     set_tensor_wrappers_str +=
         paddle::string::Sprintf(SET_TENSOR_WRAPPER_TEMPLATE,
                                 tensor_wrapper_name,
diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
index e1ad1a0dc81b2..dad46949d70ea 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -135,12 +135,12 @@ def ParseArguments():
 ######################
 # Code Gen Templates #
 ######################
-SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """  void SetTensorWrapper{}(const paddle::Tensor& {}) {{
+SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """  void SetTensorWrapper_{}(const paddle::Tensor& {}) {{
     {} = egr::TensorWrapper({}, {});
   }}
 """
-SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """  void SetTensorWrapper{}(const std::vector<paddle::Tensor>& {}) {{
+SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """  void SetTensorWrapper_{}(const std::vector<paddle::Tensor>& {}) {{
     for(const auto& eager_tensor : {}) {{
       {}.emplace_back(egr::TensorWrapper(eager_tensor, {}));
     }};
@@ -161,7 +161,7 @@ def ParseArguments():
   }}
 """
-SET_ATTR_METHOD_TEMPLATE = """  void SetAttribute{}({} {}) {{
+SET_ATTR_METHOD_TEMPLATE = """  void SetAttribute_{}({} {}) {{
     {} = {};
   }}
 """
@@ -1062,10 +1062,10 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False):
         for name, _, default_val_attr, _ in backward_attrs_list:
             if name in forward_attrs_name_set:
                 set_attributes = (
-                    f"{indent}grad_node->SetAttribute{name}({name});"
+                    f"{indent}grad_node->SetAttribute_{name}({name});"
                 )
             else:
-                set_attributes = f"{indent}grad_node->SetAttribute{name}({default_val_attr});"
+                set_attributes = f"{indent}grad_node->SetAttribute_{name}({default_val_attr});"
             set_attributes_list.append(set_attributes)
         set_attributes_str = "\n".join(set_attributes_list)
@@ -1089,7 +1089,7 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False):
                     if is_inplace_input:
                         set_tensor_wrappers = """{indent}if({name}) {
                           auto {name}_clone = paddle::experimental::assign({name});
-                          grad_node->SetTensorWrapper{name}(*{name}_clone);}""".format_map(
+                          grad_node->SetTensorWrapper_{name}(*{name}_clone);}""".format_map(
                            {"indent": indent, "name": name}
                         )
                     else:
@@ -1100,16 +1100,16 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False):
                            or (name in self.optional_inputs)
                         ):
                             if for_backward is False:
-                                set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper{name}(*{name});"
+                                set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper_{name}(*{name});"
                             else:
-                                set_tensor_wrappers = f"{indent}if({name}_optional) grad_node->SetTensorWrapper{name}(*{name}_optional);"
+                                set_tensor_wrappers = f"{indent}if({name}_optional) grad_node->SetTensorWrapper_{name}(*{name}_optional);"
                         else:
                             need_pre_contiguous_set.add(name)
-                            set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper{name}(*{name}_tmp);"
+                            set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper_{name}(*{name}_tmp);"
                 else:
                     if is_inplace_input:
-                        set_tensor_wrappers = f"{indent}auto {name}_clone = paddle::experimental::assign({name});\n{indent}grad_node->SetTensorWrapper{name}({name}_clone);"
+                        set_tensor_wrappers = f"{indent}auto {name}_clone = paddle::experimental::assign({name});\n{indent}grad_node->SetTensorWrapper_{name}({name}_clone);"
                     else:
                        if (
                             (forward_api_name in strided_op_list)
@@ -1117,10 +1117,10 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False):
                             or IsVectorTensorType(atype)
                             or (name in self.optional_inputs)
                         ):
-                            set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});"
+                            set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper_{name}({name});"
                         else:
                             need_pre_contiguous_set.add(name)
-                            set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}_tmp);"
+                            set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper_{name}({name}_tmp);"
                 set_input_tensor_wrappers_list.append(set_tensor_wrappers)
             else:  # Forwad's output as backward's input
                 if num_fwd_outputs > 1:
@@ -1130,7 +1130,7 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False):
                    ), AssertMessage(name, forward_outputs_position_map.keys())
                 set_tensor_wrappers = (
-                    f"{indent}grad_node->SetTensorWrapper{name}({name});"
+                    f"{indent}grad_node->SetTensorWrapper_{name}({name});"
                 )
                 set_output_tensor_wrappers_list.append(set_tensor_wrappers)
         set_input_tensor_wrappers_str = "\n".join(
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 6611d108adcf5..2094fef07a873 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -1783,13 +1783,13 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self,
       grad_node = std::shared_ptr<SetValueWithTensorGradNode>(
           new SetValueWithTensorGradNode(1, 2));  // NOLINT
-      grad_node->SetAttributestarts(slice_starts);
-      grad_node->SetAttributeends(slice_ends);
-      grad_node->SetAttributesteps(slice_strides);
-      grad_node->SetAttributeaxes(slice_axes);
-      grad_node->SetAttributedecrease_axes(decrease_axis);
-      grad_node->SetAttributenone_axes(none_axes);
-      grad_node->SetTensorWrappervalues(values_tmp);
+      grad_node->SetAttribute_starts(slice_starts);
+      grad_node->SetAttribute_ends(slice_ends);
+      grad_node->SetAttribute_steps(slice_strides);
+      grad_node->SetAttribute_axes(slice_axes);
+      grad_node->SetAttribute_decrease_axes(decrease_axis);
+      grad_node->SetAttribute_none_axes(none_axes);
+      grad_node->SetTensorWrapper_values(values_tmp);
       paddle::memory::LogDeviceMemoryStats(
           egr::Controller::Instance().GetExpectedPlace(),
From 52498e2a302cd6f09c126219c04c4879182c26e2 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 23 Feb 2024 12:10:12 +0800
Subject: [PATCH 17/82] Update test_auto_parallel_partitioner.py, test=document_fix (#61982)
---
 test/legacy_test/test_auto_parallel_partitioner.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/test/legacy_test/test_auto_parallel_partitioner.py b/test/legacy_test/test_auto_parallel_partitioner.py
index 1a0d70c232b36..da402d93ce58e 100644
--- a/test/legacy_test/test_auto_parallel_partitioner.py
+++ b/test/legacy_test/test_auto_parallel_partitioner.py
@@ -480,7 +480,7 @@ def test_mlp_mp(self):
         self.assertTrue(
             distributed_attr_check_for_program(dist_main_prog, dist_context)
         )
-        # check distribured attr for dist op
+        # check distributed attr for dist op
         serial_op_idx = [1, 4]
         dist_op_idx = [[1, 2], [4, 5]]
         self.assertTrue(
@@ -573,7 +573,7 @@ def test_mlp_dp_mp(self):
         self.assertTrue(
             distributed_attr_check_for_program(dist_main_prog, dist_context)
         )
-        # check distribured attr for dist op
+        # check distributed attr for dist op
         serial_op_idx = [1, 4]
         dist_op_idx = [[1, 2], [4, 5]]
         self.assertTrue(
@@ -869,7 +869,7 @@ def test_attn_mp(self):
         self.assertTrue(
             distributed_attr_check_for_program(dist_main_prog, dist_context)
         )
-        # check distribured attr for dist op
+        # check distributed attr for dist op
         serial_op_idx = [0, 4, 6, 18]
         dist_op_idx = [[0, 1], [4, 5], [6, 7], [18, 19]]
@@ -976,7 +976,7 @@ def test_attn_dp_mp(self):
         self.assertTrue(
             distributed_attr_check_for_program(dist_main_prog, dist_context)
         )
-        # check distribured attr for dist op
+        # check distributed attr for dist op
         serial_op_idx = [0, 4, 6, 18]
         dist_op_idx = [[0, 1], [4, 5], [6, 7], [18, 19]]
@@ -1364,7 +1364,7 @@ def test_decoder_dp_mp(self):
         self.assertTrue(
             distributed_attr_check_for_program(dist_main_prog, dist_context)
         )
-        # check distribured attr
+        # check distributed attr
         serial_op_idx = [0, 5, 9, 11, 24, 29, 32]
         dist_op_idx = [
             [2, 3],
From 508c717ed4e54adc05fb3eeacacfccc45ebd741d Mon Sep 17 00:00:00 2001
From: co63oc
Date: Fri, 23 Feb 2024 12:11:03 +0800
Subject: [PATCH 18/82] Fix some typos (dtyte, paramerter, etc.) (#61996)
---
 test/legacy_test/test_adadelta_op.py | 2 +-
 test/legacy_test/test_adagrad_op.py | 4 +-
 test/legacy_test/test_adam_op.py | 2 +-
 test/legacy_test/test_adamax_op.py | 2 +-
 test/legacy_test/test_adamw_op.py | 4 +-
 test/legacy_test/test_add_n_op.py | 10 ++--
 test/legacy_test/test_arange.py | 2 +-
 test/legacy_test/test_backward.py | 6 +--
 test/legacy_test/test_bicubic_interp_v2_op.py | 8 ++--
 test/legacy_test/test_checkpoint_saver.py | 2 +-
 test/legacy_test/test_collective_api_base.py | 6 +--
 test/legacy_test/test_conv2d_api.py | 2 +-
 test/legacy_test/test_conv3d_layer.py | 8 ++--
 test/legacy_test/test_conv3d_op.py | 2 +-
 test/legacy_test/test_conv3d_transpose_layer.py | 8 ++--
 test/legacy_test/test_fused_attention_op_api.py | 4 +-
 ...bias_dropout_residual_layer_norm_op_api.py | 4 +-
 test/legacy_test/test_log_softmax.py | 2 +-
 test/legacy_test/test_paddle_save_load_binary.py | 2 +-
 test/legacy_test/test_scatter_nd_op.py | 8 ++--
 test/legacy_test/test_set_value_op.py | 28 +++++------
 test/legacy_test/test_sgd_op.py | 4 +-
 test/legacy_test/test_sgd_op_bf16.py | 2 +-
 test/legacy_test/test_signal.py | 2 +-
 test/legacy_test/test_slice_scatter.py | 8 ++--
 test/legacy_test/test_softmax_op.py | 6 +--
 test/legacy_test/test_softmax_with_cross_entropy_op.py | 2 +-
 test/legacy_test/test_sparse_attention_op.py | 6 +--
 test/legacy_test/test_split_op.py | 24 ++++++----
 test/legacy_test/test_static_pylayer.py | 2 +-
 test/legacy_test/test_static_save_load.py | 46 +++++++++----------
 test/legacy_test/test_static_save_load_bf16.py | 4 +-
 test/legacy_test/test_static_save_load_large.py | 2 +-
 ...tatic_shape_inferrence_for_shape_tensor.py | 2 +-
 test/legacy_test/test_sum_op.py | 12 ++---
 test/legacy_test/test_svd_op.py | 2 +-
 test/legacy_test/test_sync_batch_norm_op.py | 2 +-
 test/legacy_test/test_tensor.py | 2 +-
 test/legacy_test/test_tensor_register_hook.py | 2 +-
 test/legacy_test/test_tensor_uva.py | 2 +-
 test/legacy_test/test_traced_layer_err_msg.py | 2 +-
 test/legacy_test/test_trans_layout_op.py | 8 ++--
 test/legacy_test/test_transformer_api.py | 4 +-
 test/legacy_test/test_tril_triu_op.py | 2 +-
 test/legacy_test/test_truncated_gaussian_random_op.py | 2 +-
 test/legacy_test/test_vision_models.py | 2 +-
 test/legacy_test/test_viterbi_decode_op.py | 6 +--
 test/legacy_test/test_warpctc_op.py | 2 +-
 test/legacy_test/test_where_op.py | 4 +-
 test/legacy_test/test_while_loop_op.py | 2 +-
 test/legacy_test/test_while_op.py | 2 +-
 test/legacy_test/test_zeros_like_op.py | 2 +-
 test/mkldnn/test_elementwise_add_mkldnn_op.py | 8 ++--
 test/mkldnn/test_fused_vit_attention.py | 4 +-
test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py | 2 +- 55 files changed, 153 insertions(+), 147 deletions(-) diff --git a/test/legacy_test/test_adadelta_op.py b/test/legacy_test/test_adadelta_op.py index 9202d6e2aa80f..c7ae043fbac6f 100644 --- a/test/legacy_test/test_adadelta_op.py +++ b/test/legacy_test/test_adadelta_op.py @@ -236,7 +236,7 @@ def test_adadelta_dygraph(self): adam.clear_gradients() -class TestAdadeltaOpMultiPrecison(unittest.TestCase): +class TestAdadeltaOpMultiPrecision(unittest.TestCase): def _test_adadelta_op_dygraph_place_amp(self, place, use_amp=False): import paddle diff --git a/test/legacy_test/test_adagrad_op.py b/test/legacy_test/test_adagrad_op.py index b1aab3c903248..4d356e89d4784 100644 --- a/test/legacy_test/test_adagrad_op.py +++ b/test/legacy_test/test_adagrad_op.py @@ -130,7 +130,7 @@ def check_with_place(self, place): param_array = np.full((height, row_numel), 5.0).astype("float32") param.set(param_array, place) - # create and initialize LeraningRate Variable + # create and initialize LearningRate Variable lr = scope.var('LearningRate').get_tensor() lr_array = np.full((1), 2.0).astype("float32") lr.set(lr_array, place) @@ -211,7 +211,7 @@ def test_sparse_adagrad(self): self.check_with_place(place) -class TestAdagradOpMultiPrecison(unittest.TestCase): +class TestAdagradOpMultiPrecision(unittest.TestCase): def _test_adagrad_op_dygraph_place_amp(self, place, use_amp=False): import paddle diff --git a/test/legacy_test/test_adam_op.py b/test/legacy_test/test_adam_op.py index c06e249a874e0..0693d4f664356 100644 --- a/test/legacy_test/test_adam_op.py +++ b/test/legacy_test/test_adam_op.py @@ -758,7 +758,7 @@ def test_adam_op_with_state_dict(self): state_dict = adam.state_dict() adam.set_state_dict(state_dict) - # leanrning_rate is Tensor + # learning_rate is Tensor with self.assertRaises(TypeError): learning_rate = np.array([0.01]).astype("float32") learning_rate = paddle.to_tensor(learning_rate) diff --git a/test/legacy_test/test_adamax_op.py b/test/legacy_test/test_adamax_op.py index 4087e75398266..48549d6275a9f 100644 --- a/test/legacy_test/test_adamax_op.py +++ b/test/legacy_test/test_adamax_op.py @@ -239,7 +239,7 @@ def test_adamax_op_invalid_input(self): ) -class TestAdamaxOpMultiPrecison(unittest.TestCase): +class TestAdamaxOpMultiPrecision(unittest.TestCase): def _test_adamax_op_dygraph_place_amp(self, place, use_amp=False): import paddle diff --git a/test/legacy_test/test_adamw_op.py b/test/legacy_test/test_adamw_op.py index 752e8076d3b14..1c901e8d4baf5 100644 --- a/test/legacy_test/test_adamw_op.py +++ b/test/legacy_test/test_adamw_op.py @@ -404,7 +404,7 @@ def test_adamw_op_dygraph_bypassing_step(self): adam.clear_gradients() -class TestAdamWOpMultiPrecisonWithMainGrad(unittest.TestCase): +class TestAdamWOpMultiPrecisionWithMainGrad(unittest.TestCase): def _test_adamw_op_dygraph_place_amp_with_maingrad( self, place, shape, use_main_grad ): @@ -543,7 +543,7 @@ def test_main(self): ) -class TestAdamWOpMultiPrecison(unittest.TestCase): +class TestAdamWOpMultiPrecision(unittest.TestCase): def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False): paddle.disable_static() paddle.seed(10) diff --git a/test/legacy_test/test_add_n_op.py b/test/legacy_test/test_add_n_op.py index e543b4f05c74b..60981b9cd02f0 100644 --- a/test/legacy_test/test_add_n_op.py +++ b/test/legacy_test/test_add_n_op.py @@ -70,15 +70,15 @@ def test_add_n_api(self): if not paddle.is_compiled_with_cuda(): return dtypes = ['float32', 'complex64', 'complex128'] - for dtyte in dtypes: - if 
dtyte == 'complex64' or dtyte == 'complex128': + for dtype in dtypes: + if dtype == 'complex64' or dtype == 'complex128': self.x_np = ( np.random.random([self.l, 16, 256]) + 1j * np.random.random([self.l, 16, 256]) - ).astype(dtyte) + ).astype(dtype) - y_np_32, x_g_np_32 = self.check_main(self.x_np, dtyte) - y_np_gt = np.sum(self.x_np, axis=0).astype(dtyte) + y_np_32, x_g_np_32 = self.check_main(self.x_np, dtype) + y_np_gt = np.sum(self.x_np, axis=0).astype(dtype) np.testing.assert_allclose(y_np_32, y_np_gt, rtol=1e-06) diff --git a/test/legacy_test/test_arange.py b/test/legacy_test/test_arange.py index bb4bb0f430b68..fbcc6eb421da5 100644 --- a/test/legacy_test/test_arange.py +++ b/test/legacy_test/test_arange.py @@ -59,7 +59,7 @@ def init_config(self): self.case = (0, 5, 1) -class TestFloa16ArangeOp(TestArangeOp): +class TestFloat16ArangeOp(TestArangeOp): def init_config(self): self.dtype = np.float16 self.python_api = paddle.arange diff --git a/test/legacy_test/test_backward.py b/test/legacy_test/test_backward.py index 2ae9ede04987a..04aeadc038213 100644 --- a/test/legacy_test/test_backward.py +++ b/test/legacy_test/test_backward.py @@ -378,12 +378,12 @@ def callback(block, context): class TestGradientsWithOptimizer(unittest.TestCase): - def _check_grad_op_name(self, forward_list, optimiezed_list): + def _check_grad_op_name(self, forward_list, optimized_list): backward_list = [op + "_grad" for op in reversed(forward_list)] - idx = optimiezed_list.index(backward_list[0], len(backward_list)) + idx = optimized_list.index(backward_list[0], len(backward_list)) self.assertListEqual( - backward_list, optimiezed_list[idx : idx + len(backward_list)] + backward_list, optimized_list[idx : idx + len(backward_list)] ) def test_gradient_with_optimizer(self): diff --git a/test/legacy_test/test_bicubic_interp_v2_op.py b/test/legacy_test/test_bicubic_interp_v2_op.py index 86b998c635648..d2d0092f1e9f6 100644 --- a/test/legacy_test/test_bicubic_interp_v2_op.py +++ b/test/legacy_test/test_bicubic_interp_v2_op.py @@ -674,7 +674,7 @@ def test_case(self): class TestBicubicOpError(unittest.TestCase): def test_imperative_errors(self): - # the input of interpoalte must be Variable. + # the input of interpolate must be Variable. 
x1 = base.create_lod_tensor( np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], base.CPUPlace() ) @@ -687,7 +687,7 @@ def test_mode_type(): ) out = interpolate( - x, size=[12, 12], mode='UNKONWN', align_corners=False + x, size=[12, 12], mode='UNKNOWN', align_corners=False ) def test_input_shape(): @@ -696,7 +696,7 @@ def test_input_shape(): x, size=[12, 12], mode='BICUBIC', align_corners=False ) - def test_align_corcers(): + def test_align_corners(): x = paddle.static.data( name="x", shape=[2, 3, 6, 6], dtype="float32" ) @@ -887,7 +887,7 @@ def test_input_shape_1(): self.assertRaises(ValueError, test_mode_type) self.assertRaises(ValueError, test_input_shape) - self.assertRaises(TypeError, test_align_corcers) + self.assertRaises(TypeError, test_align_corners) self.assertRaises(ValueError, test_attr_data_format) self.assertRaises(TypeError, test_actual_shape) self.assertRaises(ValueError, test_scale_value) diff --git a/test/legacy_test/test_checkpoint_saver.py b/test/legacy_test/test_checkpoint_saver.py index 643ea78816579..0390d4f8c60f0 100644 --- a/test/legacy_test/test_checkpoint_saver.py +++ b/test/legacy_test/test_checkpoint_saver.py @@ -18,7 +18,7 @@ from paddle.distributed.fleet.utils.fs import HDFSClient -class CheckpointerSaverTest(unittest.TestCase): +class CheckpointSaverTest(unittest.TestCase): def test(self): fs = HDFSClient("/usr/local/hadoop-2.7.7", None) dir_path = "./checkpointsaver_test" diff --git a/test/legacy_test/test_collective_api_base.py b/test/legacy_test/test_collective_api_base.py index 8f6a382297a1f..f71b524344aec 100644 --- a/test/legacy_test/test_collective_api_base.py +++ b/test/legacy_test/test_collective_api_base.py @@ -611,19 +611,19 @@ def convertbf16(origin): result1 = [] result2 = [] - def is_empyt_list(x): + def is_empty_list(x): if isinstance(x, list) and len(x) == 0: return True return False for i in range(tot_expert): for arr in output1[i]: - if is_empyt_list(arr): + if is_empty_list(arr): continue result1.append(arr) for i in range(tot_expert): for arr in output2[i]: - if is_empyt_list(arr): + if is_empty_list(arr): continue result2.append(arr) diff --git a/test/legacy_test/test_conv2d_api.py b/test/legacy_test/test_conv2d_api.py index 9d2398a5782ca..433dafbcd7fed 100644 --- a/test/legacy_test/test_conv2d_api.py +++ b/test/legacy_test/test_conv2d_api.py @@ -201,7 +201,7 @@ def run_5(): self.assertRaises(ValueError, run_5) - # ValueError: channel dimmention + # ValueError: channel dimension x = paddle.static.data( name="x", shape=[2, 5, 5, -1], diff --git a/test/legacy_test/test_conv3d_layer.py b/test/legacy_test/test_conv3d_layer.py index d514f56c2631a..55c4c569568aa 100644 --- a/test/legacy_test/test_conv3d_layer.py +++ b/test/legacy_test/test_conv3d_layer.py @@ -27,7 +27,7 @@ def __init__( self, methodName='runTest', batch_size=4, - spartial_shape=(8, 8, 8), + spatial_shape=(8, 8, 8), num_channels=6, num_filters=8, filter_size=3, @@ -43,7 +43,7 @@ def __init__( self.batch_size = batch_size self.num_channels = num_channels self.num_filters = num_filters - self.spartial_shape = spartial_shape + self.spatial_shape = spatial_shape self.filter_size = filter_size self.padding = padding @@ -58,13 +58,13 @@ def setUp(self): self.channel_last = self.data_format == "NDHWC" if self.channel_last: input_shape = ( - (self.batch_size,) + self.spartial_shape + (self.num_channels,) + (self.batch_size,) + self.spatial_shape + (self.num_channels,) ) else: input_shape = ( self.batch_size, self.num_channels, - ) + self.spartial_shape + ) + self.spatial_shape self.input = 
np.random.randn(*input_shape).astype(self.dtype) if isinstance(self.filter_size, int): diff --git a/test/legacy_test/test_conv3d_op.py b/test/legacy_test/test_conv3d_op.py index cfa39de922075..cd0d6449020ca 100644 --- a/test/legacy_test/test_conv3d_op.py +++ b/test/legacy_test/test_conv3d_op.py @@ -1212,7 +1212,7 @@ def run_5(): self.assertRaises(ValueError, run_5) - # ValueError: channel dimmention + # ValueError: channel dimension x = paddle.static.data( name="x", shape=[2, 5, 5, 5, -1], diff --git a/test/legacy_test/test_conv3d_transpose_layer.py b/test/legacy_test/test_conv3d_transpose_layer.py index 7624253ba6031..910513f6b4176 100644 --- a/test/legacy_test/test_conv3d_transpose_layer.py +++ b/test/legacy_test/test_conv3d_transpose_layer.py @@ -27,7 +27,7 @@ def __init__( self, methodName='runTest', batch_size=2, - spartial_shape=(8, 8, 8), + spatial_shape=(8, 8, 8), num_channels=6, num_filters=8, filter_size=3, @@ -44,7 +44,7 @@ def __init__( self.batch_size = batch_size self.num_channels = num_channels self.num_filters = num_filters - self.spartial_shape = spartial_shape + self.spatial_shape = spatial_shape self.filter_size = filter_size self.output_size = output_size @@ -60,13 +60,13 @@ def setUp(self): self.channel_last = self.data_format == "NDHWC" if self.channel_last: input_shape = ( - (self.batch_size,) + self.spartial_shape + (self.num_channels,) + (self.batch_size,) + self.spatial_shape + (self.num_channels,) ) else: input_shape = ( self.batch_size, self.num_channels, - ) + self.spartial_shape + ) + self.spatial_shape self.input = np.random.randn(*input_shape).astype(self.dtype) if isinstance(self.filter_size, int): diff --git a/test/legacy_test/test_fused_attention_op_api.py b/test/legacy_test/test_fused_attention_op_api.py index 1570c0b0dd733..e3ea9491e3782 100644 --- a/test/legacy_test/test_fused_attention_op_api.py +++ b/test/legacy_test/test_fused_attention_op_api.py @@ -53,9 +53,9 @@ def layer_norm(x, has_scale, has_bias, weight, bias, epsilon=1e-05): batch_size, src_len, d_model = x.shape x = x.reshape((batch_size * src_len, d_model)) mu = np.mean(x, axis=1, keepdims=True) - sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model + sigma_square = np.sum(np.square(x - mu), axis=1) / d_model x1_up = x - mu - x1_down_1 = sigma_squar + epsilon + x1_down_1 = sigma_square + epsilon x1_down = np.sqrt(x1_down_1) x1_down = x1_down.reshape((x1_down.shape[0], 1)) x1 = x1_up / x1_down diff --git a/test/legacy_test/test_fused_bias_dropout_residual_layer_norm_op_api.py b/test/legacy_test/test_fused_bias_dropout_residual_layer_norm_op_api.py index dae2f2ba61c88..9efa1cd354cb3 100644 --- a/test/legacy_test/test_fused_bias_dropout_residual_layer_norm_op_api.py +++ b/test/legacy_test/test_fused_bias_dropout_residual_layer_norm_op_api.py @@ -27,9 +27,9 @@ def layer_norm(x, has_scale, has_bias, weight, bias, epsilon=1e-05): batch_size, src_len, d_model = x.shape x = x.reshape((batch_size * src_len, d_model)) mu = np.mean(x, axis=1, keepdims=True) - sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model + sigma_square = np.sum(np.square(x - mu), axis=1) / d_model x1_up = x - mu - x1_down_1 = sigma_squar + epsilon + x1_down_1 = sigma_square + epsilon x1_down = np.sqrt(x1_down_1) x1_down = x1_down.reshape((x1_down.shape[0], 1)) x1 = x1_up / x1_down diff --git a/test/legacy_test/test_log_softmax.py b/test/legacy_test/test_log_softmax.py index 64be97bb1aba1..494ba127032e4 100644 --- a/test/legacy_test/test_log_softmax.py +++ b/test/legacy_test/test_log_softmax.py @@ -194,7 +194,7 @@ def 
check_api(self, axis=-1): out = exe.run(feed={'x': self.x}, fetch_list=[y]) np.testing.assert_allclose(out[0], ref_out, rtol=1e-05) - # test dygrapg api + # test dygraph api paddle.disable_static() x = paddle.to_tensor(self.x) y = logsoftmax(x) diff --git a/test/legacy_test/test_paddle_save_load_binary.py b/test/legacy_test/test_paddle_save_load_binary.py index df7304cf1d19e..22b62e082cc94 100644 --- a/test/legacy_test/test_paddle_save_load_binary.py +++ b/test/legacy_test/test_paddle_save_load_binary.py @@ -94,7 +94,7 @@ def test_replace_save_load_vars(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t # test for replace_save_vars/io.load_vars diff --git a/test/legacy_test/test_scatter_nd_op.py b/test/legacy_test/test_scatter_nd_op.py index 4936ccdb8989f..531b865a7c50b 100644 --- a/test/legacy_test/test_scatter_nd_op.py +++ b/test/legacy_test/test_scatter_nd_op.py @@ -29,16 +29,16 @@ def numpy_scatter_nd(ref, index, updates, fun): index_shape = index.shape end_size = index_shape[-1] - remain_numl = 1 + remain_numel = 1 for i in range(len(index_shape) - 1): - remain_numl *= index_shape[i] + remain_numel *= index_shape[i] slice_size = 1 for i in range(end_size, len(ref_shape)): slice_size *= ref_shape[i] - flat_index = index.reshape([remain_numl] + list(index_shape[-1:])) - flat_updates = updates.reshape((remain_numl, slice_size)) + flat_index = index.reshape([remain_numel] + list(index_shape[-1:])) + flat_updates = updates.reshape((remain_numel, slice_size)) flat_output = ref.reshape(list(ref_shape[:end_size]) + [slice_size]) for i_up, i_out in enumerate(flat_index): diff --git a/test/legacy_test/test_set_value_op.py b/test/legacy_test/test_set_value_op.py index c289185e58d21..4113805c663b4 100644 --- a/test/legacy_test/test_set_value_op.py +++ b/test/legacy_test/test_set_value_op.py @@ -265,7 +265,7 @@ def _get_answer(self): # 1.2.3 step < 0 -class TestSetValueItemSliceNegetiveStep(TestSetValueApi): +class TestSetValueItemSliceNegativeStep(TestSetValueApi): def set_shape(self): self.shape = [5, 2] @@ -283,7 +283,7 @@ def _get_answer(self): self.data[5:2:-1] = self.value -class TestSetValueItemSliceNegetiveStep2(TestSetValueApi): +class TestSetValueItemSliceNegativeStep2(TestSetValueApi): def set_shape(self): self.shape = [5] @@ -301,7 +301,7 @@ def _get_answer(self): self.data[1::-1] = self.value -class TestSetValueItemSliceNegetiveStep3(TestSetValueApi): +class TestSetValueItemSliceNegativeStep3(TestSetValueApi): def set_shape(self): self.shape = [3] @@ -319,7 +319,7 @@ def _get_answer(self): self.data[::-1] = self.value -class TestSetValueItemSliceNegetiveStep4(TestSetValueApi): +class TestSetValueItemSliceNegativeStep4(TestSetValueApi): def set_shape(self): self.shape = [3, 4, 5] @@ -1504,14 +1504,14 @@ def set_value(t, value): np.testing.assert_array_equal( inps.grad.numpy(), input_grad, - err_msg='The gradient of value should be \n{},\n but reveived {}'.format( + err_msg='The gradient of value should be \n{},\n but received {}'.format( input_grad, inps.grad.numpy() ), ) np.testing.assert_array_equal( value.grad.numpy(), value_grad, - err_msg='The gradient of input should be \n{},\n but reveived {}'.format( + err_msg='The gradient of input should be \n{},\n but received {}'.format( value_grad, value.grad.numpy() ), ) @@ -1538,14 +1538,14 @@ def set_value(t, value): 
np.testing.assert_array_equal( inps2.grad.numpy(), input_grad2, - err_msg='The gradient of value should be \n{},\n but reveived {}'.format( + err_msg='The gradient of value should be \n{},\n but received {}'.format( input_grad, inps2.grad.numpy() ), ) np.testing.assert_array_equal( value2.grad.numpy(), value_grad2, - err_msg='The gradient of input should be \n{},\n but reveived {}'.format( + err_msg='The gradient of input should be \n{},\n but received {}'.format( value_grad, value2.grad.numpy() ), ) @@ -1592,14 +1592,14 @@ def set_value3(t, value): np.testing.assert_array_equal( inps.grad.numpy(), input_grad, - err_msg='The gradient of value should be \n{},\n but reveived {}'.format( + err_msg='The gradient of value should be \n{},\n but received {}'.format( input_grad, inps.grad.numpy() ), ) np.testing.assert_array_equal( value.grad.numpy(), value_grad, - err_msg='The gradient of input should be \n{},\n but reveived {}'.format( + err_msg='The gradient of input should be \n{},\n but received {}'.format( value_grad, value.grad.numpy() ), ) @@ -1640,14 +1640,14 @@ def set_value4(t, value): np.testing.assert_array_equal( inps.grad.numpy(), input_grad, - err_msg='The gradient of value should be \n{},\n but reveived {}'.format( + err_msg='The gradient of value should be \n{},\n but received {}'.format( input_grad, inps.grad.numpy() ), ) np.testing.assert_array_equal( value.grad.numpy(), value_grad, - err_msg='The gradient of input should be \n{},\n but reveived {}'.format( + err_msg='The gradient of input should be \n{},\n but received {}'.format( value_grad, value.grad.numpy() ), ) @@ -1692,14 +1692,14 @@ def set_value5(t, value): np.testing.assert_array_equal( inps.grad.numpy(), input_grad, - err_msg='The gradient of value should be \n{},\n but reveived {}'.format( + err_msg='The gradient of value should be \n{},\n but received {}'.format( input_grad, inps.grad.numpy() ), ) np.testing.assert_array_equal( value.grad.numpy(), value_grad, - err_msg='The gradient of input should be \n{},\n but reveived {}'.format( + err_msg='The gradient of input should be \n{},\n but received {}'.format( value_grad, value.grad.numpy() ), ) diff --git a/test/legacy_test/test_sgd_op.py b/test/legacy_test/test_sgd_op.py index d71b297185892..ba7dbb99d1b87 100644 --- a/test/legacy_test/test_sgd_op.py +++ b/test/legacy_test/test_sgd_op.py @@ -85,7 +85,7 @@ def check_with_place(self, place): param_array = np.full((height, self.row_numel), 5.0).astype("float32") param.set(param_array, place) - # create and initialize LeraningRate Variable + # create and initialize LearningRate Variable lr = scope.var('LearningRate').get_tensor() lr_array = np.full((1), 2.0).astype("float32") lr.set(lr_array, place) @@ -170,7 +170,7 @@ def check_with_place(self, place): w_before_optimize = np.array(w_tensor) - # create and initialize LeraningRate Variable + # create and initialize LearningRate Variable lr_value = 0.1 lr = scope.var('LearningRate').get_tensor() lr_array = np.full((1), lr_value).astype("float32") diff --git a/test/legacy_test/test_sgd_op_bf16.py b/test/legacy_test/test_sgd_op_bf16.py index 3baf0a490cbf5..3af7d9c6bc93a 100644 --- a/test/legacy_test/test_sgd_op_bf16.py +++ b/test/legacy_test/test_sgd_op_bf16.py @@ -286,7 +286,7 @@ def _reference(self, data, emb_weight, bf16=False): out_dtype = np.uint16 if bf16 else np.float32 lookup_table_grad = np.zeros(self.w_shape, dtype=out_dtype) - # indexes may dupplicate + # indexes may duplicate if bf16: for i, idx in enumerate(data): idxv = idx[0] diff --git 
a/test/legacy_test/test_signal.py b/test/legacy_test/test_signal.py index 1d86f15f51095..8d70702a26ad6 100644 --- a/test/legacy_test/test_signal.py +++ b/test/legacy_test/test_signal.py @@ -574,7 +574,7 @@ def decorate(cls): def setUpModule(): global rtol global atol - # All test case will use float64 for compare percision, refs: + # All test case will use float64 for compare precision, refs: # https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64 rtol = { 'float32': 1e-06, diff --git a/test/legacy_test/test_slice_scatter.py b/test/legacy_test/test_slice_scatter.py index 075b5a5741886..bb46ddffb9b94 100644 --- a/test/legacy_test/test_slice_scatter.py +++ b/test/legacy_test/test_slice_scatter.py @@ -264,7 +264,7 @@ def init_dtype(self): self.dtype = 'float32' -class TestSliceScatterApiBroadcase2D(TestSliceScatterApi): +class TestSliceScatterApiBroadcast2D(TestSliceScatterApi): def init_shape(self): self.x_shape = [8, 9] self.value_shape = [8, 1] @@ -274,12 +274,12 @@ def init_shape(self): self.strides = [2] -class TestSliceScatterApiBroadcase2DFloat32(TestSliceScatterApiBroadcase2D): +class TestSliceScatterApiBroadcast2DFloat32(TestSliceScatterApiBroadcast2D): def init_dtype(self): self.dtype = 'float32' -class TestSliceScatterApiBroadcase3D(TestSliceScatterApi): +class TestSliceScatterApiBroadcast3D(TestSliceScatterApi): def init_shape(self): self.x_shape = [8, 9, 6] self.value_shape = [1, 9, 1] @@ -289,7 +289,7 @@ def init_shape(self): self.strides = [3, 2] -class TestSliceScatterApiBroadcase3DFloat32(TestSliceScatterApiBroadcase3D): +class TestSliceScatterApiBroadcast3DFloat32(TestSliceScatterApiBroadcast3D): def init_dtype(self): self.dtype = 'float32' diff --git a/test/legacy_test/test_softmax_op.py b/test/legacy_test/test_softmax_op.py index 6b9952e569ae5..1876424cf4d4b 100644 --- a/test/legacy_test/test_softmax_op.py +++ b/test/legacy_test/test_softmax_op.py @@ -59,7 +59,7 @@ def setUp(self): self.public_python_api = F.softmax self.use_cudnn = False self.use_mkldnn = False - # explicilty use float32 for ROCm, as MIOpen does not yet support float64 + # explicitly use float32 for ROCm, as MIOpen does not yet support float64 self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.init_kernel_type() self.shape = self.get_x_shape() @@ -142,7 +142,7 @@ def setUp(self): self.public_python_api = F.softmax self.use_cudnn = False self.use_mkldnn = False - # explicilty use float32 for ROCm, as MIOpen does not yet support float64 + # explicitly use float32 for ROCm, as MIOpen does not yet support float64 self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.init_kernel_type() @@ -190,7 +190,7 @@ def setUp(self): self.prim_op_type = "comp" self.use_cudnn = True self.use_mkldnn = False - # explicilty use float32 for ROCm, as MIOpen does not yet support float64 + # explicitly use float32 for ROCm, as MIOpen does not yet support float64 self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 np.random.seed(0) diff --git a/test/legacy_test/test_softmax_with_cross_entropy_op.py b/test/legacy_test/test_softmax_with_cross_entropy_op.py index 62f475cd922a7..8bafae13efc70 100644 --- a/test/legacy_test/test_softmax_with_cross_entropy_op.py +++ b/test/legacy_test/test_softmax_with_cross_entropy_op.py @@ -99,7 +99,7 @@ def initParams(self): self.python_out_sig = ["Loss", "Softmax"] self.numeric_stable_mode = False self.soft_label = False - # explicilty use float32 for ROCm, as MIOpen does not yet support float64 + # explicitly 
use float32 for ROCm, as MIOpen does not yet support float64 self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.axis = -1 self.ignore_index = -1 diff --git a/test/legacy_test/test_sparse_attention_op.py b/test/legacy_test/test_sparse_attention_op.py index 48946522864aa..24272c5e416b0 100644 --- a/test/legacy_test/test_sparse_attention_op.py +++ b/test/legacy_test/test_sparse_attention_op.py @@ -438,7 +438,7 @@ def test_dygraph(self): paddle_key = paddle.to_tensor(key, place=self.place) paddle_value = paddle.to_tensor(value, place=self.place) paddle_offset = paddle.to_tensor(offset, place=self.place) - paddle_colunmns = paddle.to_tensor(columns, place=self.place) + paddle_columns = paddle.to_tensor(columns, place=self.place) paddle_kp_mask = paddle.to_tensor(key_padding_mask, place=self.place) paddle_attn_mask = paddle.to_tensor(attn_mask, place=self.place) @@ -448,7 +448,7 @@ def test_dygraph(self): paddle_key, paddle_value, paddle_offset, - paddle_colunmns, + paddle_columns, key_padding_mask=paddle_kp_mask, attn_mask=paddle_attn_mask, ) @@ -469,7 +469,7 @@ def test_dygraph(self): paddle_key, paddle_value, paddle_offset, - paddle_colunmns, + paddle_columns, ) numpy_result, __, __ = ref_batch_sparse_attention( diff --git a/test/legacy_test/test_split_op.py b/test/legacy_test/test_split_op.py index 9311a5f2d9957..0f780b2ddfff6 100644 --- a/test/legacy_test/test_split_op.py +++ b/test/legacy_test/test_split_op.py @@ -567,9 +567,11 @@ def test_out1(self): eager_x2_out = x2.numpy() loss = x0.sum() loss.backward() - manul_grad = np.zeros_like(input_1) - manul_grad[:, :2, :] = 1 - np.testing.assert_allclose(input.gradient(), manul_grad, rtol=1e-05) + manual_grad = np.zeros_like(input_1) + manual_grad[:, :2, :] = 1 + np.testing.assert_allclose( + input.gradient(), manual_grad, rtol=1e-05 + ) np.testing.assert_allclose(ex_x0, eager_x0_out, rtol=1e-05) np.testing.assert_allclose(ex_x1, eager_x1_out, rtol=1e-05) np.testing.assert_allclose(ex_x2, eager_x2_out, rtol=1e-05) @@ -597,9 +599,11 @@ def test_out2(self): eager_x2_out = x2.numpy() loss = x0.sum() loss.backward() - manul_grad = np.zeros_like(input_1) - manul_grad[:, :2, :] = 1 - np.testing.assert_allclose(input.gradient(), manul_grad, rtol=1e-05) + manual_grad = np.zeros_like(input_1) + manual_grad[:, :2, :] = 1 + np.testing.assert_allclose( + input.gradient(), manual_grad, rtol=1e-05 + ) np.testing.assert_allclose(ex_x0, eager_x0_out, rtol=1e-05) np.testing.assert_allclose(ex_x1, eager_x1_out, rtol=1e-05) np.testing.assert_allclose(ex_x2, eager_x2_out, rtol=1e-05) @@ -630,9 +634,11 @@ def test_out1(self): eager_x2_out = x2.numpy() loss = x0.sum() loss.backward() - manul_grad = np.zeros_like(input_1) - manul_grad[:, :2, :] = 1 - np.testing.assert_allclose(input.gradient(), manul_grad, rtol=1e-05) + manual_grad = np.zeros_like(input_1) + manual_grad[:, :2, :] = 1 + np.testing.assert_allclose( + input.gradient(), manual_grad, rtol=1e-05 + ) np.testing.assert_allclose(ex_x0, eager_x0_out, rtol=1e-05) np.testing.assert_allclose(ex_x1, eager_x1_out, rtol=1e-05) np.testing.assert_allclose(ex_x2, eager_x2_out, rtol=1e-05) diff --git a/test/legacy_test/test_static_pylayer.py b/test/legacy_test/test_static_pylayer.py index fd5075a4904aa..34a5afe577a67 100644 --- a/test/legacy_test/test_static_pylayer.py +++ b/test/legacy_test/test_static_pylayer.py @@ -133,7 +133,7 @@ def backward_fn(dy): self.assertEqual(x_grad.shape, ()) @test_with_pir_api - def test_return_var_typle(self): + def test_return_var_type(self): def forward_fn(a, 
b): return 3 * a, -2 * b diff --git a/test/legacy_test/test_static_save_load.py b/test/legacy_test/test_static_save_load.py index ca1adaed4ef0a..f662ee3f95e69 100644 --- a/test/legacy_test/test_static_save_load.py +++ b/test/legacy_test/test_static_save_load.py @@ -340,7 +340,7 @@ def test_ptb_rnn_cpu_float32(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -357,7 +357,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) paddle.static.load( @@ -480,7 +480,7 @@ def test_ptb_rnn_cpu_float32(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -497,7 +497,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) paddle.static.load( @@ -613,7 +613,7 @@ def test_ptb_rnn_cpu_float32(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -630,7 +630,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) paddle.static.load( @@ -751,7 +751,7 @@ def test_ptb_rnn_cpu_float32(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -768,7 +768,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) # base.load(test_program, "./test_1", None ) @@ -807,7 +807,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) paddle.static.set_program_state(test_program, program_state_1) @@ -829,7 +829,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to 
zero self.assertTrue(np.sum(np.abs(new_t)) == 0) paddle.static.set_program_state(test_program, program_state_2) @@ -851,7 +851,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) paddle.static.set_program_state(test_program, program_state_3) @@ -954,7 +954,7 @@ def set_var(var, ndarray): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update base_map[var.name] = t for var in program.list_vars(): @@ -1073,7 +1073,7 @@ def test_load_from_old_interface(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -1091,7 +1091,7 @@ def test_load_from_old_interface(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) paddle.static.load( @@ -1214,7 +1214,7 @@ def test_load_from_old_interface_var_list(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -1237,7 +1237,7 @@ def test_load_from_old_interface_var_list(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) paddle.static.load( @@ -1355,7 +1355,7 @@ def test_load_from_old_interface(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t save_dir = os.path.join(temp_dir.name, "test_path") @@ -1373,7 +1373,7 @@ def test_load_from_old_interface(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) file_model_path = os.path.join(save_dir, "model_single") @@ -1556,7 +1556,7 @@ def test_ptb_rnn_cpu_float32(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t save_dir = os.path.join(self.temp_dir.name, "test_program_1") @@ -1571,7 +1571,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to 
zero self.assertTrue(np.sum(np.abs(new_t)) == 0) # case 1: load basic @@ -1731,7 +1731,7 @@ def test_ptb_rnn_cpu_float32(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t @@ -1749,7 +1749,7 @@ def test_ptb_rnn_cpu_float32(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) # base.load(test_program, "./test_1", None ) @@ -1816,7 +1816,7 @@ def test_pickle_protocol(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t diff --git a/test/legacy_test/test_static_save_load_bf16.py b/test/legacy_test/test_static_save_load_bf16.py index 1ca1bec7d15e7..bc91f34b3f60c 100644 --- a/test/legacy_test/test_static_save_load_bf16.py +++ b/test/legacy_test/test_static_save_load_bf16.py @@ -127,7 +127,7 @@ def test_ptb_rnn_cpu_bfloat16(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t save_dir = os.path.join(self.temp_dir.name, "test_1") @@ -142,7 +142,7 @@ def test_ptb_rnn_cpu_bfloat16(self): new_t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been set to zero + # make sure all the parameter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) paddle.static.load( diff --git a/test/legacy_test/test_static_save_load_large.py b/test/legacy_test/test_static_save_load_large.py index 2011dd45cfaf8..d0ef10ae3a9ab 100644 --- a/test/legacy_test/test_static_save_load_large.py +++ b/test/legacy_test/test_static_save_load_large.py @@ -49,7 +49,7 @@ def test_large_parameters_static_save(self): t = np.array( base.global_scope().find_var(var.name).get_tensor() ) - # make sure all the paramerter or optimizer var have been update + # make sure all the parameter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t temp_dir = tempfile.TemporaryDirectory() diff --git a/test/legacy_test/test_static_shape_inferrence_for_shape_tensor.py b/test/legacy_test/test_static_shape_inferrence_for_shape_tensor.py index a38fd88881937..eaa910637bbd2 100644 --- a/test/legacy_test/test_static_shape_inferrence_for_shape_tensor.py +++ b/test/legacy_test/test_static_shape_inferrence_for_shape_tensor.py @@ -17,7 +17,7 @@ import paddle -class StaticShapeInferrenceTest(unittest.TestCase): +class StaticShapeInferenceTest(unittest.TestCase): def test_static_graph(self): paddle.enable_static() data = paddle.static.data(name="x", shape=[-1, 2], dtype='float32') diff --git a/test/legacy_test/test_sum_op.py b/test/legacy_test/test_sum_op.py index cd2f9a4f6cdec..a2570a566c348 100644 --- a/test/legacy_test/test_sum_op.py +++ b/test/legacy_test/test_sum_op.py @@ -86,16 +86,16 @@ def setUp(self): self.init_kernel_type() def check_with_place(self, place, 
inplace): - self.check_input_and_optput( + self.check_input_and_output( core.Scope(), place, inplace, True, True, True ) - self.check_input_and_optput( + self.check_input_and_output( core.Scope(), place, inplace, False, True, True ) - self.check_input_and_optput( + self.check_input_and_output( core.Scope(), place, inplace, False, False, True ) - self.check_input_and_optput( + self.check_input_and_output( core.Scope(), place, inplace, False, False, False ) @@ -108,7 +108,7 @@ def _get_array(self, rows, row_numel): array[i] *= rows[i] return array - def check_input_and_optput( + def check_input_and_output( self, scope, place, @@ -198,7 +198,7 @@ def _get_array(self, rows, row_numel): else: return np.ndarray((0, row_numel), dtype=self.dtype) - def check_input_and_optput( + def check_input_and_output( self, scope, place, diff --git a/test/legacy_test/test_svd_op.py b/test/legacy_test/test_svd_op.py index a74404e408524..6b41fa7cead8d 100644 --- a/test/legacy_test/test_svd_op.py +++ b/test/legacy_test/test_svd_op.py @@ -90,7 +90,7 @@ def test_check_grad(self): class TestSvdCheckGrad2(TestSvdOp): - # NOTE(xiongkun03): because we want to construct some full rank matrics, + # NOTE(xiongkun03): because we want to construct some full rank matrices, # so we can't specifize matrices which numel() > 100 no_need_check_grad = True diff --git a/test/legacy_test/test_sync_batch_norm_op.py b/test/legacy_test/test_sync_batch_norm_op.py index 17daa24996b4f..3b7ea63cb5963 100644 --- a/test/legacy_test/test_sync_batch_norm_op.py +++ b/test/legacy_test/test_sync_batch_norm_op.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -test for sync bachnorm op. +test for sync batchnorm op. for both FP64 and FP16 input. 
""" diff --git a/test/legacy_test/test_tensor.py b/test/legacy_test/test_tensor.py index 8c007d4675390..9207d3a181789 100644 --- a/test/legacy_test/test_tensor.py +++ b/test/legacy_test/test_tensor.py @@ -339,7 +339,7 @@ def test_print_tensor(self): print(tensor) self.assertTrue(isinstance(str(tensor), str)) - def test_tensor_poiter(self): + def test_tensor_pointer(self): place = core.CPUPlace() scope = core.Scope() var = scope.var("test_tensor") diff --git a/test/legacy_test/test_tensor_register_hook.py b/test/legacy_test/test_tensor_register_hook.py index 29c614713dff3..c7826c983adcd 100644 --- a/test/legacy_test/test_tensor_register_hook.py +++ b/test/legacy_test/test_tensor_register_hook.py @@ -589,7 +589,7 @@ def test_register_backward_hook_for_var_without_gradient(self): x._register_backward_hook(global_void_hook) -class TestRegsiterBackwardFinalHook(unittest.TestCase): +class TestRegisterBackwardFinalHook(unittest.TestCase): def setUp(self): self.devices = ["cpu"] if paddle.is_compiled_with_cuda(): diff --git a/test/legacy_test/test_tensor_uva.py b/test/legacy_test/test_tensor_uva.py index 34d7e59609e0b..e7b6d03fe8bd9 100644 --- a/test/legacy_test/test_tensor_uva.py +++ b/test/legacy_test/test_tensor_uva.py @@ -53,7 +53,7 @@ def test_uva_tensor_creation(self): np.testing.assert_allclose(tensor.numpy(), data, rtol=1e-05) np.testing.assert_allclose(tensor2.numpy(), data, rtol=1e-05) - def test_uva_tensor_corectness(self): + def test_uva_tensor_correctness(self): if paddle.base.core.is_compiled_with_cuda(): a = np.arange(0, 100, dtype="int32") a = a.reshape([10, 10]) diff --git a/test/legacy_test/test_traced_layer_err_msg.py b/test/legacy_test/test_traced_layer_err_msg.py index 16880dafdcbf7..4927fdea82a54 100644 --- a/test/legacy_test/test_traced_layer_err_msg.py +++ b/test/legacy_test/test_traced_layer_err_msg.py @@ -241,7 +241,7 @@ def test_linear_net_with_none(self): class TestTracedLayerSaveInferenceModel(unittest.TestCase): - """test save_inference_model will automaticlly create non-exist dir""" + """test save_inference_model will automatically create non-exist dir""" def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() diff --git a/test/legacy_test/test_trans_layout_op.py b/test/legacy_test/test_trans_layout_op.py index da59301aacfc3..b936abc95df95 100644 --- a/test/legacy_test/test_trans_layout_op.py +++ b/test/legacy_test/test_trans_layout_op.py @@ -45,17 +45,17 @@ def test_check_output(self): class LayoutAutoTune(unittest.TestCase): def test_config(self): paddle.base.core.enable_layout_autotune() - if self.use_autoune(): + if self.use_autotune(): self.assertEqual(paddle.base.core.use_layout_autotune(), True) paddle.base.core.disable_layout_autotune() self.assertEqual(paddle.base.core.use_layout_autotune(), False) - self.use_autoune() + self.use_autotune() def setUp(self): paddle.disable_static() - self.use_autoune() + self.use_autotune() - def use_autoune(self): + def use_autotune(self): if paddle.is_compiled_with_cuda(): paddle.incubate.autotune.set_config( config={"layout": {"enable": True}} diff --git a/test/legacy_test/test_transformer_api.py b/test/legacy_test/test_transformer_api.py index 5945ac4a7d5d2..fd55abde22093 100644 --- a/test/legacy_test/test_transformer_api.py +++ b/test/legacy_test/test_transformer_api.py @@ -252,9 +252,9 @@ def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): batch_size, src_len, d_model = x.shape x = x.reshape((batch_size * src_len, d_model)) mu = np.mean(x, axis=1, keepdims=True) - sigma_squar = np.sum(np.square(x 
- mu), axis=1) / d_model + sigma_square = np.sum(np.square(x - mu), axis=1) / d_model x1_up = x - mu - x1_down_1 = sigma_squar + epsilon + x1_down_1 = sigma_square + epsilon x1_down = np.sqrt(x1_down_1) x1_down = x1_down.reshape((x1_down.shape[0], 1)) x1 = x1_up / x1_down diff --git a/test/legacy_test/test_tril_triu_op.py b/test/legacy_test/test_tril_triu_op.py index 36a2ddb0383a7..391b7f83f9dc3 100644 --- a/test/legacy_test/test_tril_triu_op.py +++ b/test/legacy_test/test_tril_triu_op.py @@ -116,7 +116,7 @@ def test_check_grad_normal(self): def case_generator(op_type, Xshape, diagonal, expected, dtype): """ Generate testcases with the params shape of X, diagonal and op_type. - If arg`expercted` is 'success', it will register an Optest case and expect to pass. + If arg `expected` is 'success', it will register an OpTest case and expect to pass. Otherwise, it will register an API case and check the expect failure. """ cls_name = ( diff --git a/test/legacy_test/test_truncated_gaussian_random_op.py b/test/legacy_test/test_truncated_gaussian_random_op.py index eb8b502b082d4..0572d0da6face 100644 --- a/test/legacy_test/test_truncated_gaussian_random_op.py +++ b/test/legacy_test/test_truncated_gaussian_random_op.py @@ -22,7 +22,7 @@ from paddle.base.executor import Executor -class TestTrunctedGaussianRandomOp(unittest.TestCase): +class TestTruncatedGaussianRandomOp(unittest.TestCase): def setUp(self): self.op_type = "truncated_gaussian_random" self.inputs = {} diff --git a/test/legacy_test/test_vision_models.py b/test/legacy_test/test_vision_models.py index b53f110030549..150ae03c58fe2 100644 --- a/test/legacy_test/test_vision_models.py +++ b/test/legacy_test/test_vision_models.py @@ -20,7 +20,7 @@ from paddle.vision import models -class TestVisonModels(unittest.TestCase): +class TestVisionModels(unittest.TestCase): def models_infer(self, arch, pretrained=False, batch_norm=False): x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) if batch_norm: diff --git a/test/legacy_test/test_viterbi_decode_op.py b/test/legacy_test/test_viterbi_decode_op.py index 91f79565a2caa..fd5ff4b61f789 100644 --- a/test/legacy_test/test_viterbi_decode_op.py +++ b/test/legacy_test/test_viterbi_decode_op.py @@ -31,7 +31,7 @@ def __call__(self, inputs, length): bs, seq_len, n_label = inputs.shape inputs_t = np.transpose(inputs, (1, 0, 2)) trans_exp = np.expand_dims(self.transitions, axis=0) - historys = [] + histories = [] left_length = np.array(length) max_seq_len = np.amax(left_length) left_length = np.expand_dims(left_length, 1) @@ -49,7 +49,7 @@ def __call__(self, inputs, length): alpha_exp = np.expand_dims(alpha, 2) alpha_trn_sum = alpha_exp + trans_exp max_res = np.amax(alpha_trn_sum, 1), np.argmax(alpha_trn_sum, 1) - historys = historys + [max_res[1]] if i >= 1 else [] + histories = histories + [max_res[1]] if i >= 1 else [] alpha_nxt = max_res[0] + logit mask = left_length > 0 alpha = mask * alpha_nxt + (1 - mask) * alpha @@ -61,7 +61,7 @@ def __call__(self, inputs, length): last_ids_update = last_ids * (left_length >= 0) batch_path = [last_ids_update] batch_offset = np.arange(bs) * n_label - for hist in reversed(historys): + for hist in reversed(histories): left_length = left_length + 1 gather_idx = batch_offset + last_ids last_ids_update = np.take(hist, gather_idx) * (left_length > 0) diff --git a/test/legacy_test/test_warpctc_op.py b/test/legacy_test/test_warpctc_op.py index e747f381af608..9355eeec21ad5 100644 --- a/test/legacy_test/test_warpctc_op.py +++ b/test/legacy_test/test_warpctc_op.py @@ 
-110,7 +110,7 @@ def forward_a_sequence(self, softmax_a_sequence, labels_a_sequence): required_times = labels_a_sequence.shape[0] old_label = -1 for i in range(labels_a_sequence.shape[0]): - # two contingous labels with the same value + # two contiguous labels with the same value if labels_a_sequence[i, 0] == old_label: required_times = required_times + 1 old_label = labels_a_sequence[i, 0] diff --git a/test/legacy_test/test_where_op.py b/test/legacy_test/test_where_op.py index 16c6020d54650..d88b1b3b3a5a7 100644 --- a/test/legacy_test/test_where_op.py +++ b/test/legacy_test/test_where_op.py @@ -319,7 +319,7 @@ def __test_where_with_broadcast_static(self, cond_shape, x_shape, y_shape): np.testing.assert_array_equal(out[0], expect) def __test_where_with_type_promotion( - self, x_dtype, y_dtype, expeced_dtype=None + self, x_dtype, y_dtype, expected_dtype=None ): paddle.enable_static() main_program = paddle.static.Program() @@ -367,7 +367,7 @@ def __test_where_with_type_promotion( ) expect = np.where(cond_data, x_data_convert, y_data_convert) np.testing.assert_array_equal(out[0], expect) - self.assertEqual(out[0].dtype.__str__(), expeced_dtype) + self.assertEqual(out[0].dtype.__str__(), expected_dtype) else: expect = np.where(cond_data, x_data, y_data) np.testing.assert_array_equal(out[0], expect) diff --git a/test/legacy_test/test_while_loop_op.py b/test/legacy_test/test_while_loop_op.py index ec63375043e05..3b8123d48df4c 100644 --- a/test/legacy_test/test_while_loop_op.py +++ b/test/legacy_test/test_while_loop_op.py @@ -684,7 +684,7 @@ def type_error_cond_returns_not_variable(): self.assertRaises(TypeError, type_error_cond_returns_not_variable) - # The type of `cond` returns in Op(while_loop) must be a bollean variable + # The type of `cond` returns in Op(while_loop) must be a boolean variable def type_error_cond_returns_not_boolean(): out = paddle.static.nn.while_loop( cond_returns_not_bool_tensor, body, [data_1d] diff --git a/test/legacy_test/test_while_op.py b/test/legacy_test/test_while_op.py index d763576f9ff3a..69dc4e1b8c070 100644 --- a/test/legacy_test/test_while_op.py +++ b/test/legacy_test/test_while_op.py @@ -235,7 +235,7 @@ def body(i, s, x): x = paddle.static.data(shape=[-1], name='x', dtype='float32') func(x) - # NOTE(winter-wang): The while_op in pir mode doesn't need following constrait, so hre only check when in non-pir mode. + # NOTE(winter-wang): The while_op in pir mode doesn't need following constraint, so here only check when in non-pir mode. 
if not in_pir_mode(): for op in main_program.block(0).ops: if op.type == "while": diff --git a/test/legacy_test/test_zeros_like_op.py b/test/legacy_test/test_zeros_like_op.py index 538556cd4f1fc..4768a2506f249 100644 --- a/test/legacy_test/test_zeros_like_op.py +++ b/test/legacy_test/test_zeros_like_op.py @@ -59,7 +59,7 @@ def test_api(self): self.assertEqual((outs[i] == np.zeros(shape, dtype)).all(), True) -class TestZerosLikeImpeartive(unittest.TestCase): +class TestZerosLikeImperative(unittest.TestCase): def test_out(self): shape = [3, 4] place = ( diff --git a/test/mkldnn/test_elementwise_add_mkldnn_op.py b/test/mkldnn/test_elementwise_add_mkldnn_op.py index 9d62f1cf55e97..8b9dded0129bd 100644 --- a/test/mkldnn/test_elementwise_add_mkldnn_op.py +++ b/test/mkldnn/test_elementwise_add_mkldnn_op.py @@ -102,21 +102,21 @@ def test_check_grad_ignore_x(self): pass -class TestOneDNNlementwiseAddOpZeroDim(TestOneDNNElementwiseAddOp): +class TestOneDNNElementwiseAddOpZeroDim(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.random.random((100,)).astype(self.dtype) self.y = np.array(3.0).astype(self.dtype) self.out = np.add(self.x, self.y) -class TestOneDNNlementwiseAddOpZeroDim2(TestOneDNNElementwiseAddOp): +class TestOneDNNElementwiseAddOpZeroDim2(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.array(3.0).astype(self.dtype) self.y = np.random.random((100,)).astype(self.dtype) self.out = np.add(self.x, self.y) -class TestOneDNNlementwiseAddOpZeroDim3(TestOneDNNElementwiseAddOp): +class TestOneDNNElementwiseAddOpZeroDim3(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.array(3.0).astype(self.dtype) self.y = np.array(3.0).astype(self.dtype) @@ -127,7 +127,7 @@ def init_input_output(self): @skip_check_grad_ci( - reason="oneDNN's int8 elementwise_ops don't implemend grad kernel." + reason="oneDNN's int8 elementwise_ops don't implement grad kernel." 
) class TestInt8(TestElementwiseAddOp): def init_kernel_type(self): diff --git a/test/mkldnn/test_fused_vit_attention.py b/test/mkldnn/test_fused_vit_attention.py index 8c4876e4281b2..b980f8bff912a 100644 --- a/test/mkldnn/test_fused_vit_attention.py +++ b/test/mkldnn/test_fused_vit_attention.py @@ -23,7 +23,7 @@ np.random.seed(0) -def test_fuse_resenet_unit(): +def test_fuse_resnet_unit(): tests = [[1, 4096, 768, 12], [10, 4097, 756, 12], [10, 4097, 756, 12]] for test in tests: batch_size = test[0] @@ -74,4 +74,4 @@ def test_fuse_resenet_unit(): if __name__ == '__main__': - test_fuse_resenet_unit() + test_fuse_resnet_unit() diff --git a/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py b/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py index ae44798dce4eb..fd9925df082e0 100644 --- a/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py +++ b/test/mkldnn/test_fusion_gru_bf16_mkldnn_op.py @@ -59,7 +59,7 @@ def setUp(self): N = len(self.lod[0]) # fp32 X input for reference implementation and - # corressponding bf16 data as input to GRU oneDNN bf16 kernel + # corresponding bf16 data as input to GRU oneDNN bf16 kernel x_fp32 = np.random.rand(T, self.M).astype('float32') x_bf16 = convert_float_to_uint16(x_fp32) From 7c9fcfa248c43c5caeb23ef3d752870b220bb90e Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 23 Feb 2024 14:05:44 +0800 Subject: [PATCH 19/82] [CINN Unittest] Add unittest for complex symbol shape (#61976) * add unittest for complex symbol shape * refine test --- .../symbolic/test_complex_symbol_subgraph.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 test/ir/pir/cinn/symbolic/test_complex_symbol_subgraph.py diff --git a/test/ir/pir/cinn/symbolic/test_complex_symbol_subgraph.py b/test/ir/pir/cinn/symbolic/test_complex_symbol_subgraph.py new file mode 100644 index 0000000000000..04e0600b82654 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_complex_symbol_subgraph.py @@ -0,0 +1,83 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class ComplexSymbolSubgraph(nn.Layer): + def __init__(self): + super().__init__() + self.hidden_size = 768 + self.intermediate_size = 1008 + self.linear = nn.Linear( + self.hidden_size, self.intermediate_size, bias_attr=False + ) + + def forward(self, a, b): + c = paddle.concat([a, a, b], 1) + d = self.linear(c) + return paddle.exp(d) - d + + +class TestComplexSymbolSubgraph(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.shape = [1, 2048, 768] + self.hidden_states = paddle.randn(self.shape, dtype="float32") + self.hidden_states.stop_gradient = False + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 2) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 2}) + + def eval(self, use_cinn): + paddle.seed(2024) + net = ComplexSymbolSubgraph() + input_spec = [ + InputSpec(shape=[1, None, 768], dtype='float32'), + InputSpec(shape=[1, None, 768], dtype='float32'), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.hidden_states, self.hidden_states) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_out = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_out = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() From 2de02d19f7254e050a82555ed5af9f249b5fa36b Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Fri, 23 Feb 2024 14:06:16 +0800 Subject: [PATCH 20/82] modify paddledetection output_grad is none should pass call_vjp (#61927) --- python/paddle/autograd/backward_utils.py | 8 ++++++++ python/paddle/autograd/ir_backward.py | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py index e901a1fc5a7a5..e3e6326ba61cc 100644 --- a/python/paddle/autograd/backward_utils.py +++ b/python/paddle/autograd/backward_utils.py @@ -439,6 +439,14 @@ def all_stop_gradient_true(block): return True +def all_output_grad_none(list_of_list): + for list_ in list_of_list: + for value in list_: + if value is not None: + return False + return True + + def parent_total_ops(block): ''' when block is sub_block, forward op should include its parent block ops diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index aa5d1d990786e..9c751f82238fa 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -22,6 +22,7 @@ ValueDict, ValueSet, _as_list, + all_output_grad_none, all_stop_gradient_true, argument_to_value, check_type, @@ -630,7 +631,9 @@ def append_yield( # all(zero_flag) support this op has no contribution for grad # should be delete (prune sub_graph) if ( - len(output_grads) == 0 or all(zero_flag) + len(output_grads) == 0 + or all(zero_flag) + or all_output_grad_none(output_grads) ) and op.name() not in [ "pd_op.while", "pd_op.if", From c2286f135c3397286c5d8053ef556424a9b3a1d8 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Fri, 23 Feb 2024 14:22:35 +0800 Subject: [PATCH 21/82] 
optimize proxy of ci (#61988) * delete --force-install * optimize proxy of ci --- paddle/scripts/paddle_build.sh | 3 ++- tools/auto_parallel/ci_auto_parallel.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 320e969ef73bb..71ee30a115ef7 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3348,7 +3348,8 @@ function distribute_test() { echo "Dowloading ...." cd ${work_dir} - git clone --depth=1 https://github.com/PaddlePaddle/PaddleNLP.git -b stable/paddle-ci + wget https://paddlenlp.bj.bcebos.com/wheels/PaddleNLP_stable_paddle.tar.gz --no-proxy + tar -zvxf PaddleNLP_stable_paddle.tar.gz cd PaddleNLP sed -i '/lac/d' scripts/regression/requirements_ci.txt diff --git a/tools/auto_parallel/ci_auto_parallel.sh b/tools/auto_parallel/ci_auto_parallel.sh index e536fe9df9fc5..21468833321ef 100644 --- a/tools/auto_parallel/ci_auto_parallel.sh +++ b/tools/auto_parallel/ci_auto_parallel.sh @@ -24,7 +24,7 @@ export case_list=() install_paddle(){ echo -e "\033[31m ---- Install paddlepaddle-gpu \033" if [ -n "$paddle" ];then - python -m pip install --user ${paddle} --force-reinstall --no-dependencies; + python -m pip install --user ${paddle} --no-dependencies; fi python -c "import paddle; print('paddle version:',paddle.__version__,'\npaddle commit:',paddle.version.commit)"; } From b99ca0f01c64a4b242da05a63db2a8eb7aa45a7c Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 14:25:14 +0800 Subject: [PATCH 22/82] Fix sincethe, diffrent, etc. (#61953) --- paddle/cinn/backends/codegen_cuda_dev.cc | 2 +- paddle/cinn/frontend/interpreter.cc | 2 +- .../transforms/cinn_group_cluster_pass.cc | 6 +++--- paddle/cinn/hlir/framework/graph.h | 4 ++-- paddle/cinn/hlir/framework/instruction_test.cc | 2 +- paddle/cinn/hlir/framework/memory.h | 2 +- paddle/cinn/hlir/framework/node.h | 4 ++-- paddle/cinn/hlir/framework/op_lowering_impl.cc | 2 +- paddle/cinn/hlir/framework/op_lowering_impl.h | 4 ++-- .../hlir/framework/op_lowering_impl_base.h | 2 +- paddle/cinn/hlir/framework/op_lowering_test.cc | 2 +- paddle/cinn/hlir/framework/op_lowering_util.cc | 4 ++-- paddle/cinn/hlir/framework/pir_compiler.h | 2 +- .../default_horizontal_fuse_pass.cc | 2 +- .../default_input_fuse_pass.cc | 2 +- paddle/cinn/hlir/pass/op_fusion_pass.cc | 2 +- paddle/cinn/hlir/pe/load_x86_params.cc | 2 +- paddle/cinn/hlir/pe/reduction.cc | 4 ++-- paddle/cinn/hlir/pe/schedule.cc | 14 +++++++------- paddle/cinn/hlir/pe/schedule.h | 2 +- paddle/cinn/hlir/pe/transform.h | 6 +++--- paddle/cinn/ir/buffer.h | 2 +- .../group_schedule/st_shape_group_scheduler.cc | 2 +- .../tactic/compute_inline_tactic.cc | 2 +- .../ir/schedule/impl/loop_transformation.cc | 2 +- paddle/cinn/ir/schedule/ir_schedule_util.h | 4 ++-- paddle/cinn/ir/schedule/schedule_desc.h | 6 +++--- paddle/cinn/ir/tensor.cc | 4 ++-- paddle/cinn/ir/utils/ir_nodes_collector.h | 2 +- paddle/cinn/lang/README.md | 2 +- paddle/cinn/lang/lower_impl.cc | 2 +- paddle/cinn/optim/buffer_assign.cc | 2 +- paddle/cinn/optim/compute_inline_expand.cc | 2 +- paddle/cinn/optim/resize_buffer.cc | 2 +- paddle/cinn/optim/update_buffer_axis_pass.cc | 2 +- paddle/cinn/optim/vectorize_loops.cc | 2 +- paddle/cinn/poly/poly_scheduler.cc | 2 +- paddle/cinn/poly/stage.h | 8 ++++---- paddle/cinn/pybind/common.cc | 2 +- paddle/cinn/runtime/custom_function_test.cc | 4 ++-- paddle/cinn/runtime/tiny_runtime.cc | 2 +- paddle/cinn/utils/event.h | 2 +- paddle/cinn/utils/multi_threading.h | 2 +- 
paddle/cinn/utils/random_engine.h | 2 +- paddle/cinn/utils/registry.h | 2 +- paddle/cinn/utils/string.h | 2 +- .../ps/service/coordinator_client.cc | 2 +- .../fluid/distributed/ps/service/ps_client.h | 2 +- .../distributed/ps/service/ps_graph_client.cc | 2 +- .../distributed/ps/service/ps_local_client.h | 2 +- .../ps/service/ps_service/graph_py_service.cc | 4 ++-- .../fluid/distributed/ps/table/depends/dense.h | 2 +- .../fluid/distributed/ps/table/tensor_table.h | 2 +- .../fluid/distributed/test/graph_node_test.cc | 2 +- paddle/fluid/eager/general_grad.h | 4 ++-- .../eager/to_static/run_program_op_node.h | 2 +- .../details/fetch_barrier_op_handle.h | 2 +- .../framework/ir/transfer_layout_elim_pass.cc | 2 +- .../new_executor/interpreter/data_transfer.cc | 10 +++++----- .../framework/new_executor/new_executor_defs.h | 4 ++-- paddle/fluid/imperative/layer.cc | 2 +- paddle/fluid/imperative/layout_transformer.h | 2 +- paddle/fluid/imperative/parallel_context.h | 2 +- paddle/fluid/imperative/partial_grad_engine.cc | 2 +- .../inference/api/onnxruntime_predictor.h | 2 +- paddle/fluid/inference/api/paddle_api.h | 4 ++-- paddle/fluid/inference/api/paddle_tensor.h | 2 +- paddle/fluid/inference/capi_exp/pd_config.h | 18 +++++++++--------- paddle/fluid/inference/capi_exp/pd_predictor.h | 2 +- .../ir_adaptor/translator/op_compat_info.h | 2 +- .../ir_adaptor/translator/op_translator.cc | 4 ++-- .../ir_adaptor/translator/program_translator.h | 6 +++--- .../pir/transforms/pd_op_to_kernel_pass.cc | 8 ++++---- .../transforms/transform_general_functions.h | 2 +- paddle/fluid/pybind/eager_method.cc | 8 ++++---- paddle/fluid/pybind/pir.cc | 4 ++-- .../phi/api/yaml/generator/backward_api_gen.py | 2 +- .../phi/api/yaml/generator/dist_bw_api_gen.py | 2 +- .../core/distributed/auto_parallel/dist_attr.h | 2 +- paddle/phi/infermeta/spmd_rules/concat.cc | 4 ++-- .../spmd_rules/cross_entropy_with_softmax.cc | 12 ++++++------ paddle/phi/kernels/cpu/rnn_grad_kernel.cc | 2 +- paddle/phi/kernels/gpu/mode_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/mode_kernel.cu | 2 +- paddle/phi/kernels/gpu/top_k_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/top_k_kernel.cu | 4 ++-- 86 files changed, 142 insertions(+), 142 deletions(-) diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc index 83b5da6c0b138..eb70ebe8fff8e 100644 --- a/paddle/cinn/backends/codegen_cuda_dev.cc +++ b/paddle/cinn/backends/codegen_cuda_dev.cc @@ -436,7 +436,7 @@ void CodeGenCUDA_Dev::Visit(const ir::Let *op) { str_ += " "; IrPrinter::Visit(op->symbol); vectorized_tensor_names_.insert(utils::GetStreamCnt(op->symbol)); - // skip "=0" in "half8 temp = 0;" sincethe operator= of half8 may not + // skip "=0" in "half8 temp = 0;" since the operator= of half8 may not // overloaded. 
if (op->body.As() && op->body.As()->value == 0) { return; diff --git a/paddle/cinn/frontend/interpreter.cc b/paddle/cinn/frontend/interpreter.cc index 2a5685572a045..12964fb8e79ad 100644 --- a/paddle/cinn/frontend/interpreter.cc +++ b/paddle/cinn/frontend/interpreter.cc @@ -108,7 +108,7 @@ void Interpreter::Impl::Build(const Target& target, const std::string& model_name) { CHECK(!var_map_.empty()); VLOG(3) << "Program:\n" << *program_; - // applay frontend pass + // apply frontend pass std::unordered_set fetch_var_ids; for (auto& name : fetch_names_) { CHECK(var_map_.count(name)) << "var_map finds no fetch var " << name; diff --git a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc index b36afc9bd056f..9f9856004646f 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/cinn_group_cluster_pass.cc @@ -563,7 +563,7 @@ bool CanOpMergeNode( } // TODO(phlrain): need update here - // diffrent loop range can merge, like [128, 128, 1], with [128, 128] + // different loop range can merge, like [128, 128, 1], with [128, 128] if ((cinn::hlir::framework::pir::CompatibleInfo::OpKind(*cur_op) != cinn::hlir::framework::kBroadcast) && (op_path_info.at(cur_op).loop_ranges != @@ -584,7 +584,7 @@ bool ShouldOutputPreNode( } // TODO(phlrain): need update here - // diffrent loop range can merge, like [128, 128, 1], with [128, 128] + // different loop range can merge, like [128, 128, 1], with [128, 128] if ((cinn::hlir::framework::pir::CompatibleInfo::OpKind(*cur_op) != cinn::hlir::framework::kBroadcast) && (op_path_info.at(cur_op).loop_ranges != @@ -599,7 +599,7 @@ std::vector NodeMergeWithNode( const std::vector& first_stage_output) { // stage 2 merge // for now we merge node in same pass - // only for vertial fuse + // only for vertical fuse std::vector second_stage_output = first_stage_output; while (true) { bool fused = false; diff --git a/paddle/cinn/hlir/framework/graph.h b/paddle/cinn/hlir/framework/graph.h index d6ef914f0846e..9ce024059439c 100644 --- a/paddle/cinn/hlir/framework/graph.h +++ b/paddle/cinn/hlir/framework/graph.h @@ -189,7 +189,7 @@ class Graph : public cinn::common::Graph { SharedGroupHasher, SharedGroupComparator> producer_groups_; - // output grous + // output groups std::unordered_set, SharedGroupHasher, SharedGroupComparator> @@ -271,7 +271,7 @@ class Graph : public cinn::common::Graph { const std::unordered_set& fetch_var_ids = {}); /** - * \brief Genereate the python test code for group test + * \brief Generate the python test code for group test */ std::string GenerateGroupPythonCode( const std::vector& group, diff --git a/paddle/cinn/hlir/framework/instruction_test.cc b/paddle/cinn/hlir/framework/instruction_test.cc index 2e2b412cf4fdf..f665c628b5a0a 100644 --- a/paddle/cinn/hlir/framework/instruction_test.cc +++ b/paddle/cinn/hlir/framework/instruction_test.cc @@ -104,7 +104,7 @@ TEST(Instruction, RunWithRawPodArgs) { const auto& shape = Shape({M, N}); std::map name2podargs; - // case 1: create cinn_pod_value_t arguments dicrectly + // case 1: create cinn_pod_value_t arguments directly std::vector args_buffer( 3); // store {"x", "y", "z"} buffer objects auto* default_memory_mng = MemoryManager::Global().RetrieveSafely( diff --git a/paddle/cinn/hlir/framework/memory.h b/paddle/cinn/hlir/framework/memory.h index 3b8c59887d7fe..889e32e7fca0b 100644 --- a/paddle/cinn/hlir/framework/memory.h +++ 
b/paddle/cinn/hlir/framework/memory.h
@@ -37,7 +37,7 @@ class MemoryInterface {
 };
 
 /**
- * MemoryManager holds a map of MemoryInterface for each articture.
+ * MemoryManager holds a map of MemoryInterface for each architecture.
  */
 class MemoryManager final {
  public:
diff --git a/paddle/cinn/hlir/framework/node.h b/paddle/cinn/hlir/framework/node.h
index 764492df45f38..7f08257bd2d9c 100644
--- a/paddle/cinn/hlir/framework/node.h
+++ b/paddle/cinn/hlir/framework/node.h
@@ -77,7 +77,7 @@ class Node : public cinn::common::GraphNode {
   std::tuple LinkTo(
       NodeData *other);
 
-  // This node determines another node, which means the other node depeneds on
+  // This node determines another node, which means the other node depends on
   // this node.
   void Controls(NodeData *other);
 
@@ -161,7 +161,7 @@ class NodeData : public cinn::common::GraphNode {
   std::tuple LinkTo(
       Node *other);
 
-  // This node determines another node, which means the other node depeneds on
+  // This node determines another node, which means the other node depends on
   // this node.
   void Controls(Node *other);
 
diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.cc b/paddle/cinn/hlir/framework/op_lowering_impl.cc
index cef5968639511..a9bb46c8a4f26 100644
--- a/paddle/cinn/hlir/framework/op_lowering_impl.cc
+++ b/paddle/cinn/hlir/framework/op_lowering_impl.cc
@@ -547,7 +547,7 @@ ir::Expr OpLowererImpl::DoGroupSchedule(
               << ir_sch.GetModule().GetExprs().at(0);
       continue;
     }
-    // find master to computeat.
+    // find master to compute at.
     auto master = GetMasterToComputeAt(node,
                                        nodes_in_order,
                                        nodes_inline,
diff --git a/paddle/cinn/hlir/framework/op_lowering_impl.h b/paddle/cinn/hlir/framework/op_lowering_impl.h
index 5e57c607c93e1..80c79b3c64b8d 100644
--- a/paddle/cinn/hlir/framework/op_lowering_impl.h
+++ b/paddle/cinn/hlir/framework/op_lowering_impl.h
@@ -29,7 +29,7 @@
 
 // Fusion Op lowering, there are four kinds of lowering function:
 // Elementwise/Broadcast/Injective,Reduce,OutEWiseFusable,NonFusible.
-// Elementwise/Broadcast/Injective Ops is with same shcedule.
+// Elementwise/Broadcast/Injective Ops share the same schedule.
 // Reduce,OutEWiseFusable,NonFusible are using different schedule.
 
 namespace cinn {
@@ -183,7 +183,7 @@ class OpLowererImpl : public OpLowererImplBase {
   const absl::flat_hash_map& type_dict_;
   const absl::flat_hash_map& shape_dict_;
 
-  // fucntion name prefix
+  // function name prefix
   const std::string func_name_prefix = "fn_";
 };
 
diff --git a/paddle/cinn/hlir/framework/op_lowering_impl_base.h b/paddle/cinn/hlir/framework/op_lowering_impl_base.h
index b67deedbbb7c5..edd5c6e8e627e 100644
--- a/paddle/cinn/hlir/framework/op_lowering_impl_base.h
+++ b/paddle/cinn/hlir/framework/op_lowering_impl_base.h
@@ -20,7 +20,7 @@
 
 // Fusion Op lowering, there are four kinds of lowering function:
 // Elementwise/Broadcast/Injective,Reduce,OutEWiseFusable,NonFusible.
-// Elementwise/Broadcast/Injective Ops is with same shcedule.
+// Elementwise/Broadcast/Injective Ops share the same schedule.
 // Reduce,OutEWiseFusable,NonFusible are using different schedule.
 
namespace cinn { diff --git a/paddle/cinn/hlir/framework/op_lowering_test.cc b/paddle/cinn/hlir/framework/op_lowering_test.cc index 07fcc7a48e016..be33fa25125d2 100644 --- a/paddle/cinn/hlir/framework/op_lowering_test.cc +++ b/paddle/cinn/hlir/framework/op_lowering_test.cc @@ -208,7 +208,7 @@ TEST(OP_LOWERING, Reduce_Fuse_Broadcast_Softmax) { { // softmax auto A = net_builder.CreateInput(Float(32), {h, w}, "A"); - // redece max + // reduce max auto B = net_builder.ReduceMax(A, {1}); // broadcast auto C = net_builder.BroadcastTo(B, {h, w}, {0}); diff --git a/paddle/cinn/hlir/framework/op_lowering_util.cc b/paddle/cinn/hlir/framework/op_lowering_util.cc index a7b988a735cdb..2366fd584aa0b 100644 --- a/paddle/cinn/hlir/framework/op_lowering_util.cc +++ b/paddle/cinn/hlir/framework/op_lowering_util.cc @@ -622,7 +622,7 @@ void LoopAssignReduceWithoutLast(ir::IRSchedule& ir_sch, // NOLINT // the loop size at axis is 1, need remove axes_shift_num[j] = -1; } else if (axes[j] > idx) { - // the axies value need left shift + // the axes value need left shift axes_shift_num[j]++; } } @@ -902,7 +902,7 @@ Node* GetMasterToComputeAt( done_schedule.insert(tmp); } } - // remove all consuemr reducer node of node from done_schedule. + // remove all consumer reducer node of node from done_schedule. std::unordered_set visited; std::queue candidates; candidates.push(node); diff --git a/paddle/cinn/hlir/framework/pir_compiler.h b/paddle/cinn/hlir/framework/pir_compiler.h index bcb1e835e3cb1..5edf5e25bf46b 100644 --- a/paddle/cinn/hlir/framework/pir_compiler.h +++ b/paddle/cinn/hlir/framework/pir_compiler.h @@ -28,7 +28,7 @@ namespace hlir { namespace framework { // TODO(Aurelius84): Need abstract this logic to implement Proxy for -// the co-existance with GraphCompiler. +// the co-existence with GraphCompiler. class PirCompiler final { public: PirCompiler(const ::pir::Program& prog, diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_horizontal_fuse_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_horizontal_fuse_pass.cc index 36ac222e099c4..e953caf20ab7a 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_horizontal_fuse_pass.cc +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_horizontal_fuse_pass.cc @@ -71,7 +71,7 @@ class DefaultHorizontalFusePass final : public HorizontalFusePass { break; } - // if can't fuse to othors Groups, new Groups. + // if can't fuse to other Groups, new Groups. if (!fusionable) { fusionable_consumers.push_back({candidate}); } diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_input_fuse_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_input_fuse_pass.cc index a5ba335f049f1..7dc68d65599f9 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_input_fuse_pass.cc +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass/default_input_fuse_pass.cc @@ -72,7 +72,7 @@ class DefaultInputFusePass final : public InputFusePass { break; } - // if can't fuse to othors Groups, new Groups. + // if can't fuse to other Groups, new Groups. 
if (!fusionable) { fusionable_consumers.push_back({candidate}); } diff --git a/paddle/cinn/hlir/pass/op_fusion_pass.cc b/paddle/cinn/hlir/pass/op_fusion_pass.cc index 242b72f77e77f..1f9922899b69f 100644 --- a/paddle/cinn/hlir/pass/op_fusion_pass.cc +++ b/paddle/cinn/hlir/pass/op_fusion_pass.cc @@ -361,7 +361,7 @@ class OpFusionPassHelper : public FusionHelperBase { struct FusionRelation { // producer -> consumer std::unordered_set op_kind = {}; - // producer -> fusion sonsumer + // producer -> fusion consumer std::unordered_map fusion_op_kind = {}; }; diff --git a/paddle/cinn/hlir/pe/load_x86_params.cc b/paddle/cinn/hlir/pe/load_x86_params.cc index aa0fd02218f90..36278f5a0b276 100644 --- a/paddle/cinn/hlir/pe/load_x86_params.cc +++ b/paddle/cinn/hlir/pe/load_x86_params.cc @@ -156,7 +156,7 @@ void LoadX86DefaultParams( InputX86Param(model_data, "X86ScheduleConv input 1 256 56 56 weight 512 256 1 1 stride 2 " "2 padding 0 0 dilation 1 1", - // Todo: tempory fix, enhance alterlayout and test performance + // Todo: temporary fix, enhance alterlayout and test performance {{"ic_bn", {1, 256}}, {"oc_bn", {16, 32}}, {"ow_bn", {7, 4}}, diff --git a/paddle/cinn/hlir/pe/reduction.cc b/paddle/cinn/hlir/pe/reduction.cc index e4850e96dabcd..7e33a1475e48b 100644 --- a/paddle/cinn/hlir/pe/reduction.cc +++ b/paddle/cinn/hlir/pe/reduction.cc @@ -287,7 +287,7 @@ std::vector WarpReduce(const ir::Tensor& A, reduce_width = reduce_width * A->shape[idx].as_int32(); } - // comput tmp output shape. + // compute tmp output shape. std::vector tmp_shape(A->shape.begin(), A->shape.begin() + shape_size_without_reduce_dim); tmp_shape.push_back(Expr(32)); @@ -390,7 +390,7 @@ std::vector BlockReduceInternal(const ir::Tensor& A, auto tmp_out = Compute( tmp_shape, [=](const std::vector& indexs) -> Expr { - // comput index map from output to input. + // compute index map from output to input. 
auto last_index = indexs.back(); std::vector input_indexs(indexs.begin(), indexs.begin() + indexs.size() - 1); diff --git a/paddle/cinn/hlir/pe/schedule.cc b/paddle/cinn/hlir/pe/schedule.cc index c75f9aefccf29..3c3067ce436ab 100644 --- a/paddle/cinn/hlir/pe/schedule.cc +++ b/paddle/cinn/hlir/pe/schedule.cc @@ -220,7 +220,7 @@ void MatmulScheduleCPU(poly::StageMap stages, int packed_last_dim = packedB->shape[packedB_dims - 1].as_int32(); int packedB_split_factor = GetBetterSplitFactor(packed_last_dim, basic_split_factor); - // tempory solution for indivisible case + // temporary solution for indivisible case if (packedB_split_factor >= 8 && packed_last_dim % packedB_split_factor == 0) { stages[packedB]->Vectorize(packedB_dims - 1, packedB_split_factor); @@ -243,7 +243,7 @@ void MatmulScheduleCPU(poly::StageMap stages, std::vector all_axes_inner; bool is_m_splited = false; bool is_n_splited = false; - // tempory solution for isl for1 wrong elimination + // temporary solution for isl for1 wrong elimination if (bm >= 4 && M != bm) { auto axes = stages[output]->Split(i_axis, bm); all_axes_outer.push_back(std::get<0>(axes)); @@ -305,7 +305,7 @@ void MatmulScheduleCPU(poly::StageMap stages, std::swap(all_axes[out_axis_dims - 1], all_axes[out_axis_dims - 2]); } stages[output]->Reorder(all_axes); - // vectorize output's last dimemsion + // vectorize output's last dimension auto out_domain = stages[output]->transformed_domain(); auto range = poly::isl_set_get_axis_range(out_domain.get(), out_axis_dims - 1); @@ -315,7 +315,7 @@ void MatmulScheduleCPU(poly::StageMap stages, int out_last_dim = max.get_num_si() + 1; int output_split_factor = GetBetterSplitFactor(out_last_dim, basic_split_factor); - // tempory solution for indivisible case + // temporary solution for indivisible case if (output_split_factor >= 8 && packed_last_dim % output_split_factor == 0) { stages[output]->Vectorize(out_axis_dims - 1, output_split_factor); } @@ -945,7 +945,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages, // oh_inner, ow, oc_inner, ic, kh, kw] stages[CC]->ComputeAt2(stages[packed_out], 0); VLOG(3) << "cache write shape: " << utils::Join(CC->shape, ", "); - // tempory solution because reorder may be wrong before ComputeAt + // temporary solution because reorder may be wrong before ComputeAt // reorder: [batch_oc_outer_oh_outer_fused, oh_inner, ow_outer, ow_inner, // oc_inner] -> [batch_oc_outer_oh_outer_fused, ow_outer, oh_inner, ow_inner, // oc_inner] @@ -1082,7 +1082,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages, << stages[packed_out]->transformed_domain(); VLOG(3) << "stages[CC]->transformed_domain()" << stages[CC]->transformed_domain(); - // tempory solution because reordering before computeAt may be wrong + // temporary solution because reordering before computeAt may be wrong // reorder: [batch, oc_outer, oh_outer, oh_inner, ow_outer, ow_inner, // oc_inner] -> [batch, oc_outer, oh_outer, ow_outer, oh_inner, ow_inner, // oc_inner] @@ -2700,7 +2700,7 @@ void CudaScheduleInjectiveWithVectorize(poly::Stage *stage, // the first bind position from tail int bind_idx = stage->n_out_dims() - 1; // it will add a new dim by split before vectorize, but the new dim will - // be eleminated when vectorizng, so the bind_idx does't need to increase + // be eliminated when vectorizing, so the bind_idx does't need to increase if (vector_width > 1) { stage->Split(bind_idx, vector_width); } diff --git a/paddle/cinn/hlir/pe/schedule.h b/paddle/cinn/hlir/pe/schedule.h index 
8e863c50e5b6c..7aef85c77518e 100644 --- a/paddle/cinn/hlir/pe/schedule.h +++ b/paddle/cinn/hlir/pe/schedule.h @@ -182,7 +182,7 @@ void CudaScheduleMul(poly::StageMap stages, const std::vector &output_shape, const cinn::common::Target &target); -// reduce shedules. +// reduce schedules. void CudaReduceSchedule(poly::StageMap stages, ir::Tensor output, int last_dimension_num, diff --git a/paddle/cinn/hlir/pe/transform.h b/paddle/cinn/hlir/pe/transform.h index 8f46ae400694f..ad3ca5a0f9caa 100644 --- a/paddle/cinn/hlir/pe/transform.h +++ b/paddle/cinn/hlir/pe/transform.h @@ -154,7 +154,7 @@ ir::Tensor Reverse(const ir::Tensor& input, /** * @brief Perform meta op Transpose * @param input The input tensor - * @param axis tranpsoe axis + * @param axis transpose axis * @param output_name the name of the output tensor */ ir::Tensor Transpose( @@ -197,8 +197,8 @@ ir::Tensor SliceSymbolic(const ir::Tensor& A, * @param input The input tensor * @param assign The assign tensor * @param axis select axis - * @param starts select reigon starts - * @param strides select reigon strides + * @param starts select region starts + * @param strides select region strides * @param output_name the name of the output tensor */ ir::Tensor SliceAssign( diff --git a/paddle/cinn/ir/buffer.h b/paddle/cinn/ir/buffer.h index b5e162ae52bc6..5b173b6792c19 100755 --- a/paddle/cinn/ir/buffer.h +++ b/paddle/cinn/ir/buffer.h @@ -83,7 +83,7 @@ class _Buffer_ : public ExprNode<_Buffer_> { int offset_factor{0}; //! The place the buffer locates. Target target{UnkTarget()}; - //! Aignment requirement of data pointer in bytes. + //! Alignment requirement of data pointer in bytes. mutable int data_alignment{0}; //! The memory type of the buffer. MemoryType memory_type{MemoryType::Heap}; diff --git a/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc b/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc index 86f114def4146..7c999205f646f 100644 --- a/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc +++ b/paddle/cinn/ir/group_schedule/st_shape_group_scheduler.cc @@ -373,7 +373,7 @@ void StaticShapeGroupScheduler::DoLoopAlignment() { source_loops = {source_loop}; } - // 3. Rerorder loops to match the target loops + // 3. Reorder loops to match the target loops if (total_source_extent == total_master_loop_extents) { ir_sch_->Reorder(node->id(), recover_loop_order); } diff --git a/paddle/cinn/ir/group_schedule/tactic/compute_inline_tactic.cc b/paddle/cinn/ir/group_schedule/tactic/compute_inline_tactic.cc index e58929eb0845b..8da8f44d32695 100644 --- a/paddle/cinn/ir/group_schedule/tactic/compute_inline_tactic.cc +++ b/paddle/cinn/ir/group_schedule/tactic/compute_inline_tactic.cc @@ -32,7 +32,7 @@ void ComputeInlineTactic::Init(ScheduleContext* context) { void ComputeInlineTactic::Apply(ir::IRSchedule* sch, const std::string& block_id) { - // TODO(LiuYang): Compute of ops will be rewrited so that we + // TODO(LiuYang): Compute of ops will be rewritten so that we // don't use it in dynamic group_schedule rules temporarily. 
// if (IsProhibitScheduleExternCallBlock(node->Block())) {
  //   return;
diff --git a/paddle/cinn/ir/schedule/impl/loop_transformation.cc b/paddle/cinn/ir/schedule/impl/loop_transformation.cc
index e222489b53daf..b320f6ace3f69 100644
--- a/paddle/cinn/ir/schedule/impl/loop_transformation.cc
+++ b/paddle/cinn/ir/schedule/impl/loop_transformation.cc
@@ -166,7 +166,7 @@ std::vector DyScheduleImpl::Split(const Expr& loop,
   CINN_IR_SCHEDULE_END(this->err_msg_level_);
 }
 
-// TODO(@LiuYang): now -1 can't exsit in factors,
+// TODO(@LiuYang): now -1 can't exist in factors.
 std::vector DyScheduleImpl::Split(const Expr& loop,
                                   const std::vector& factors) {
   CINN_IR_SCHEDULE_BEGIN();
diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.h b/paddle/cinn/ir/schedule/ir_schedule_util.h
index ecf0521555d59..c71f12cab42c7 100644
--- a/paddle/cinn/ir/schedule/ir_schedule_util.h
+++ b/paddle/cinn/ir/schedule/ir_schedule_util.h
@@ -128,7 +128,7 @@ void ReplaceExpr(Expr* source,
 * and change -1 to positive integer.
 * @param factors The original factors.
 * @param total_extent The extent of the loop to be splitted.
- * @return return The valiated factors.
+ * @return The validated factors.
 */
 std::vector ValidateFactors(const std::vector& factors,
                             int total_extent,
@@ -312,7 +312,7 @@ IterRange RangeUnion(const IterRange& range1, const IterRange& range2);
 * block
 * \param is_store_provided Whether Store nodes of the block provide the
 * tensor, true means it is in compute_at case, otherwise false means in
- * reverse_compuate_at case
+ * reverse_compute_at case
 * \return Each index's range and can_keep_loop flag of block's tensor.
 * Indicating the buffer region being required.
 */
diff --git a/paddle/cinn/ir/schedule/schedule_desc.h b/paddle/cinn/ir/schedule/schedule_desc.h
index 4458bcb4ed117..db7dc551e7ddd 100644
--- a/paddle/cinn/ir/schedule/schedule_desc.h
+++ b/paddle/cinn/ir/schedule/schedule_desc.h
@@ -31,10 +31,10 @@ namespace ir {
 // records all transform/getting operations executed by a corresponding
 // ir::IRSchedule. A ScheduleDesc can be serialized to JSON format and saved to
 // file. For deserializing, it can be re-applied to a new IRSchedule that is
-// initialzied by a semantics-equal original ir::ModuleExpr, and then achieves
+// initialized by a semantics-equal original ir::ModuleExpr, and then achieves
 // the same result.
 
-class IRSchedule;  // forward declartion to avoid cross-reference
+class IRSchedule;  // forward declaration to avoid cross-reference
 class ScheduleDesc {
  public:
   // each operation executed through IRSchedule is recorded as a step
@@ -77,7 +77,7 @@ class ScheduleDesc {
   void Pop();
 
   /**
-   * \brief Replay this description to a new IRSchedule that is initialzied by a
+   * \brief Replay this description to a new IRSchedule that is initialized by a
    * semantics-equal original ModuleExpr.
    * @param schedule The original IRSchedule to be replayed the description on.
    * @param without_post_schedule Determine whether to delete the post
diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc
index fe746f1b8daa3..7b3f15c6ed0be 100644
--- a/paddle/cinn/ir/tensor.cc
+++ b/paddle/cinn/ir/tensor.cc
@@ -336,7 +336,7 @@ ir::Tensor _Tensor_::InitReduction(poly::StageMap stages,
                                    const Target &target) const {
   CHECK(contains_reduce_axis())
       << "InitReduction only works on a reduce tensor";
-  // return if already rexists.
+  // return if already exists.
std::string init_reduce_tensor_name = GenReduceInitTensorNameOf(name);
   if (stages->Lookup(init_reduce_tensor_name))
     return stages[this]->LookupCtrlDepend(init_reduce_tensor_name);
@@ -471,7 +471,7 @@ void _Tensor_::Bind(lang::Buffer &buffer) {
     if (this->buffer == buffer.buffer()) return;
     this->buffer->Unbind(this);
   }
-  // Extract the tensors thouse has binded to this buffer.
+  // Extract the tensors that have been bound to this buffer.
   buffer_depended_tensor_names_ = buffer.buffer()->binded_tensor_names();
 
   buffer.buffer()->BindTo(this);
diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.h b/paddle/cinn/ir/utils/ir_nodes_collector.h
index 7bfb1b3b4e6b3..28f77b3c7021c 100644
--- a/paddle/cinn/ir/utils/ir_nodes_collector.h
+++ b/paddle/cinn/ir/utils/ir_nodes_collector.h
@@ -80,7 +80,7 @@ std::map CollectTensorMap(
 std::vector CollectUndefinedVars(const Expr* e);
 
 /**
- * Collect the Tensor Nodes which will be Writed by Store or Call Nodes
+ * Collect the Tensor Nodes which will be written by Store or Call Nodes
 */
 std::set CollectTensorNeedsWrite(const Expr* e);
 }  // namespace ir_utils
diff --git a/paddle/cinn/lang/README.md b/paddle/cinn/lang/README.md
index 078759b6f4d25..2e6c81ad655ac 100644
--- a/paddle/cinn/lang/README.md
+++ b/paddle/cinn/lang/README.md
@@ -1,6 +1,6 @@
 # Design of CINN/DSL
 This module is a simple DSL defined in CINN project.
-The DSL module aims to represent the overall computation in a hardware indenpendent way.
+The DSL module aims to represent the overall computation in a hardware independent way.
 
 ## Concepts
 ### Object
diff --git a/paddle/cinn/lang/lower_impl.cc b/paddle/cinn/lang/lower_impl.cc
index 41904b7742d82..1b085c03e2240 100644
--- a/paddle/cinn/lang/lower_impl.cc
+++ b/paddle/cinn/lang/lower_impl.cc
@@ -108,7 +108,7 @@ Expr LowerGroup(const poly::ScheduleGroup& group,
   // poly::IslAstNodeToCinnExpr(ast, &e);
   poly::IslAstNodeToCinnExpr(ast, gen.domain(), &e);
   // now we get a workable expression, but the statement are something like
-  // `B(((16 * po0) + po1), po2)`, we need to transform this to some realworld
+  // `B(((16 * po0) + po1), po2)`, we need to transform this to some real world
   // statement in CINN.
   VLOG(1) << "ast to expr: \n" << e << std::endl;
 
diff --git a/paddle/cinn/optim/buffer_assign.cc b/paddle/cinn/optim/buffer_assign.cc
index 256624617cc43..47cf714e1d684 100644
--- a/paddle/cinn/optim/buffer_assign.cc
+++ b/paddle/cinn/optim/buffer_assign.cc
@@ -90,7 +90,7 @@ std::map InitialAssignBuffer(
   for (auto& item : buffer_updated_tensor) {
     auto* cur_n = uf_map[item.first];
     for (auto& other : stages[item.second]->meta.tensors_to_share_buffer_with) {
-      // we might intialize the buffer in args.
+      // we might initialize the buffer in args.
       auto* other_n = uf_map[other];
       if (!other_n) continue;
 
diff --git a/paddle/cinn/optim/compute_inline_expand.cc b/paddle/cinn/optim/compute_inline_expand.cc
index 576d438280e34..f6b7c6f24e2b8 100644
--- a/paddle/cinn/optim/compute_inline_expand.cc
+++ b/paddle/cinn/optim/compute_inline_expand.cc
@@ -187,7 +187,7 @@ struct SSANode : public cinn::common::GraphNode {
   static constexpr char *__type_info__ = "optim::SSANode";
 };
 
-// TODO(Superjomn) the graph here is not a SSA now, it is flattern for the
+// TODO(Superjomn) the graph here is not an SSA now, it is flattened because the
 // ir::CollectIRNodes method collects all the tensors recursively, so it can not
 // reserve the level information, fix it.
struct SSABuilder : public ir::IRMutator<> {
diff --git a/paddle/cinn/optim/resize_buffer.cc b/paddle/cinn/optim/resize_buffer.cc
index c725d9d0c3c01..e73929a97aa57 100644
--- a/paddle/cinn/optim/resize_buffer.cc
+++ b/paddle/cinn/optim/resize_buffer.cc
@@ -153,7 +153,7 @@ class AnalyzeLoopVarRange : public ir::IRMutator<> {
 
       // We only use the maximal of var, maximal of Mod operation,
       // which may not be the maximal of index
-      // mathmetically, but it works for current CINN.
+      // mathematically, but it works for current CINN.
       //
       // We may add better computation of MaxIndexRange if we need
       for (int i = 0; i < vars.size(); ++i) {
diff --git a/paddle/cinn/optim/update_buffer_axis_pass.cc b/paddle/cinn/optim/update_buffer_axis_pass.cc
index d86f2e61a0019..b43b7fc834914 100644
--- a/paddle/cinn/optim/update_buffer_axis_pass.cc
+++ b/paddle/cinn/optim/update_buffer_axis_pass.cc
@@ -219,7 +219,7 @@ class ReplaceSameAxisToZero : public ir::IRMutator<> {
       for (auto p : buffer_name_access_same_index_expr_.at(buffer_name)) {
         int r = p.first;
         // After optimization, some load indice may be removed, so we need this
-        // conditioin
+        // condition
         if (indices->size() > r) {
           ir::ir_utils::IrReplace(
               &(indices->at(r)), indices->at(r), ir::Expr(0));
diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc
index 25530c74b2358..67e309c73a6a0 100644
--- a/paddle/cinn/optim/vectorize_loops.cc
+++ b/paddle/cinn/optim/vectorize_loops.cc
@@ -80,7 +80,7 @@ class TensorVectorizeTeller : public ir::IRMutator {
   const int factor_;
   const absl::flat_hash_map *var_intervals_;
 
-  // save (tensor name) -> (bool flag) to indentify whether tensors can be
+  // save (tensor name) -> (bool flag) to identify whether tensors can be
   // vectorized or not
   std::unordered_map tensor2flag_;
 
diff --git a/paddle/cinn/poly/poly_scheduler.cc b/paddle/cinn/poly/poly_scheduler.cc
index 8d0b02ca69b49..539be8221d8df 100644
--- a/paddle/cinn/poly/poly_scheduler.cc
+++ b/paddle/cinn/poly/poly_scheduler.cc
@@ -124,7 +124,7 @@ std::vector PartitionGraphByIterationDomain(cinn::common::Graph* graph) {
   }
 
   // NOTE DEBUG
-  // check there are same count of nodes both in the orginal graph and the
+  // check there is the same count of nodes both in the original graph and the
   // groups.
   // @{
   int num_node_in_groups = 0;
diff --git a/paddle/cinn/poly/stage.h b/paddle/cinn/poly/stage.h
index ac36e5fd98e09..f9d2204312e81 100644
--- a/paddle/cinn/poly/stage.h
+++ b/paddle/cinn/poly/stage.h
@@ -59,7 +59,7 @@ struct StageForloopInfo {
   ir::DeviceAPI device;
 };
 
-//! Store the infomations about some other tensor `compute_at` this tensor.
+//! Store the information about some other tensor `compute_at` this tensor.
 struct ComputeAtInfo {
   ComputeAtInfo(const std::string& consumer_tensor_name,
                 const std::string& producer_tensor_name,
@@ -277,7 +277,7 @@ class Stage : public Object {
    * \brief Mark the stage compute at the level of some other stage. Usually
    * used when there is no access relation between two tensors.
    *
-   * The difference bewteen ComputeAt2 and ComputeAt is that ComputeAt2 can be
+   * The difference between ComputeAt2 and ComputeAt is that ComputeAt2 can be
    * used when there is no access relation between two tensors.
    *
    * @param other the target stage to compute at.
@@ -373,7 +373,7 @@ class Stage : public Object {
   const isl::map& transform() const { return transform_; }
   isl::set transformed_domain() const;
 
-  // Dealing with the `ComputateAt` transform.
+  // Dealing with the `ComputeAt` transform.
std::vector compute_ats() const; //! Get the level-th dimensional name. @@ -470,7 +470,7 @@ class Stage : public Object { */ void InitTransform(); - //! Lock the \p level-th axis and disallow the futher schedules on this axis. + //! Lock the \p level-th axis and disallow the further schedules on this axis. void LockAxis(uint32_t level); //! Unlock the \p level-th axis. void UnlockAxis(uint32_t level); diff --git a/paddle/cinn/pybind/common.cc b/paddle/cinn/pybind/common.cc index 80ff3abba928d..7d777af91204a 100644 --- a/paddle/cinn/pybind/common.cc +++ b/paddle/cinn/pybind/common.cc @@ -217,7 +217,7 @@ void BindShared(py::module *m) { .def("val", &cinn::common::RefCount::val); } -// TODO(wanghaipeng03) using true_type or false_type as tag disptcher losses +// TODO(wanghaipeng03) using true_type or false_type as tag dispatcher losses // semantic context template inline auto __binary_op_fn_dispatch(T1 x, T2 y, F fn, std::true_type) { diff --git a/paddle/cinn/runtime/custom_function_test.cc b/paddle/cinn/runtime/custom_function_test.cc index 546599f252cc2..b2dc09b1862f0 100644 --- a/paddle/cinn/runtime/custom_function_test.cc +++ b/paddle/cinn/runtime/custom_function_test.cc @@ -132,7 +132,7 @@ TEST(CinnAssertTrue, test_true) { CinnBufferAllocHelper x(cinn_x86_device, cinn_bool_t(), {1}); - // set inpute value true + // set input value true bool input_h = true; auto* input = x.mutable_data(target); @@ -170,7 +170,7 @@ TEST(CinnAssertTrue, test_false_only_warning) { CinnBufferAllocHelper x(cinn_x86_device, cinn_bool_t(), {1}); - // set inpute value false + // set input value false bool input_h = false; auto* input = x.mutable_data(target); diff --git a/paddle/cinn/runtime/tiny_runtime.cc b/paddle/cinn/runtime/tiny_runtime.cc index fc2a4693328c2..2f940007aed40 100644 --- a/paddle/cinn/runtime/tiny_runtime.cc +++ b/paddle/cinn/runtime/tiny_runtime.cc @@ -25,7 +25,7 @@ extern "C" { int max_num_workers = std::thread::hardware_concurrency(); -// move to standlone file +// move to standalone file struct param_context_t { int major_v; int minor_v; diff --git a/paddle/cinn/utils/event.h b/paddle/cinn/utils/event.h index 5d7b8113a1d8b..0ceccaa029d76 100644 --- a/paddle/cinn/utils/event.h +++ b/paddle/cinn/utils/event.h @@ -76,7 +76,7 @@ class Summary { struct Item { HostEvent info; Ratio sub_ratio{0.0}; // percentage of EventType - Ratio total_ratio{0.0}; // precentage of total process + Ratio total_ratio{0.0}; // percentage of total process explicit Item(const HostEvent& e) : info(e) {} bool operator<(const Item& other) const { diff --git a/paddle/cinn/utils/multi_threading.h b/paddle/cinn/utils/multi_threading.h index aff17e32bb762..6cf0afccbe311 100644 --- a/paddle/cinn/utils/multi_threading.h +++ b/paddle/cinn/utils/multi_threading.h @@ -43,7 +43,7 @@ class SequenceDispatcher : public JobDispatcher { int Next() const override; private: - // the maxmimum index of extent + // the maximum index of extent int end_; // the traversal step to the next one int step_; diff --git a/paddle/cinn/utils/random_engine.h b/paddle/cinn/utils/random_engine.h index 05162b288c781..49e8e6ecfd2a2 100644 --- a/paddle/cinn/utils/random_engine.h +++ b/paddle/cinn/utils/random_engine.h @@ -34,7 +34,7 @@ namespace utils { class LinearRandomEngine { public: using StateType = int64_t; - // the type name "resule_type" is needed by std::xxx_distribution + // the type name "result_type" is needed by std::xxx_distribution using result_type = uint32_t; // The minimum possible value of random state diff --git 
a/paddle/cinn/utils/registry.h b/paddle/cinn/utils/registry.h index 8f6aa8b5b0c7f..3958b19d84378 100644 --- a/paddle/cinn/utils/registry.h +++ b/paddle/cinn/utils/registry.h @@ -145,7 +145,7 @@ class Registry { * \endcode * * @tparam EntryType The type of subclass that inheritate the base. - * @tparam FunctionType The function type this registry is registerd. + * @tparam FunctionType The function type this registry is registered. */ template class FunctionRegEntryBase { diff --git a/paddle/cinn/utils/string.h b/paddle/cinn/utils/string.h index 900e1a6a2ed57..b891d3abb980d 100644 --- a/paddle/cinn/utils/string.h +++ b/paddle/cinn/utils/string.h @@ -31,7 +31,7 @@ std::string GetStreamCnt(const T& x); * Construct a formatted string with arguments. * @param fmt_str The format. * @param ... The parameters of the format. - * @return The formated string. + * @return The formatted string. */ std::string StringFormat(const std::string& fmt_str, ...); diff --git a/paddle/fluid/distributed/ps/service/coordinator_client.cc b/paddle/fluid/distributed/ps/service/coordinator_client.cc index c9c2ba49c9bf3..691b427d2bfde 100644 --- a/paddle/fluid/distributed/ps/service/coordinator_client.cc +++ b/paddle/fluid/distributed/ps/service/coordinator_client.cc @@ -44,7 +44,7 @@ void CoordinatorService::FLService( uint32_t from_client_id = request->client_id(); VLOG(0) << "fl-ps > recv from client id: " << from_client_id << ", msg_type: " << msg_type; - // TODO(ziyoujiyi): find is not thread safe, beacuse of RB_Tree traversal + // TODO(ziyoujiyi): find is not thread safe, because of RB_Tree traversal auto itr = _service_handle_map.find(msg_type); if (itr == _service_handle_map.end()) { LOG(ERROR) << "fl-ps > unknown flClient2Coordinator msg type: " << msg_type; diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 63a9967793976..ecebe9bcd3ac1 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -197,7 +197,7 @@ class PSClient { // server优雅退出 virtual std::future StopServer() = 0; - // server profilera + // server profiler virtual std::future StartProfiler() = 0; virtual std::future StopProfiler() = 0; diff --git a/paddle/fluid/distributed/ps/service/ps_graph_client.cc b/paddle/fluid/distributed/ps/service/ps_graph_client.cc index 4bf084e109e22..e4ce866c8ebc5 100644 --- a/paddle/fluid/distributed/ps/service/ps_graph_client.cc +++ b/paddle/fluid/distributed/ps/service/ps_graph_client.cc @@ -73,7 +73,7 @@ void PsGraphClient::FinalizeWorker() { } simple::global_rpc_server().finalize(); } -// add maco +// add macro #define DIM_PASS_ID(dim_id, pass_id) \ uint32_t((uint32_t(dim_id) << 16) | pass_id) #define GET_PASS_ID(id) (id & 0xffff) diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.h b/paddle/fluid/distributed/ps/service/ps_local_client.h index 61e9bf7a688f0..d05dab680039b 100644 --- a/paddle/fluid/distributed/ps/service/ps_local_client.h +++ b/paddle/fluid/distributed/ps/service/ps_local_client.h @@ -99,7 +99,7 @@ class PsLocalClient : public PSClient { size_t num); virtual ::std::future Flush(); - // server profilera + // server profiler virtual std::future StartProfiler() { std::promise prom; std::future fut = prom.get_future(); diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index f041143a80836..ff4035a39d30f 100644 --- 
a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -52,7 +52,7 @@ void GraphPyService::add_table_feat_conf(std::string table_name, VLOG(0) << "table_name " << table_name << " mapping id " << idx; VLOG(0) << " feat name " << feat_name << " feat id" << feat_idx; if (static_cast(feat_idx) < table_feat_conf_feat_name[idx].size()) { - // overide + // override table_feat_conf_feat_name[idx][feat_idx] = feat_name; table_feat_conf_feat_dtype[idx][feat_idx] = feat_dtype; table_feat_conf_feat_shape[idx][feat_idx] = feat_shape; @@ -285,7 +285,7 @@ void GraphPyClient::load_edge_file(std::string name, status.wait(); } // if (this->table_id_map.count(name)) { - // VLOG(0) << "loadding data with type " << name << " from " << filepath; + // VLOG(0) << "loading data with type " << name << " from " << filepath; // uint32_t table_id = this->table_id_map[name]; // auto status = // get_ps_client()->Load(table_id, std::string(filepath), params); diff --git a/paddle/fluid/distributed/ps/table/depends/dense.h b/paddle/fluid/distributed/ps/table/depends/dense.h index 14f99d8b41779..146b9de91ac87 100644 --- a/paddle/fluid/distributed/ps/table/depends/dense.h +++ b/paddle/fluid/distributed/ps/table/depends/dense.h @@ -29,7 +29,7 @@ namespace paddle { namespace distributed { // dense optimizer -// TODO(tangwei12) integrate with sparse optimzer later. +// TODO(tangwei12) integrate with sparse optimizer later. class DenseOptimizer { public: DenseOptimizer() {} diff --git a/paddle/fluid/distributed/ps/table/tensor_table.h b/paddle/fluid/distributed/ps/table/tensor_table.h index fa58ddfdd705d..5dd27d7298e5f 100644 --- a/paddle/fluid/distributed/ps/table/tensor_table.h +++ b/paddle/fluid/distributed/ps/table/tensor_table.h @@ -185,7 +185,7 @@ class GlobalStepTable : public DenseTensorTable { } if (main_program_id_ != -1) { - // Run main porgram, if program is used for learning decay + // Run main program, if program is used for learning decay auto main_program_desc = sub_program_->at(main_program_id_); auto main_ctx = executor_->Prepare(main_program_desc, 0); exec_context_ = std::move(main_ctx); diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index bcf822dc0156f..26207a9ad8c9e 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -103,7 +103,7 @@ void testFeatureNodeSerializeFloat64() { ASSERT_LE(eps * eps, 1e-5); } -// void testSingleSampleNeighboor( +// void testSingleSampleNeighbour( // std::shared_ptr& worker_ptr_) { // std::vector> vs; // std::vector> vs1; diff --git a/paddle/fluid/eager/general_grad.h b/paddle/fluid/eager/general_grad.h index 724b6938c28e2..443455619cae6 100644 --- a/paddle/fluid/eager/general_grad.h +++ b/paddle/fluid/eager/general_grad.h @@ -166,10 +166,10 @@ class GeneralGrad { } // TODO(jiabin): May we need some check here. 
} - // Get Graph Info Betweent input target GradNode and outputs, + // Get Graph Info Between input target GradNode and outputs, // record depending_nodes_ void GetGraphInfoBetweenTargets(const std::deque& init_queue) { - VLOG(6) << "Runing In GetGraphInfoBetweenTargets"; + VLOG(6) << "Running In GetGraphInfoBetweenTargets"; // Copy nodes std::deque queue = init_queue; diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 17cb367e72980..fdebfbb1e3771 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -1518,7 +1518,7 @@ class PirGradNodeRunProgram : public egr::GradNodeBase { x.size(), x_grad_values.size())); - // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, + // TODO(dev): Need an elegant way to determine information of grad_tensor, // such as: name, tensor type(DenseTensor or SelectedRows). for (size_t i = 0; i < x.size(); i++) { if (x[i].is_dense_tensor()) { diff --git a/paddle/fluid/framework/details/fetch_barrier_op_handle.h b/paddle/fluid/framework/details/fetch_barrier_op_handle.h index 1364c742890cc..b48b379339500 100644 --- a/paddle/fluid/framework/details/fetch_barrier_op_handle.h +++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.h @@ -39,7 +39,7 @@ namespace paddle { namespace framework { namespace details { -// **NOTE**: fetch_barrier op is special it outputs all recved variables on +// **NOTE**: fetch_barrier op is special it outputs all received variables on // all places if there are multiple places, must init with // multiple dev_ctxes_ !!!! diff --git a/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc b/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc index 9ca4d4f482f08..3a9a2c81889ee 100644 --- a/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc +++ b/paddle/fluid/framework/ir/transfer_layout_elim_pass.cc @@ -156,7 +156,7 @@ bool TransferLayoutElimPass::AllInputIsTransferlayout( for (auto var : op_node->inputs) { // If this input is a 1D persistable tensor,we allow transfer_layout not - // appear before this var, but temporarily diasble this if. + // appear before this var, but temporarily disable this if. if (var->Var()->Persistable() && false) { auto var_dims = scope->FindVar(var->Name())->GetMutable()->dims(); diff --git a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc index 929b20cb27752..3bc5893a162b3 100644 --- a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc @@ -60,7 +60,7 @@ bool DataTransferHelper::apply(const phi::KernelKey& kernel_type_for_var, is_transferred = true; } - // 2. dype transform + // 2. dtype transform if (need_dtype_transform(kernel_type_for_var, expected_kernel_key)) { auto op = TransferDtype( *src_var_name, @@ -168,7 +168,7 @@ void DataTransferHelper::RunAndConstructOpFuncNode( // their implementations are device-related. // For example, consider changing the layout of a gpu tensor // while the gpu kernel of transfer_layout op does not exist. - // To use the cpu kernel, you must insert memcpy_d2h/mepcpy_h2d op + // To use the cpu kernel, you must insert memcpy_d2h/memcpy_h2d op // in addition. But such operation should not be done here. // Maybe in future we will support this. 
} @@ -357,7 +357,7 @@ std::shared_ptr TransferDtype(const std::string& var_name, AttributeMap attr_map; attr_map["in_dtype"] = static_cast(in_dtype); attr_map["out_dtype"] = static_cast(out_dtype); - // NOTE(Aurelius84): In whice case use_mkldnn = true? + // NOTE(Aurelius84): In which case use_mkldnn = true? attr_map["use_mkldnn"] = false; // 3. Create transfer_dtype_op @@ -532,7 +532,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, // for some situation like InferShape(). // In this situation We cannot skip Var analysis, as // MKL-DNN shape of Var may differ from kNHWC Var - // In such situation corressponding resized Var + // In such situation corresponding resized Var // has to be created and registered if ((tensor_in->layout() == DataLayout::ONEDNN) && (var->IsType() == true) && @@ -736,7 +736,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, } if (transfered) { - // NOTE(zhiqiu): UPDATE the corresponding OeratorBase to make it consistent + // NOTE(zhiqiu): UPDATE the corresponding OperatorBase to make it consistent // with instruction. op_base->Inputs() = new_ins; op_base->Outputs() = new_outs; diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 49b2ed3b73f96..df82aedfcec5f 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -348,7 +348,7 @@ static constexpr char kFetchVarName[] = "fetch"; // static_ref_ is the numer of last live ops calculated to statically after // `build` the Instructions. dynamic_ref_ is the runtime version ref which will -// be decreased by one dynamiclly after the execution of an op (in last ops +// be decreased by one dynamically after the execution of an op (in last ops // list). var_ is the related variable // The dynamic_ref_ is initialized to static_ref_ first, and is decreased to 1 @@ -379,7 +379,7 @@ class VarRefInfo { // static_dep_ is the numer of dependencies (ops that must run before it) of // each op which is calculated to statically. static_dep_ is the runtime -// version dep which will be decreased by one dynamiclly after the execution of +// version dep which will be decreased by one dynamically after the execution of // one dependency op. // The dynamic_dep_ is initialized to static_dep_ first, and is decreased to 1 diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index fcb20b2a1109a..5192e8c773888 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -251,7 +251,7 @@ void VarBase::ClearGradient(bool set_to_zero) { #endif } } - // TODO(zhouwei): It's better to free memory of grad by grad_t->claer. + // TODO(zhouwei): It's better to free memory of grad by grad_t->clear. // But will have some bug on mac CPU of yolov3 model, why? // After fix this bug, function SetIsEmpty() isn't need grad_var_->SharedVar()->SetIsEmpty(true); diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index 87748ca5d102e..349d2f5b5eb36 100644 --- a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -337,7 +337,7 @@ class TransposeOpTransformer auto desired_layout = LayoutAutoTune::Instance().GetDesiredLayout(); if (var_layout == desired_layout && desired_layout == DataLayout::NHWC) { auto axis = PADDLE_GET_CONST(std::vector, (*attrs)["axis"]); - // NHWC->NCHW, permutaion will be set as follows. 
+ // NHWC->NCHW, permutation will be set as follows. std::vector perm = {0, 3, 1, 2}; // fuse the transpose Ops by transforming axis. std::vector fusion_axis = { diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index e0fd05562a413..2d15ed51e58a1 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ b/paddle/fluid/imperative/parallel_context.h @@ -65,7 +65,7 @@ class ParallelContext { // if CPU, should do nothing. virtual void WaitComm(int ring_id) = 0; - // synchorize compute stream + // synchronize compute stream virtual void SynchronizeCompute() = 0; inline int GetNRings() const { return strategy_.nrings_; } diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 3fd37d5ec3674..0a5d44a1e1e57 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -325,7 +325,7 @@ static void FillConstantLike(const VariableWrapper &ref_var, auto *dst_tensor = dst_var->MutableVar()->GetMutable(); auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); dst_tensor->Resize(ref_tensor.dims()); - // TOOD(jiabin): Ugly fix here we have fwd_data_type_ and data_type, since in + // TODO(jiabin): Ugly fix here we have fwd_data_type_ and data_type, since in // grad mission // we can't get data_type_ directly. We need to check if we can only use // default data_type for now. diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index b52a40d29ff26..33c37042aac43 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -155,7 +155,7 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// /// \brief Get the Output Tensor object /// - /// \param[in] name otuput name + /// \param[in] name output name /// \return output tensor /// std::unique_ptr GetOutputTensor( diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 2828fd65a6ee7..8c66b66363603 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -167,7 +167,7 @@ struct PD_INFER_DECL PaddleTensor { /// to device, /// eliminating additional CPU copy. ZeroCopyTensor is only used in the /// AnalysisPredictor. -/// It is obtained through PaddlePredictor::GetinputTensor() +/// It is obtained through PaddlePredictor::GetInputTensor() /// and PaddlePredictor::GetOutputTensor() interface. class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { @@ -467,7 +467,7 @@ PD_INFER_DECL std::shared_ptr MakeCipher( } // namespace paddle -// forward declation +// forward declaration using cudaStream_t = struct CUstream_st*; using hipStream_t = struct ihipStream_t*; diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 22cd023c1fee2..0817f0a1b9919 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -78,7 +78,7 @@ enum class DataLayout { kUNK = -1, kAny, kNHWC, kNCHW }; /// to device, /// eliminating additional CPU copy. Tensor is only used in the /// AnalysisPredictor. -/// It is obtained through PaddlePredictor::GetinputTensor() +/// It is obtained through PaddlePredictor::GetInputTensor() /// and PaddlePredictor::GetOutputTensor() interface. 
class PD_INFER_DECL Tensor { public: diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index 92d5d0e3a6308..427e9b95ac499 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -46,7 +46,7 @@ PADDLE_CAPI_EXPORT extern __pd_give PD_Config* PD_ConfigCreate(); /// PADDLE_CAPI_EXPORT extern void PD_ConfigDestroy(__pd_take PD_Config* pd_config); /// -/// \brief Set the combined model with two specific pathes for program and +/// \brief Set the combined model with two specific paths for program and /// parameters. /// /// \param[in] pd_config config @@ -169,7 +169,7 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigEnableONNXRuntime( PADDLE_CAPI_EXPORT extern void PD_ConfigDisableONNXRuntime( __pd_keep PD_Config* pd_config); /// -/// \brief A boolean state telling whether the ONNXRutnime is turned on. +/// \brief A boolean state telling whether the ONNXRuntime is turned on. /// /// \return Whether the ONNXRuntime is turned on. /// @@ -238,11 +238,11 @@ PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId( PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId( __pd_keep PD_Config* pd_config); /// -/// \brief Turn on custome device. +/// \brief Turn on custom device. /// /// \param[in] pd_config config /// \param[in] device_type device type -/// \param[in] device_id device_id the custome device card to use. +/// \param[in] device_id device_id the custom device card to use. /// PADDLE_CAPI_EXPORT extern void PD_ConfigEnableCustomDevice( __pd_keep PD_Config* pd_config, char* device_type, int32_t device_id); @@ -306,13 +306,13 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigCudnnEnabled( /// If turned off, the AnalysisConfig will act just like a NativeConfig. /// /// \param[in] pd_config config -/// \param[in] x Whether the ir graph optimization is actived. +/// \param[in] x Whether the ir graph optimization is activated. /// PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrOptim( __pd_keep PD_Config* pd_config, PD_Bool x); /// /// \brief A boolean state telling whether the ir graph optimization is -/// actived. +/// activated. /// /// \param[in] pd_config config /// \return Whether to use ir graph optimization. @@ -321,7 +321,7 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim( __pd_keep PD_Config* pd_config); /// /// \brief Turn on the TensorRT engine. -/// The TensorRT engine will accelerate some subgraphes in the original Fluid +/// The TensorRT engine will accelerate some subgraphs in the original Fluid /// computation graph. In some models such as resnet50, GoogleNet and so on, /// it gains significant performance acceleration. /// @@ -330,7 +330,7 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim( /// workspace. /// \param[in] max_batch_size The maximum batch size of this prediction task, /// better set as small as possible for less performance loss. -/// \param[in] min_subgrpah_size The minimum TensorRT subgraph size needed, if a +/// \param[in] min_subgraph_size The minimum TensorRT subgraph size needed, if a /// subgraph is smaller than this, it will not be transferred to TensorRT /// engine. /// \param[in] precision The precision used in TensorRT. @@ -490,7 +490,7 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtDlaEnabled( /// \brief Turn on the usage of Lite sub-graph engine. /// /// \param[in] pd_config config -/// \param[in] precision Precion used in Lite sub-graph engine. +/// \param[in] precision Precision used in Lite sub-graph engine. 
/// \param[in] zero_copy whether use zero copy. /// \param[in] passes_filter_num The number of passes used in Lite sub-graph /// engine. diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.h b/paddle/fluid/inference/capi_exp/pd_predictor.h index a35defb910070..f17d78f9155af 100644 --- a/paddle/fluid/inference/capi_exp/pd_predictor.h +++ b/paddle/fluid/inference/capi_exp/pd_predictor.h @@ -40,7 +40,7 @@ extern "C" { /// \brief Create a new Predictor /// /// \param[in] Config config -/// \return new predicor. +/// \return new predictor. /// PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorCreate( __pd_take PD_Config* pd_config); diff --git a/paddle/fluid/ir_adaptor/translator/op_compat_info.h b/paddle/fluid/ir_adaptor/translator/op_compat_info.h index fa13415ffdfd6..bcd56ac07da92 100644 --- a/paddle/fluid/ir_adaptor/translator/op_compat_info.h +++ b/paddle/fluid/ir_adaptor/translator/op_compat_info.h @@ -126,7 +126,7 @@ class OpNameNormalizer { return ret.value(); } } else if (is_grad_op && !is_grad_arg) { - // backwward op using forward args: like trace_grad using forward input + // backward op using forward args: like trace_grad using forward input size_t type_pos = op_type.find(kPhiGradSuffix); if (auto ret = GetDirectMapping(op_type.substr(0, type_pos), arg_name)) { VLOG(10) << "[" << op_type << "] found " << ret.value(); diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index 9295e5c643e5f..3acb0f4fc0718 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -556,7 +556,7 @@ std::vector OpTranscriber::GenerateOperationInput( auto defining_info = (*param_map)[legacy_input_vars[0]]; op_inputs.push_back(defining_info.value); - // if src type is Vector , need an additional `CombineOp` to + // if src type is Vector , need an additional `CombineOp` to // assemble them. } else { auto* combine_op = InsertCombineOperationForTarget( @@ -654,7 +654,7 @@ OpTranscriber::GenerateOperationOutput(pir::IrContext* ctx, arg_to_idx[var_name] = {cur_output_idx, 0}; op_output_types.push_back(translated_var_type); - // if src type is Vector + // if src type is Vector } else { VLOG(10) << "[output translating]" << "[" << op_desc.Type() << "]" << info.name << " :" diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.h b/paddle/fluid/ir_adaptor/translator/program_translator.h index f2c4096113a8e..c335c36670c6d 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.h +++ b/paddle/fluid/ir_adaptor/translator/program_translator.h @@ -74,7 +74,7 @@ class TranslationContext { Container container_; TranslationContext* parent_ = nullptr; std::vector> - sons_; // used to seperate different block + sons_; // used to separate different block }; class ProgramTranslator { @@ -100,11 +100,11 @@ class ProgramTranslator { std::unordered_map parameter_name_mappings_; std::unordered_set parameter_visited_; - /// In the legacy program desc, there are two special named varibales: + /// In the legacy program desc, there are two special named variables: /// 1. "feed", the input variable of feed op /// 2. 
"fetch", the output variable of fetch op /// However, new feed has no input and new fetch has no output - /// So we don't handle these two vairables when + /// So we don't handle these two variables when /// `Get/SetParameterFromSingleBlock` static const std::unordered_set no_cast_var_names; diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 9cd2c89eda866..f82ec0cbcdf1d 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -251,7 +251,7 @@ static phi::Backend DeriveBackend(const std::string& op, const OpYamlInfoParser* op_info_parser, phi::Backend kernel_backend, size_t input_index) { - // NOTE: Parameters are initilizered on executor place defined + // NOTE: Parameters are initialized on executor place defined if ((op.compare(pir::SetParameterOp::name()) == 0 || op.compare(pir::ShadowOutputOp::name()) == 0) && place.GetType() == phi::AllocationType::GPU) { @@ -958,7 +958,7 @@ phi::KernelKey GetKernelKey( phi::DataType kernel_dtype = phi::DataType::UNDEFINED; if (op_info_parser != nullptr) { - // only suppurt non vector input for now + // only support non vector input for now int tensor_input_number = static_cast(op_info_parser->InputTensorNumber()); VLOG(8) << "Begin to infer kernel key from op_info_parser(defined by yaml " @@ -1018,7 +1018,7 @@ phi::KernelKey GetKernelKey( // Because we can't make sure the place when build data op // and the output place of data op is undefined. It means we // don't know how to select the kernel in the next of op that - // uses data op outout as inputs. So, we need set kernel backend + // uses data op output as inputs. So, we need set kernel backend // manually. auto op_res = input_tmp.dyn_cast(); if (!op_res) { @@ -2089,7 +2089,7 @@ std::vector BuildInputs( new_in, out_type, in_place, out_place, kernel_key, block); } } else if (new_in_type.isa()) { - // [ todo need update here, support combine data transfomer] + // [ todo need update here, support combine data transformer] // deal with pre combine op auto pre_define_op = cur_in.defining_op(); if (pre_define_op->isa<::pir::CombineOp>()) { diff --git a/paddle/fluid/pir/transforms/transform_general_functions.h b/paddle/fluid/pir/transforms/transform_general_functions.h index cec04f3712990..8b9ffdd8cf477 100644 --- a/paddle/fluid/pir/transforms/transform_general_functions.h +++ b/paddle/fluid/pir/transforms/transform_general_functions.h @@ -24,7 +24,7 @@ namespace pir { /** - * @brief Get the name of pararmeter from a value. + * @brief Get the name of parameter from a value. * * @note The value must be a output of a ParameterOp or a ConstantTensorOp. * diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 2094fef07a873..9dc8897a10a41 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -2208,7 +2208,7 @@ PyDoc_STRVAR(tensor_method_indices__doc__, Returns the indices of non zero elements in input SparseCooTensor. Returns: - DenseTesnor + DenseTensor Examples: @@ -2252,7 +2252,7 @@ PyDoc_STRVAR(tensor_method_values__doc__, Returns the values of non zero elements in input SparseCooTensor. Returns: - DenseTesnor + DenseTensor Examples: @@ -2305,7 +2305,7 @@ PyDoc_STRVAR(tensor_method_crows__doc__, Returns the compressed row index of non zero elements in input SparseCsrTensor. 
Returns: - DenseTesnor + DenseTensor Examples: @@ -2349,7 +2349,7 @@ PyDoc_STRVAR(tensor_method_cols__doc__, Returns the column index of non zero elements in input SparseCsrTensor. Returns: - DenseTesnor + DenseTensor Examples: diff --git a/paddle/fluid/pybind/pir.cc b/paddle/fluid/pybind/pir.cc index 3a0de137173a7..723ff501450c0 100644 --- a/paddle/fluid/pybind/pir.cc +++ b/paddle/fluid/pybind/pir.cc @@ -1138,7 +1138,7 @@ SplitedResult SplitForwardBackward( }); auto &forward_value_map = forward_mapper.GetMutableMap(); - // backward program construc. + // backward program construct. // Step1. insert data op for inputs_values and middle_values pir::IrMapping backward_mapper; auto &backward_value_map = backward_mapper.GetMutableMap(); @@ -1160,7 +1160,7 @@ SplitedResult SplitForwardBackward( if (v.impl() == nullptr) { return; } - // NOTE(Aurelius84): we should skip insert ShadowOutputOp repeatly by + // NOTE(Aurelius84): we should skip insert ShadowOutputOp repeatedly by // calling SplitForwardBackward multi-times. std::string shadow_output_name = std::string("output_") + std::to_string(counter); diff --git a/paddle/phi/api/yaml/generator/backward_api_gen.py b/paddle/phi/api/yaml/generator/backward_api_gen.py index e2c58a30dd9a7..2201d74093b90 100644 --- a/paddle/phi/api/yaml/generator/backward_api_gen.py +++ b/paddle/phi/api/yaml/generator/backward_api_gen.py @@ -345,7 +345,7 @@ def generate_backward_api( source_include(include_header_file, include_fw_header_file) ) source_file.write(namespace[0]) - # not all fused ops supoort dygraph + # not all fused ops support dygraph if is_fused_backward_yaml is True: new_bw_apis = [ bw_api diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py index 70c12aa9f8d42..0d2d4d16a2b63 100644 --- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py @@ -493,7 +493,7 @@ def generate_backward_api( source_include(include_header_file, include_fw_header_file) ) source_file.write(namespace[0]) - # not all fused ops supoort dygraph + # not all fused ops support dygraph if is_fused_backward_yaml is True: new_bw_apis = [ bw_api diff --git a/paddle/phi/core/distributed/auto_parallel/dist_attr.h b/paddle/phi/core/distributed/auto_parallel/dist_attr.h index e4016b9f65cdc..a3e05c9fcdacb 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_attr.h +++ b/paddle/phi/core/distributed/auto_parallel/dist_attr.h @@ -206,7 +206,7 @@ class TEST_API TensorDistAttr { std::map annotated_; int64_t chunk_id_{0}; // partial map would be small (less than mesh.size) - // iterate operation (copy and comparision) would more frequency than random + // iterate operation (copy and comparison) would more frequency than random // element access. paddle::flat_hash_map partial_status_; }; diff --git a/paddle/phi/infermeta/spmd_rules/concat.cc b/paddle/phi/infermeta/spmd_rules/concat.cc index 666e5a8bdea3c..4e3c2ead16983 100644 --- a/paddle/phi/infermeta/spmd_rules/concat.cc +++ b/paddle/phi/infermeta/spmd_rules/concat.cc @@ -74,7 +74,7 @@ SpmdInfo ConcatInferSpmd(const std::vector& x, int axis) { }); auto non_empty_index = non_empty_iter - tensor_shapes.begin(); int64_t ndim = static_cast(tensor_shapes[non_empty_index].size()); - // normlize dim + // normalize dim auto dim = axis < 0 ? 
ndim + axis : axis; std::vector input_attrs; std::transform( @@ -138,7 +138,7 @@ SpmdInfo ConcatGradInferSpmdDynamic(const std::vector& x, auto non_empty_index = non_empty_iter - tensor_shapes.begin(); int64_t ndim = static_cast(tensor_shapes[non_empty_index].size()); auto dim = axis.to(); - // normlize dim + // normalize dim dim = dim < 0 ? ndim + dim : dim; std::vector input_attrs; std::transform( diff --git a/paddle/phi/infermeta/spmd_rules/cross_entropy_with_softmax.cc b/paddle/phi/infermeta/spmd_rules/cross_entropy_with_softmax.cc index 95ce5a6ecf7ff..21dc00ac1fc18 100644 --- a/paddle/phi/infermeta/spmd_rules/cross_entropy_with_softmax.cc +++ b/paddle/phi/infermeta/spmd_rules/cross_entropy_with_softmax.cc @@ -142,7 +142,7 @@ SpmdInfo CrossEntropyWithSoftmaxInferSpmdBase(const DistMetaTensor& x, &softmax_out_axes_dst, support_shard_softmax_dim); - // Step2: Sharding Propogation + // Step2: Sharding Propagation // Step2.1: merge input shardings std::unordered_map axis_to_dim_map = ShardingMergeForTensors({{x_axes_src, x_dims_mapping_src}, @@ -189,8 +189,8 @@ SpmdInfo CrossEntropyWithSoftmaxInferSpmdBase(const DistMetaTensor& x, // todo if softmax_normalize axis is sharded, notify downstream phi api to // select c_softmax_with_entropy_kernel. - // according to the phi api implemetation, the softmax_out tensor will alway - // be genereated not matter the value of use_softmax. + // according to the phi api implementation, the softmax_out tensor will alway + // be generated not matter the value of use_softmax. return {{x_dist_attr_dst, label_dist_attr_dst}, {softmax_out_dist_attr_dst, loss_dist_attr_dst}}; } @@ -290,7 +290,7 @@ SpmdInfo CrossEntropyWithSoftmaxInferSpmdReverse( &softmax_out_axes_dst, true); - // Step2: Sharding Propogation + // Step2: Sharding Propagation // Step2.1 merge output dims mappings std::unordered_map axis_to_dim_map = ShardingMergeForTensors({{loss_axes, loss_dims_mapping_src}, @@ -363,8 +363,8 @@ SpmdInfo CrossEntropyWithSoftmaxInferSpmdReverse( << str_join(x_dims_mapping) << "]\nLabel dims_mapping: [" << str_join(label_dims_mapping) << "]\n\n"; - // according to the phi api implemetation, the softmax_out tensor will alway - // be genereated not matter the value of use_softmax. + // according to the phi api implementation, the softmax_out tensor will alway + // be generated not matter the value of use_softmax. 
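  // Descriptive note: the SpmdInfo constructed below pairs the resulting
  // input dist attrs {x, label} with the output dist attrs
  // {softmax_out, loss}.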
return {{x_dist_attr, label_dist_attr}, {s_out_dist_attr_dst, loss_dist_attr_dst}}; } diff --git a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc index 3e0e4c7a3d7a5..a48d05b8d783e 100644 --- a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc @@ -1215,7 +1215,7 @@ void RnnGradFunc(const CPUContext& dev_ctx, gate_num_tmp); } - // calcluate the dropout gradient for the layer_x_grad_holder + // calculate the dropout gradient for the layer_x_grad_holder // dropout_state save in the forward process if (i > 0) { if ((!is_test) && (dropout_prob != 0)) { diff --git a/paddle/phi/kernels/gpu/mode_grad_kernel.cu b/paddle/phi/kernels/gpu/mode_grad_kernel.cu index 20eb2c29882f7..1d79f7756ae3d 100644 --- a/paddle/phi/kernels/gpu/mode_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/mode_grad_kernel.cu @@ -73,7 +73,7 @@ void ModeGradKernel(const Context& dev_ctx, int pre, n, post; funcs::GetDims(in_dims, axis, &pre, &n, &post); - // calcluate the block and grid num + // calculate the block and grid num int block_size = funcs::ComputeBlockSize(post); int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); diff --git a/paddle/phi/kernels/gpu/mode_kernel.cu b/paddle/phi/kernels/gpu/mode_kernel.cu index 653bd241e72b7..16ed4b9349019 100644 --- a/paddle/phi/kernels/gpu/mode_kernel.cu +++ b/paddle/phi/kernels/gpu/mode_kernel.cu @@ -38,7 +38,7 @@ void ModeKernel(const Context& dev_ctx, errors::InvalidArgument( "The dims of Input(X) should be greater than 0.")); } - // calcluate the real axis + // calculate the real axis if (axis < 0) axis += in_dims.size(); auto out_dims = out->dims(); diff --git a/paddle/phi/kernels/gpu/top_k_grad_kernel.cu b/paddle/phi/kernels/gpu/top_k_grad_kernel.cu index 6c2e880e9a9ef..e34bc5f9f6e5a 100644 --- a/paddle/phi/kernels/gpu/top_k_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_grad_kernel.cu @@ -55,7 +55,7 @@ void TopkGradKernel(const Context& dev_ctx, int pre, n, post; phi::funcs::GetDims(in_dims, axis, &pre, &n, &post); - // calcluate the block and grid num + // calculate the block and grid num auto ComputeBlockSize = [](int col) { if (col > 512) return 1024; diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index 354f104e48681..1d93ef1a2790f 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -73,7 +73,7 @@ void TopkKernel(const Context& dev_ctx, phi::funcs::set_constant(dev_ctx, indices, static_cast(0)); return; } - // calcluate the real axis + // calculate the real axis if (axis < 0) axis += in_dims.size(); int k = k_scalar.to(); @@ -255,7 +255,7 @@ void TopkKernel(const Context& dev_ctx, int ndims = trans.size(); funcs::TransCompute( ndims, dev_ctx, *input, &trans_input, trans); - // third step, calcluate the topk + // third step, calculate the topk // allocate the tmp cuda memory for the tmp result DenseTensor trans_ind; DenseTensor trans_out; From 9222271cc1a7261d943b07b8e1952245f48a49d4 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 14:25:40 +0800 Subject: [PATCH 23/82] Update multi_threading_test.cc (#61998) --- paddle/cinn/utils/multi_threading_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/cinn/utils/multi_threading_test.cc b/paddle/cinn/utils/multi_threading_test.cc index abd429a4b1677..bd081fea2b56c 100644 --- a/paddle/cinn/utils/multi_threading_test.cc +++ 
b/paddle/cinn/utils/multi_threading_test.cc @@ -34,19 +34,19 @@ TEST(JobDispatcher, SequenceDispatcher) { TEST(parallel_run, Basic) { std::vector results(100, -1); - auto woker_fn = [&results](int index) { + auto worker_fn = [&results](int index) { CHECK_LT(index, results.size()) << "index invalid"; results[index] = index; }; // check process every index in the extent of [0, 100) with step 1 - parallel_run(woker_fn, SequenceDispatcher(0, 100), 2); + parallel_run(worker_fn, SequenceDispatcher(0, 100), 2); for (int i = 0; i < 100; ++i) { ASSERT_EQ(results[i], i); } // check only indexes in the extent of [0, 100) with step 3 are processed results.assign(100, -1); - parallel_run(woker_fn, SequenceDispatcher(0, 100, 3), 3); + parallel_run(worker_fn, SequenceDispatcher(0, 100, 3), 3); for (int i = 0; i < 100; ++i) { if (i % 3 == 0) { ASSERT_EQ(results[i], i); From 5d79cdada70f2d6c3cddaed77b50fd89bcd29f05 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 14:28:34 +0800 Subject: [PATCH 24/82] Add WITH_GPU check to copy FLASHATTN_LIBRARIES (#61961) * Fix * ci * ci --- python/setup.py.in | 7 ++++--- setup.py | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/python/setup.py.in b/python/setup.py.in index f3d80419ebeac..520a9f7f7a56c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -685,9 +685,10 @@ if not sys.platform.startswith("linux"): package_data['paddle.libs']+=[os.path.basename('${GNU_RT_LIB_2}')] shutil.copy('${GNU_RT_LIB_2}', libs_path) -if len('${FLASHATTN_LIBRARIES}') > 1: - package_data['paddle.libs']+=[os.path.basename('${FLASHATTN_LIBRARIES}')] - shutil.copy('${FLASHATTN_LIBRARIES}', libs_path) +if '${WITH_GPU}' == 'ON': + if len('${FLASHATTN_LIBRARIES}') > 1: + package_data['paddle.libs']+=[os.path.basename('${FLASHATTN_LIBRARIES}')] + shutil.copy('${FLASHATTN_LIBRARIES}', libs_path) if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) diff --git a/setup.py b/setup.py index 18dc54b6fd140..350c62bdf6301 100644 --- a/setup.py +++ b/setup.py @@ -1064,11 +1064,12 @@ def get_package_data_and_package_dir(): shutil.copy(env_dict.get("OPENBLAS_LIB") + '.0', libs_path) package_data['paddle.libs'] += ['libopenblas.so.0'] - if len(env_dict.get("FLASHATTN_LIBRARIES", "")) > 1: - package_data['paddle.libs'] += [ - os.path.basename(env_dict.get("FLASHATTN_LIBRARIES")) - ] - shutil.copy(env_dict.get("FLASHATTN_LIBRARIES"), libs_path) + if env_dict.get("WITH_GPU") == 'ON': + if len(env_dict.get("FLASHATTN_LIBRARIES", "")) > 1: + package_data['paddle.libs'] += [ + os.path.basename(env_dict.get("FLASHATTN_LIBRARIES")) + ] + shutil.copy(env_dict.get("FLASHATTN_LIBRARIES"), libs_path) if env_dict.get("WITH_LITE") == 'ON': shutil.copy(env_dict.get("LITE_SHARED_LIB"), libs_path) package_data['paddle.libs'] += [ From 3f00547fb9bbe9779e268e52e65f85b0374b5e97 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 14:29:40 +0800 Subject: [PATCH 25/82] Fix some typos (dst_strategys, etc.) 
(#62003) --- .../auto_parallel/static/auto_align_tool.py | 12 ++++++------ .../auto_parallel/static/completion.py | 2 +- .../auto_parallel/static/converter.py | 6 +++--- .../auto_parallel/static/cost/base_cost.py | 2 +- .../auto_parallel/static/cost_model.py | 10 +++++----- .../distributed/auto_parallel/static/helper.py | 4 ++-- .../auto_parallel/static/parallelizer_v2.py | 2 +- .../fleet/base/distributed_strategy.py | 2 +- python/paddle/distributed/fleet/fleet.py | 4 ++-- python/paddle/distributed/fleet/launch.py | 2 +- .../paddle/distributed/fleet/launch_utils.py | 14 +++++++------- .../hybrid_parallel_gradscaler.py | 2 +- .../fleet/meta_optimizers/sharding/utils.py | 2 +- .../sharding/weight_decay_helper.py | 2 +- .../meta_optimizers/sharding_optimizer.py | 18 +++++++++--------- .../fleet/meta_parallel/pipeline_parallel.py | 4 ++-- .../sharding/group_sharded_utils.py | 2 +- .../distributed/fleet/recompute/recompute.py | 16 ++++++++-------- .../fleet/recompute/recompute_hybrid.py | 4 ++-- .../fleet/runtime/parameter_server_runtime.py | 14 +++++++------- .../distributed/fleet/runtime/the_one_ps.py | 8 ++++---- python/paddle/distributed/fleet/scaler.py | 2 +- python/paddle/distributed/fleet/utils/fs.py | 8 ++++---- .../fleet/utils/hybrid_parallel_inference.py | 4 ++-- .../fleet/utils/hybrid_parallel_util.py | 4 ++-- .../fleet/utils/mix_precision_utils.py | 2 +- .../fleet/utils/tensor_fusion_helper.py | 2 +- .../fleet/utils/tensor_parallel_utils.py | 8 ++++---- .../distributed/launch/context/__init__.py | 4 ++-- .../launch/controllers/controller.py | 2 +- python/paddle/distributed/models/moe/utils.py | 2 +- ...auto_parallel_data_parallel_optimization.py | 2 +- .../passes/auto_parallel_recompute.py | 16 ++++++++-------- .../passes/auto_parallel_sharding.py | 4 ++-- .../distributed/passes/ps_trainer_pass.py | 4 ++-- .../ps/utils/collective_transpiler.py | 4 ++-- 36 files changed, 100 insertions(+), 100 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/auto_align_tool.py b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py index d7d98f75d80f1..b1ced07b8b24e 100644 --- a/python/paddle/distributed/auto_parallel/static/auto_align_tool.py +++ b/python/paddle/distributed/auto_parallel/static/auto_align_tool.py @@ -352,13 +352,13 @@ def convert_src_tensor_2_dst_tensor(vars_list, src_attr_map, dst_attr_map): if src_attr_map is None or len(src_attr_map) == 0: return vars_list[0] - dst_strategys = {} - src_strategys = {} + dst_strategies = {} + src_strategies = {} tensors_dict = {} convert_tensor_dict = None for var_name in src_attr_map.keys(): - assert var_name not in dst_strategys + assert var_name not in dst_strategies dist_vars = [] for vars in vars_list: if var_name in vars.keys(): @@ -367,13 +367,13 @@ def convert_src_tensor_2_dst_tensor(vars_list, src_attr_map, dst_attr_map): continue if var_name in dst_attr_map and var_name in src_attr_map: - dst_strategys[var_name] = copy.deepcopy(dst_attr_map[var_name]) - src_strategys[var_name] = copy.deepcopy(src_attr_map[var_name]) + dst_strategies[var_name] = copy.deepcopy(dst_attr_map[var_name]) + src_strategies[var_name] = copy.deepcopy(src_attr_map[var_name]) tensors_dict[var_name] = dist_vars if src_attr_map == dst_attr_map: return tensors_dict - converter = Converter(tensors_dict, src_strategys, dst_strategys) + converter = Converter(tensors_dict, src_strategies, dst_strategies) convert_tensor_dict = converter.convert() return convert_tensor_dict diff --git 
a/python/paddle/distributed/auto_parallel/static/completion.py b/python/paddle/distributed/auto_parallel/static/completion.py index a671582a3293f..900b90a0f6496 100644 --- a/python/paddle/distributed/auto_parallel/static/completion.py +++ b/python/paddle/distributed/auto_parallel/static/completion.py @@ -202,7 +202,7 @@ def _update_op_dims_mapping_and_distoperatorimpl( updated = dist_op_container.update_dims_mapping(dist_op) changed = updated or changed - # TODO(ljz) remove the below code once we introduce general reshard to replace specifc distopimpls + # TODO(ljz) remove the below code once we introduce general reshard to replace specific distopimpls reverted = dist_op_container.mapping_to_dist_operator_impl( dist_op, original_op_dist_attr ) diff --git a/python/paddle/distributed/auto_parallel/static/converter.py b/python/paddle/distributed/auto_parallel/static/converter.py index c7cd4e32d6e42..241a83aaf4f5d 100644 --- a/python/paddle/distributed/auto_parallel/static/converter.py +++ b/python/paddle/distributed/auto_parallel/static/converter.py @@ -105,9 +105,9 @@ def convert(self, strict=True): >>> import numpy as np >>> from paddle.distributed.auto_parallel.static.converter import Converter >>> complete_tensors = np.arange(4).reshape([2, 2]) - >>> partitial_tensors = np.split(complete_tensors, 2, axis=0) + >>> partial_tensors = np.split(complete_tensors, 2, axis=0) >>> name = "tmp_0" - >>> tensors_dict = {name: partitial_tensors} + >>> tensors_dict = {name: partial_tensors} >>> strategy_1 = { ... name: { ... "process_shape": [2], @@ -345,7 +345,7 @@ def slice_with_dist_attr(tensor, dist_attr): @staticmethod def merge(partition_tensor_list, tensor, partition_index, complete_shape): """ - Merge partitial tensors to a complete. + Merge partial tensors to a complete. Returns: None diff --git a/python/paddle/distributed/auto_parallel/static/cost/base_cost.py b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py index 957e5dba46bf0..495cff26844d7 100644 --- a/python/paddle/distributed/auto_parallel/static/cost/base_cost.py +++ b/python/paddle/distributed/auto_parallel/static/cost/base_cost.py @@ -784,7 +784,7 @@ def comm_count(self): shape = None if self.op is not None: vars = self.op.block.vars - # NOTE: The tensor communicated input_name is "X" in default. Otherwise, this function should be overrided + # NOTE: The tensor communicated input_name is "X" in default. 
Otherwise, this function should be overridden try: var_name = self.op.input("X")[0] except: diff --git a/python/paddle/distributed/auto_parallel/static/cost_model.py b/python/paddle/distributed/auto_parallel/static/cost_model.py index 55690e4f3de8f..ad0f353815772 100644 --- a/python/paddle/distributed/auto_parallel/static/cost_model.py +++ b/python/paddle/distributed/auto_parallel/static/cost_model.py @@ -98,18 +98,18 @@ def init_comm_cost(self, cluster=None): # should get from `cluster` BANDWIDTH = 32 * 1024 / 1000 # MB/ms, V100 PCIe num_ranks = len(self.ranks) - comm_volumn = np.prod(self.input_shape) * 4 + comm_volume = np.prod(self.input_shape) * 4 if 'allreduce' in self.comm_type: - self._cost = comm_volumn / ( + self._cost = comm_volume / ( BANDWIDTH * num_ranks / (2 * (num_ranks - 1)) ) elif 'gather' in self.comm_type: - self._cost = comm_volumn / (BANDWIDTH * num_ranks / (num_ranks - 1)) + self._cost = comm_volume / (BANDWIDTH * num_ranks / (num_ranks - 1)) elif 'broadcast' in self.comm_type: - self._cost = comm_volumn / BANDWIDTH + self._cost = comm_volume / BANDWIDTH elif 'send' in self.comm_type or 'recv' in self.comm_type: - self._cost = comm_volumn / BANDWIDTH + self._cost = comm_volume / BANDWIDTH else: self._cost = 0 diff --git a/python/paddle/distributed/auto_parallel/static/helper.py b/python/paddle/distributed/auto_parallel/static/helper.py index c730a68e6ae49..e7bd7553d5094 100644 --- a/python/paddle/distributed/auto_parallel/static/helper.py +++ b/python/paddle/distributed/auto_parallel/static/helper.py @@ -211,8 +211,8 @@ class ProgramHelper: def __init__(self, layer, loss_func, metrics, inputs_spec, labels_spec): # original model config information - # TODO(Aurelius84): Implenet append_backward and optimizer in ProxyLayer - # after distribute engine satisify basic condition. + # TODO(Aurelius84): Implement append_backward and optimizer in ProxyLayer + # after distribute engine satisfy basic condition. self.proxy_layer = ProxyLayer(layer, loss_func, metrics) self.inputs_spec = inputs_spec self.labels_spec = labels_spec diff --git a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py index fb924288988d1..27a13fd1d9107 100644 --- a/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/static/parallelizer_v2.py @@ -231,7 +231,7 @@ def _generate_backward( # NOTE(zhaoyinglia): # Guarantee the order of params_grads is same between dynamic mode and static mode # by making parameter_list equal to model.parameters(), - # because the order affact the result of ClipGradByGLobalNorm. + # because the order affect the result of ClipGradByGLobalNorm. # If parameter_list is not None, the order of params_grads is same with parameter_list. # If parameter_list is None, params_grads will be as prog.global_block().all_parameters(). with program_guard(main_program, startup_program): diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 2c3c4728d4f2e..62b79302f32dd 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -1498,7 +1498,7 @@ def sharding_configs(self): This configuration will affect the communication speed in sharding training, and should be an empirical value decided by your model size and network topology. 
Only enable when sharding_segment_strategy = segment_broadcast_MB. Default is 32.0 . - segment_anchors(list): list of anchors used to segment the program, which allows a finner control of program segmentation. + segment_anchors(list): list of anchors used to segment the program, which allows a finer control of program segmentation. this strategy is experimental by now. Only enable when sharding_segment_strategy = segment_anchors. sharding_degree(int, optional): specific the number of gpus within each sharding parallelism group; and sharding will be turn off if sharding_degree=1. Default is 8. diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index 81547d24878d5..c9ea552815a83 100755 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -1194,7 +1194,7 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): dirname(str, optional): The saving directory path. When you need to save the parameter to the memory, set it to None. - main_program(Program, optional): The program whose persistbale tensors will + main_program(Program, optional): The program whose persistable tensors will be saved. Default: None. @@ -1419,7 +1419,7 @@ def amp_init( ... init_loss_scaling=128.0, ... use_dynamic_loss_scaling=True, ... use_pure_fp16=True) - ... # If you don't use the default_startup_program(), you sholud pass + ... # If you don't use the default_startup_program(), you should pass ... # your defined `startup_program` into `minimize`. ... optimizer.minimize(loss) ... exe.run(paddle.static.default_startup_program()) diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index dcb5e55f0c25a..146d8a627e5c5 100755 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -115,7 +115,7 @@ def _parse_args(): "--backend", type=str, default=os.environ.get('PADDLE_DISTRI_BACKEND', 'auto'), - help="Specifize the backend, can be gloo|nccl|bkcl|auto|heter. " + help="Specify the backend, can be gloo|nccl|bkcl|auto|heter. " "Default value is auto which prefers nccl or bkcl.", ) base_group.add_argument( diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 0b87df4a9c3af..c0a01d43fd688 100755 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -339,12 +339,12 @@ def terminate_local_procs(procs): p.log_fn.close() logger.debug(f"terminate process id:{p.proc.pid}") - # wait all process terminiated + # wait all process terminated time.sleep(3) for step in range(0, 50): alive = False for p in procs: - if p.proc.poll() is None: # not termniate + if p.proc.poll() is None: # not terminate os.kill(p.proc.pid, signal.SIGKILL) alive = True @@ -414,7 +414,7 @@ def __free_port(): step += 1 if step > 400: print( - "can't find avilable port and use the specified static port now!" + "can't find available port and use the specified static port now!" ) return None @@ -705,7 +705,7 @@ def get_gpus(gpus): for x in gpus.split(',') ] logger.info( - f"Change selected_gpus into reletive values. --ips:{gpus} " + f"Change selected_gpus into relative values. --ips:{gpus} " f"will change into relative_ips:{res_gpus} according to your " f"CUDA_VISIBLE_DEVICES:{cuda_visible_devices_list}" ) @@ -736,7 +736,7 @@ def get_xpus(xpus): for x in xpus.split(',') ] logger.info( - f"Change selected_xpus into reletive values. 
--ips:{xpus} " + f"Change selected_xpus into relative values. --ips:{xpus} " f"will change into relative_ips:{res_xpus} according to your " f"XPU_VISIBLE_DEVICES:{xpu_visible_devices_list}" ) @@ -859,9 +859,9 @@ def get_custom_endpoints(origin_endpoints, offset=0): # assert paddle_pserver_endpoints != None # # # hard code for paddlecloud custom-framework -# avilable_ports = os.getenv("TRAINER_PORTS", "").split(",") +# available_ports = os.getenv("TRAINER_PORTS", "").split(",") # assert len( -# avilable_ports +# available_ports # ) >= 2, "set paddle_ports_num >= 2 in config.ini for paddlecloud job submit" # # # hard code for paddlecloud custom-framework diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py index 4924d523ded05..36833fd7b5a97 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py @@ -73,7 +73,7 @@ def _unscale(self, optimizer): if not self._use_dp_mode: self._found_inf = paddle.cast(self._found_inf, dtype="int32") # TODO(shenliang03) Since the minimize call in the optimizer is - # after the gradscaler, check_finite needs to synchronize global + # after the grad scaler, check_finite needs to synchronize global # information. In the future, we should use check_group paddle.distributed.all_reduce( self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 05f2a4f2a28d6..852e7ced16e4a 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -103,7 +103,7 @@ def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1): - 1: sync_calc - 2: reduce_sum_sharding (allreduce --> reduce) - 3: sync_comm - - 4: allreuce_sum_dp (dp_grads) + - 4: allreduce_sum_dp (dp_grads) - 5: sync_comm (dp_grads) - 6: op that use Var (dp_grads & sum) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py index 2ff259be18b79..1c10efb340618 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py @@ -32,7 +32,7 @@ def prune_weight_decay(self, block, shard): continue if OP_ROLE_VAR_KEY not in op.attr_names: raise ValueError( - "The Weight Dacay op should hold op_role_var attribute" + "The Weight Decay op should hold op_role_var attribute" f"but the {op.type} op does not hold op_role_var" ) op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 5d2f561ca974d..298e84ace66f1 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -217,7 +217,7 @@ def _get_hybrid_dp_mode(self): # pipeline: communication across nodes, and therefore should insert in update segment, # conduct just once per global step. 
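        # In short: when pipeline parallelism is on (pp_degree > 1), the
        # data-parallel allreduce must live in the update segment and run
        # once per global step, which the branch below selects as
        # "pp_hybrid_dp".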
dp_mode = None - # dp here is the pure dp as the outest parallelism + # dp here is the pure dp as the outermost parallelism if self.hybrid_dp: if self.pp_degree > 1: dp_mode = "pp_hybrid_dp" @@ -598,8 +598,8 @@ def _adapt_amp_clip_without_sharding(self): rings = [self.mp_ring_id, self.pp_ring_id] FP16Utils.sync_amp_check_nan_inf(main_block, rings) - gradientclip_helper = GradientClipHelper(None) - gradientclip_helper.sync_global_norm( + gradient_clip_helper = GradientClipHelper(None) + gradient_clip_helper.sync_global_norm( main_block, [self.mp_ring_id, self.pp_ring_id], self.mp_rank ) @@ -996,8 +996,8 @@ def _prune_main_program(self, block, shard, rings): 4. prune optimizer op + param + gradient """ - weightdecay_helper = WeightDecayHelper() - weightdecay_helper.prune_weight_decay(block, shard) + weight_decay_helper = WeightDecayHelper() + weight_decay_helper.prune_weight_decay(block, shard) # FIXME(wangxi): mp should prune duplicated param_grads # NOTE (JZ-LIANG) the sync of FoundInfinite should among one entire Model Parallelism @@ -1006,8 +1006,8 @@ def _prune_main_program(self, block, shard, rings): FP16Utils.prune_fp16(block, shard, self._reduced_grads_to_param, rings) # clipbyglobalnorm should only use the Model parallelism group (mp-sharding-pp) - gradientclip_helper = GradientClipHelper(None) - gradientclip_helper.prune_gradient_clip(block, shard, rings) + gradient_clip_helper = GradientClipHelper(None) + gradient_clip_helper.prune_gradient_clip(block, shard, rings) # build prog deps reduced_grads = [] @@ -1645,7 +1645,7 @@ def _build_groups(self): # global group # use for gen_nccl_comm_sync, amp check nan inf, clip by global norm - # NOTE (JZ-LIANG) when use global ring for calc global norm and dp_degree > 1, the allreduce result should be devided by dp_degree + # NOTE (JZ-LIANG) when use global ring for calc global norm and dp_degree > 1, the allreduce result should be divided by dp_degree self.global_ring_id = 3 logger.info(f"global word size: {self.global_word_size}") @@ -1727,7 +1727,7 @@ def recreate_not_persist_param_as_var(program): def _initialization_broadcast(self): """ - this funtion is to ensure the initialization between dp group to be + this function is to ensure the initialization between dp group to be identical when hybrid-dp is used, and the initialization of not distributed param between mp group to be identical. """ diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 21e5dbfbffefc..384d89b4d9c12 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -170,7 +170,7 @@ def __init__(self, layers, hcg, strategy): 'accumulate_steps' ] # If sent tensor are not the same from different hosts, - # they shouldn't been sent partially and then concated as a whole tensor. + # they shouldn't been sent partially and then concatenated as a whole tensor. 
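        # Roughly, partial send/recv transmits per-rank slices of a tensor
        # and the receiver rebuilds it as paddle.concat(slices, axis=0), so
        # it is only safe when every sender holds an identical tensor
        # (an illustrative gloss of the constraint described above).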
self._enable_partial_send_recv = self._strategy.pipeline_configs[ 'enable_partial_send_recv' ] @@ -640,7 +640,7 @@ def _prepare_training(self, data, optimizer, lr_scheduler): def _wrap_data(self, data): """ - for backward compatibilty, wrap data to Fake FakeMicroDataset if it is of type list or tuple + for backward compatibility, wrap data to Fake FakeMicroDataset if it is of type list or tuple """ if (not isinstance(data, tuple)) and (not isinstance(data, list)): return data diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index 2a691c2c4d4fc..046143c79842f 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -342,6 +342,6 @@ def cvt_to_device(x, dev_id, blocking=True): place = paddle.XPUPlace(dev_id) else: raise OSError( - "Only supported compiled paddle with gpu/rocm and xpu , but current verison is compiled with cpu." + "Only supported compiled paddle with gpu/rocm and xpu, but current version is compiled with cpu." ) return x._copy_to(place, blocking) diff --git a/python/paddle/distributed/fleet/recompute/recompute.py b/python/paddle/distributed/fleet/recompute/recompute.py index 8cfa7fbec353d..b59f304d69a42 100644 --- a/python/paddle/distributed/fleet/recompute/recompute.py +++ b/python/paddle/distributed/fleet/recompute/recompute.py @@ -93,7 +93,7 @@ def check_recompute_necessary(inputs): @contextlib.contextmanager -def swith_rng_state_tracker(rng_state, tracker): +def switch_rng_state_tracker(rng_state, tracker): orig_rng_state = paddle.get_rng_state() orig_rng_tracker = get_rng_state_tracker().get_states_tracker() paddle.set_rng_state(rng_state) @@ -155,8 +155,8 @@ def forward(ctx, run_function, preserve_rng_state, *args, **kwargs): ctx.inputs.append(arg) ctx.save_for_backward(*tensor_inputs) - # NOTE recompute with restore RNG only support one senario where one process for one cuda gpu. - # one process with multiple gpu and mix-gpu-cpu senarios are not support + # NOTE recompute with restore RNG only support one scenario where one process for one cuda gpu. 
+ # one process with multiple gpu and mix-gpu-cpu scenarios are not support if ctx.preserve_rng_state: ctx.fw_rng_state = paddle.get_rng_state() ctx.fwd_rng_state_tracker = ( @@ -208,7 +208,7 @@ def backward(ctx, *args): # NOTE support AMP # need restore auto_cast state as well as w/b list if ctx.preserve_rng_state: - with swith_rng_state_tracker( + with switch_rng_state_tracker( ctx.fw_rng_state, ctx.fwd_rng_state_tracker ): with paddle.amp.auto_cast( @@ -273,7 +273,7 @@ def backward(ctx, *args): # all tensors in the tuple doesn't need grad, only return a None for the whole tuple grads.append(None) else: - # all tensors in the tuple nees grad, should return a tuple of grads + # all tensors in the tuple need grad, should return a tuple of grads grads.append(tuple(i._grad_ivar() for i in inp)) if in_dynamic_mode(): @@ -303,7 +303,7 @@ def _recompute_without_reentrant( fw_cuda_rng_state = paddle.get_rng_state(cur_device) else: raise RuntimeError( - "Recompute with RNG perserve is not support current device: {}.".format( + "Recompute with RNG preserve is not support current device: {}.".format( cur_device ) ) @@ -358,10 +358,10 @@ def inner_pack(inner_x): return def inner_unpack(inner_x): - raise Exception("An unexcepted backward called on a tensor!") + raise Exception("An unexpected backward called on a tensor!") if preserve_rng_state: - with swith_rng_state_tracker( + with switch_rng_state_tracker( fw_cuda_rng_state, fwd_cuda_rng_state_tracker ): with paddle.set_grad_enabled(True): diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py index 789f0cac73d94..29e7c73459854 100644 --- a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py +++ b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py @@ -22,7 +22,7 @@ from .recompute import ( check_recompute_necessary, detach_variable, - swith_rng_state_tracker, + switch_rng_state_tracker, ) __all__ = [] @@ -198,7 +198,7 @@ def backward(ctx, *args): tracer._has_grad = True # need restore auto_cast state as well as w/b list - with swith_rng_state_tracker( + with switch_rng_state_tracker( ctx.fwd_rng_state, ctx.fwd_rng_state_tracker ): if ctx.is_fw_autocast: diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 865571cfeca6f..f69470397e1d9 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -43,7 +43,7 @@ def _set_basic_info(self, context): self.origin_main_program = context["origin_main_program"] self.origin_startup_program = context["origin_startup_program"] self.async_strategy = self._get_distributed_strategy() - self.compiled_strategy = self.build_compiled_startegy() + self.compiled_strategy = self.build_compiled_strategy() def _get_distributed_strategy(self): strategy = None @@ -69,7 +69,7 @@ def _get_distributed_strategy(self): return strategy - def build_compiled_startegy(self): + def build_compiled_strategy(self): from paddle.incubate.distributed.fleet.parameter_server.ir.public import ( CompileTimeStrategy, ) @@ -203,7 +203,7 @@ def get_sparse_attrs(): if len(dist_varnames) != 0: raise ValueError( - "GeoStrategy can not support large scale embeding now, please use paddle.static.nn.embedding" + "GeoStrategy can not support large scale embedding now, please use paddle.static.nn.embedding" ) init_attrs = [] @@ -354,11 +354,11 @@ def 
_init_server(self, *args, **kwargs): sparse_related_optimize_varnames = list( set(sparse_related_optimize_varnames) ) - distribtued_varnames = self.compiled_strategy.get_sparse_varname_on_ps( + distributed_varnames = self.compiled_strategy.get_sparse_varname_on_ps( True ) distributed_related_optimize_varnames = [] - for var_name in distribtued_varnames: + for var_name in distributed_varnames: distributed_related_optimize_varnames += ( self.compiled_strategy.get_optimize_varname_on_ps(var_name) ) @@ -370,7 +370,7 @@ def _init_server(self, *args, **kwargs): filter( ParameterServerRuntime.__exclude_vars( sparse_varnames - + distribtued_varnames + + distributed_varnames + sparse_related_optimize_varnames + distributed_related_optimize_varnames ), @@ -402,7 +402,7 @@ def _init_server(self, *args, **kwargs): # load large scale self._load_distributed_params( dirname=model_dirname, - varnames=distribtued_varnames + varnames=distributed_varnames + distributed_related_optimize_varnames, ) diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index a14c337a4fad1..94d403765b1a0 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -684,7 +684,7 @@ def _set_basic_info(self, context): self.origin_main_program = context["origin_main_program"] self.origin_startup_program = context["origin_startup_program"] self.async_strategy = self._get_distributed_strategy() - self.compiled_strategy = self.build_compiled_startegy() + self.compiled_strategy = self.build_compiled_strategy() def _get_distributed_strategy(self): strategy = None @@ -712,7 +712,7 @@ def _get_distributed_strategy(self): strategy.use_ps_gpu = True return strategy - def build_compiled_startegy(self): + def build_compiled_strategy(self): from paddle.incubate.distributed.fleet.parameter_server.ir.public import ( CompileTimeStrategy, ) @@ -1125,8 +1125,8 @@ def _get_tables(): if len(tensor_table_dict) > 0: tables = _add_tensor_table(tables) else: - empty_porgram = Program() - self._server_sub_program.append(empty_porgram.desc) + empty_program = Program() + self._server_sub_program.append(empty_program.desc) barrier_table = _build_barrier_table(len(tables)) tables.append(barrier_table) diff --git a/python/paddle/distributed/fleet/scaler.py b/python/paddle/distributed/fleet/scaler.py index 40e182e7f2e40..977b336eb31bb 100755 --- a/python/paddle/distributed/fleet/scaler.py +++ b/python/paddle/distributed/fleet/scaler.py @@ -139,7 +139,7 @@ def unscale_method(self, optimizer): self._found_inf = self._found_inf.cast("int32") # TODO(shenliang03) Since dp allreduce in the optimizer is - # after the gradscaler, check_finite needs to synchronize global + # after the grad scaler, check_finite needs to synchronize global # information. In the future, we should use check_group to speed. paddle.distributed.all_reduce( self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index aa7ec2e544efe..5c2ec7fece24d 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -994,7 +994,7 @@ def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True): fs_src_path(str): Name of the file or directory, that's needed to be moved. fs_dst_path(str): Name of the file or directory to which to move to. 
overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False. - test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption. + test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Exception. Examples: @@ -1177,7 +1177,7 @@ def _split_files(self, files, trainer_id, trainers): trainer_id(int): trainer mpi rank id trainers(int): all trainers num Returns: - fileist(list): file list of current trainer + filelist(list): file list of current trainer """ remainder = len(files) % trainers blocksize = len(files) // trainers @@ -1200,7 +1200,7 @@ def list_files_info(self, path_list): Args: path_list(list): file list Returns: - fileist(list): file list with file path and size + filelist(list): file list with file path and size """ if len(path_list) <= 0: return [] @@ -1650,7 +1650,7 @@ def _split_files(self, files, trainer_id, trainers): trainer_id(int): trainer mpi rank id trainers(int): all trainers num Returns: - fileist(list): file list of current trainer + filelist(list): file list of current trainer """ remainder = len(files) % trainers blocksize = len(files) // trainers diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py index d8142b7081f2b..38e6eeca008d6 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py @@ -63,7 +63,7 @@ class HybridParallelInferenceHelper: ... with paddle.base.device_guard(f'{device}:all'): ... # read data from global lod_tensor_array ... element_in_arr = paddle.tensor.array_read(array=arr, i=step_idx) - ... # write placehold data to global lod_tensor_array, + ... # write placeholder data to global lod_tensor_array, ... # it need for send_v2 of lod_tensor_array ... paddle.increment(x=step_idx, value=1.0) ... paddle.tensor.array_write(element_in_arr, i=step_idx, array=arr) @@ -455,7 +455,7 @@ def _find_prev_op(self, index, var_name): def _add_op_device_attr(self, block): """ - Add op_device attrribute for ops in block that have + Add op_device attribute for ops in block that have not that attribute set. 
Args: diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index fc0f897b1454c..27aa4c9f54074 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -263,7 +263,7 @@ def fused_allreduce_gradients(parameter_list, hcg): def broadcast_sharding_parameters(model, hcg): - # TODO TO save memory, use un-fused broadcast to avoid potentional OOM + # TODO TO save memory, use un-fused broadcast to avoid potential OOM logger.debug("sharding start init parameters sync") sharding_parallel_group = hcg.get_sharding_parallel_group() src_rank = hcg.get_sharding_parallel_group_src_rank() @@ -273,7 +273,7 @@ def broadcast_sharding_parameters(model, hcg): def broadcast_sep_parameters(model, hcg): - # TODO TO save memory, use un-fused broadcast to avoid potentional OOM + # TODO TO save memory, use un-fused broadcast to avoid potential OOM logger.debug("sep start init parameters sync") sep_group = hcg.get_sep_parallel_group() src_rank = hcg.get_sep_parallel_group_src_rank() diff --git a/python/paddle/distributed/fleet/utils/mix_precision_utils.py b/python/paddle/distributed/fleet/utils/mix_precision_utils.py index 7b4ff7a0410e5..bbc632029a59b 100644 --- a/python/paddle/distributed/fleet/utils/mix_precision_utils.py +++ b/python/paddle/distributed/fleet/utils/mix_precision_utils.py @@ -47,7 +47,7 @@ def __init__(self, layers, dtype="float16"): param._register_grad_hook(self._update_main_grad_hook(param)) def _update_main_grad_hook(self, param): - """Create the update_main_grad hook for backprop.""" + """Create the update_main_grad hook for back-prop.""" # Hook used for back-prop and grad-merge. @paddle.autograd.no_grad() diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py index 959f9eb49f40f..dff62c1a22db1 100644 --- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py +++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py @@ -783,7 +783,7 @@ def fused_parameters( :param fuse_param: fuse param or not :param scale_after_comm: if enable comm overlap, specify the location of grad scale :param group_params: the format of the input parameters is param group - :param apply_decay_param_fun: the funtion to filter decay param + :param apply_decay_param_fun: the function to filter decay param :return: param storage if fused, comm buffers if comm overlap, param groups if use group params """ if act is None: diff --git a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py index 9ca0a7fdfc89f..88cb6ff27b1aa 100644 --- a/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py +++ b/python/paddle/distributed/fleet/utils/tensor_parallel_utils.py @@ -44,7 +44,7 @@ def tensor_parallel_sync_filter_fn( param, pos_emb=True, layer_norm=True, bias=True ): """ - Layer fliter function for tensor parallelism transformer. + Layer filter function for tensor parallelism transformer. 
In tensor parallelism of transformer like model, there is 4 kind of param that are supposed to be the same in all tensor parallel peers: @@ -111,7 +111,7 @@ def copy_parameters(block_, params): ) assert ( param.is_distributed is False - ), f"Try to sync Distribted Parameter: {param}" + ), f"Try to sync Distributed Parameter: {param}" new_p.is_distributed = False block_.vars[new_p.name] = new_p @@ -291,7 +291,7 @@ def add_extra_synchronization( sync_mode(string): select from "broadcast": parameter is sync by broadcasted from 'src_rank' to all other ranks. - "average": paramter is sync by average amonge all ranks + "average": parameter is sync by average among all ranks src_rank(int): the src used in broadcast sync_mode. @@ -324,7 +324,7 @@ def add_extra_synchronization( if params_filter_fn(param): params_to_sync.append(param) logger.info( - "The following param are goning to be synchronization everytime the optimizer update phase of the program is runned: " + "The following param are going to be synchronization everytime the optimizer update phase of the program is runned: " ) logger.info([p.name for p in params_to_sync]) diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index 0c326c91f5cc6..3bee69f5d7deb 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -91,7 +91,7 @@ def get_logger(self, level=logging.INFO): logger.addHandler(ch) return logger - def continous_log(self) -> bool: + def continuous_log(self) -> bool: if self.args.log_level.upper() in ['DEBUG', 'ERROR']: return True else: @@ -102,6 +102,6 @@ def set_env_in_args(self): attr, attr_type = v if k in self.envs: print( - f"LAUNCH WARNNING args {attr} will be overridden by env: {k} value: {self.envs[k]}" + f"LAUNCH WARNING args {attr} will be overridden by env: {k} value: {self.envs[k]}" ) setattr(self.args, attr, attr_type(self.envs[k])) diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 4553ea1bb776b..e6eae1a94e3f6 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -95,7 +95,7 @@ def watch(self) -> bool: while not self.ctx.status.is_done(): status = self.pod.watch(timeout=2) - # if self.ctx.continous_log(): + # if self.ctx.continuous_log(): # default to print log self.pod.logs() diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py index 5a2009b2fd0f2..4ebda8bc64c25 100644 --- a/python/paddle/distributed/models/moe/utils.py +++ b/python/paddle/distributed/models/moe/utils.py @@ -59,7 +59,7 @@ def _number_count(numbers, upper_range): def _assign_pos(x, cum_count): """ Assign pos decides which tokens should be fetched belong to - specially expert orderingly. + specially expert orderly. Args: x (Tensor): Tensor. 
Every element in the list must be a Tensor whose data type diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py index f2b2c140cd6cf..c820a3d882274 100644 --- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py +++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py @@ -63,7 +63,7 @@ class DataParallelOptimizationPass(PassBase): def __init__(self): super().__init__() - # NOTE not use depence on loss and param_grads + # NOTE not use dependence on loss and param_grads self.set_attr("dist_context", None) self.set_attr("global_rank", -1) self.set_attr("use_sharding", False) diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py index 9fe72c8aabd75..822bdbd6801b2 100644 --- a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -293,7 +293,7 @@ def _check_self(self): def _check_conflict(self, other_pass): return True - def get_ops_per_device(self, ops, all_ops_process_meshs, sr=0): + def get_ops_per_device(self, ops, all_ops_process_meshes, sr=0): """ Get ops and op_names of each process mesh excluding ops within the first "sr" chunks """ @@ -302,7 +302,7 @@ def reset_recompute_op(op): if is_recompute_op(op) or is_recompute_exclude_op(op): op._set_attr("op_namescope", "") - all_process_meshes_count = len(all_ops_process_meshs) + all_process_meshes_count = len(all_ops_process_meshes) ops_of_stages = [[] for _ in range(all_process_meshes_count)] op_names_of_stages = [[] for _ in range(all_process_meshes_count)] pushed_ops_count = 0 @@ -321,7 +321,7 @@ def reset_recompute_op(op): if chunk_id // all_process_meshes_count < sr: continue - for id, process_mesh in enumerate(all_ops_process_meshs): + for id, process_mesh in enumerate(all_ops_process_meshes): if op.dist_attr.process_mesh == process_mesh: pushed_ops_count += 1 ops_of_stages[id].append(op) @@ -346,15 +346,15 @@ def _apply_single_impl(self, main_program, startup_program, context): op_path = _find_op_path(main_program, loss, no_grad_set) # 1. mark exclude ops for refined-recompute according to ops-patterns(mainly linear and flash_attn) - # 1.1 get all process_meshs in op_path - all_ops_process_meshs = [] + # 1.1 get all process_meshes in op_path + all_ops_process_meshes = [] for op in op_path: - if op.dist_attr.process_mesh not in all_ops_process_meshs: - all_ops_process_meshs.append(op.dist_attr.process_mesh) + if op.dist_attr.process_mesh not in all_ops_process_meshes: + all_ops_process_meshes.append(op.dist_attr.process_mesh) # 1.2 get ops_devices and op_names_devices ops_devices, op_names_devices = self.get_ops_per_device( - op_path, all_ops_process_meshs, self._sr + op_path, all_ops_process_meshes, self._sr ) all_ops_len = len(op_path) all_exclude_ops_ids = [[] for _ in op_names_devices] diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py index adddb37d26b43..617425158dd89 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -1187,7 +1187,7 @@ def _overlap_grad_comm( 2.2 insert after communication dependencies only when need 3. there is not need to add explicit dependencies for non-coalesce gradient communication - P.S. 
this overlap pass is ONLY adapted for standalone executor (graph based) and stream awared allocator. + P.S. this overlap pass is ONLY adapted for standalone executor (graph based) and stream award allocator. """ if not self.enable_overlap: @@ -1309,7 +1309,7 @@ def _overlap_grad_comm( # hierarchical grad comm if self.enable_hierarchical_comm: # NOTE so far we only support Isomorphic cluster with 8 ranks per node - # TODO unifiy here create communicators + # TODO unify here create communicators # create communicators nranks_per_node = 8 assert self.sharding_world_size % nranks_per_node == 0 diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 113f5275d8e7b..eb3e0368c49a8 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -890,8 +890,8 @@ def _create_heter_program( # joint_var.0_1 -> slice -> reshape -> origin_var # origin_var -> origin_program # reshape -> concat -> joint_var.1_2 - # d) copy send op from origin program for var@grad which loacted in current heter block - # e) re-check every op in current blcok if its device is not current heter devie + # d) copy send op from origin program for var@grad which located in current heter block + # e) re-check every op in current block if its device is not current heter device # 2. Create send op for step counter in last heter-block # 3. Create Listen&Serv OP and Send&Recv OP for distributed training # 4. update CompileTimeStrategy for heter_program diff --git a/python/paddle/distributed/ps/utils/collective_transpiler.py b/python/paddle/distributed/ps/utils/collective_transpiler.py index 7f398842fd701..8d0ff9a53e551 100644 --- a/python/paddle/distributed/ps/utils/collective_transpiler.py +++ b/python/paddle/distributed/ps/utils/collective_transpiler.py @@ -357,7 +357,7 @@ def _insert_allreduce_ops(self): ) offset += 1 - # As we search ops reversedly, we should insert c_allreduce_sum + # As we search ops reversely, we should insert c_allreduce_sum # op in the same way to keep the ring_id alternate ring_id = (ring_id + 1) % self.nrings block._insert_op( @@ -631,7 +631,7 @@ def _insert_allgather_ops(self): ) offset += 1 - # As we search ops reversedly, we should insert c_allgather + # As we search ops reversely, we should insert c_allgather # op in the same way to keep the ring_id alternate ring_id = (ring_id + 1) % self.nrings block._insert_op( From 34871d28c9fe2727c5cb9d5a26cf2cb690b2f920 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 14:49:48 +0800 Subject: [PATCH 26/82] Fix typo (Arraow -> Arrow) (#61921) --- paddle/pir/src/core/parser/lexer.cc | 6 +++--- paddle/pir/src/core/parser/lexer.h | 2 +- paddle/pir/src/core/parser/token.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/pir/src/core/parser/lexer.cc b/paddle/pir/src/core/parser/lexer.cc index 54c39e8006ba0..7914063d148c0 100644 --- a/paddle/pir/src/core/parser/lexer.cc +++ b/paddle/pir/src/core/parser/lexer.cc @@ -18,7 +18,7 @@ Token Lexer::ConsumeToken() { SkipWhitespace(); if (auto token = LexIdentifier()) { return *token; - } else if (auto token = LexNumberOrArraow()) { + } else if (auto token = LexNumberOrArrow()) { return *token; } else if (auto token = LexEndTagOrNullVal()) { return *token; @@ -84,7 +84,7 @@ std::unique_ptr Lexer::LexIdentifier() { return token; } -std::unique_ptr Lexer::LexNumberOrArraow() { +std::unique_ptr Lexer::LexNumberOrArrow() { if (!isdigit(is.peek()) && is.peek() != '-') 
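  // Neither a digit nor '-': cannot start a number or an arrow. A leading
  // '-' is let through because it may begin either a negative number or the
  // "->" token, which the check below disambiguates.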
{ return nullptr; } @@ -94,7 +94,7 @@ std::unique_ptr<Token> Lexer::LexNumberOrArraow() { if (token_digit[0] == '-' && is.peek() == '>') { GetChar(); - std::unique_ptr<Token> arrow_token(new Token{"->", ARRAOW}); + std::unique_ptr<Token> arrow_token(new Token{"->", ARROW}); return arrow_token; } while (isdigit(is.peek())) { diff --git a/paddle/pir/src/core/parser/lexer.h b/paddle/pir/src/core/parser/lexer.h index 30365172b686f..6606b2291d9a6 100644 --- a/paddle/pir/src/core/parser/lexer.h +++ b/paddle/pir/src/core/parser/lexer.h @@ -30,7 +30,7 @@ class Lexer { Token ConsumeToken(); Token PeekToken(); std::unique_ptr<Token> LexIdentifier(); - std::unique_ptr<Token> LexNumberOrArraow(); + std::unique_ptr<Token> LexNumberOrArrow(); std::unique_ptr<Token> LexEndTagOrNullVal(); std::unique_ptr<Token> LexValueId(); std::unique_ptr<Token> LexEOF(); diff --git a/paddle/pir/src/core/parser/token.h b/paddle/pir/src/core/parser/token.h index 6fe9e7bd79a3d..fd50ff1a7c580 100644 --- a/paddle/pir/src/core/parser/token.h +++ b/paddle/pir/src/core/parser/token.h @@ -23,7 +23,7 @@ enum Token_type { ENDTAG = 3, VALUEID = 4, STRING = 5, - ARRAOW = 6, + ARROW = 6, NULL_ = 7, }; From fa211197faeebeb11b11376ef786aeced0262c65 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 14:51:00 +0800 Subject: [PATCH 27/82] Update inference_lib.cmake (#61881) --- cmake/inference_lib.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 9f1268ce36c41..f4a8286985094 100755 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -66,7 +66,7 @@ function(copy TARGET) endforeach() endfunction() -function(copy_part_of_thrid_party TARGET DST) +function(copy_part_of_third_party TARGET DST) if(${CBLAS_PROVIDER} STREQUAL MKLML) set(dst_dir "${DST}/third_party/install/mklml") if(WIN32) @@ -233,7 +233,7 @@ copy( SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt DSTS ${PADDLE_INFERENCE_INSTALL_DIR}) -copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) +copy_part_of_third_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") @@ -365,7 +365,7 @@ add_custom_command( set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING "A path setting CAPI paddle inference shared") -copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR}) +copy_part_of_third_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) From 3431e994064fb75c606148308892459fbeba4d1d Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 14:56:30 +0800 Subject: [PATCH 28/82] Update linalg.py (#61746) --- python/paddle/tensor/linalg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 0e3c4922be6ba..5ff36cdb754d5 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1514,7 +1514,7 @@ def svd_norm(input, porder, axis=[-1]): type='elementwise_div', inputs={'X': max_out, 'Y': min_out}, outputs={'Out': out}, - attrs={'aixs': axis}, + attrs={'axis': -1}, ) return out if porder == -2: @@ -1522,7 +1522,7 @@ def svd_norm(input, porder, axis=[-1]): type='elementwise_div', inputs={'X': min_out, 'Y': max_out}, outputs={'Out': out}, - attrs={'aixs': axis}, + attrs={'axis': -1}, ) return out From 7568f4824d63567ae8b5cfc0736c4fd507790cd0 Mon Sep 17 00:00:00 2001 From: yinwei Date: Fri, 23 Feb 2024 15:02:02 +0800 Subject:
[PATCH 29/82] memory_efficient_attention support q,k,v stop_gradient (#60594) --- paddle/phi/infermeta/backward.cc | 48 +++---- .../memory_efficient_attention_grad_kernel.cu | 35 +++++- .../test_memory_efficient_attention.py | 118 ++++++++++++++++++ 3 files changed, 176 insertions(+), 25 deletions(-) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 4f525ef138735..845a8e6835729 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -746,27 +746,33 @@ void MemoryEfficientAttentionGradInferMeta(const MetaTensor& query, const int64_t value_num_head = value.dims()[2]; const int64_t value_head_size = value.dims()[3]; - std::vector<int64_t> query_grad_dims( {query_batch_size, query_seq_length, query_num_head, query_head_size}); - std::vector<int64_t> key_grad_dims( {key_batch_size, key_seq_length, key_num_head, key_head_size}); - std::vector<int64_t> value_grad_dims( {value_batch_size, value_seq_length, value_num_head, value_head_size}); - - query_grad->set_dims(common::make_ddim(query_grad_dims)); - query_grad->share_lod(query); - query_grad->set_dtype(query.dtype()); - query_grad->set_layout(query.layout()); - - key_grad->set_dims(common::make_ddim(key_grad_dims)); - key_grad->share_lod(key); - key_grad->set_dtype(key.dtype()); - key_grad->set_layout(key.layout()); - - value_grad->set_dims(common::make_ddim(value_grad_dims)); - value_grad->share_lod(value); - value_grad->set_dtype(value.dtype()); - value_grad->set_layout(value.layout()); + if (query_grad) { + std::vector<int64_t> query_grad_dims; + query_grad_dims = { + query_batch_size, query_seq_length, query_num_head, query_head_size}; + query_grad->set_dims(common::make_ddim(query_grad_dims)); + query_grad->share_lod(query); + query_grad->set_dtype(query.dtype()); + query_grad->set_layout(query.layout()); + } + if (key_grad) { + std::vector<int64_t> key_grad_dims; + key_grad_dims = { + key_batch_size, key_seq_length, key_num_head, key_head_size}; + key_grad->set_dims(common::make_ddim(key_grad_dims)); + key_grad->share_lod(key); + key_grad->set_dtype(key.dtype()); + key_grad->set_layout(key.layout()); + } + if (value_grad) { + std::vector<int64_t> value_grad_dims; + value_grad_dims = { + value_batch_size, value_seq_length, value_num_head, value_head_size}; + value_grad->set_dims(common::make_ddim(value_grad_dims)); + value_grad->share_lod(value); + value_grad->set_dtype(value.dtype()); + value_grad->set_layout(value.layout()); + } if (bias && bias_grad) { const int64_t bias_batch_size = bias.dims()[0]; diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu index c72a1b69e7ef8..62625936e192a 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu @@ -58,8 +58,14 @@ void MemoryEfficientAttentionGradKernel( DenseTensor* bias_grad) { bool kernel_launched = false; + DenseTensor dq_tmp; + DenseTensor dk_tmp; + DenseTensor dv_tmp; + bool has_query_grad = (query_grad != nullptr); + bool has_key_grad = (key_grad != nullptr); + bool has_value_grad = (value_grad != nullptr); + auto launchKernel = [&](auto k_, auto kernel_fn) { - // ndim PADDLE_ENFORCE_EQ( query.dims().size(), output_grad.dims().size(), @@ -289,7 +295,6 @@ void MemoryEfficientAttentionGradKernel( int compute_capacity = ctx.GetComputeCapability(); const auto max_shmem = getMaximumSharedMemoryPerBlockKb(compute_capacity) * 1024; - using
KernelType = decltype(k_); using scalar_t = typename KernelType::scalar_t; if (kernel_launched) { @@ -404,9 +409,28 @@ void MemoryEfficientAttentionGradKernel( VLOG(3) << "logsumexp_ptr" << p.logsumexp_ptr; p.output_ptr = phi::SafeGetTensorPtr<scalar_t>(output); p.grad_output_ptr = phi::SafeGetTensorPtr<scalar_t>(output_grad); + + if (!has_query_grad) { + dq_tmp.clear(); + dq_tmp = EmptyLike<T>(ctx, query); + query_grad = &dq_tmp; + } p.grad_query_ptr = phi::SafeAllocTensor<scalar_t, Context>(ctx, query_grad); + + if (!has_key_grad) { + dk_tmp.clear(); + dk_tmp = EmptyLike<T>(ctx, key); + key_grad = &dk_tmp; + } p.grad_key_ptr = phi::SafeAllocTensor<scalar_t, Context>(ctx, key_grad); + + if (!has_value_grad) { + dv_tmp.clear(); + dv_tmp = EmptyLike<T>(ctx, value); + value_grad = &dv_tmp; + } p.grad_value_ptr = phi::SafeAllocTensor<scalar_t, Context>(ctx, value_grad); + p.delta_ptr = phi::SafeGetTensorPtr<float>(delta); PD_MEA_CHECK_OVERFLOW(p.head_dim, q_dims[3]); PD_MEA_CHECK_OVERFLOW(p.head_dim_value, v_dims[3]); @@ -444,11 +468,14 @@ void MemoryEfficientAttentionGradKernel( PD_MEA_CHECK_OVERFLOW(p.o_strideB, DimStride(output.dims(), 0)); PD_MEA_CHECK_OVERFLOW(p.gQ_strideH, DimStride(query_grad->dims(), 2)); - PD_MEA_CHECK_OVERFLOW(p.gK_strideH, DimStride(key_grad->dims(), 2)); - PD_MEA_CHECK_OVERFLOW(p.gV_strideH, DimStride(value_grad->dims(), 2)); PD_MEA_CHECK_OVERFLOW(p.gQ_strideB, DimStride(query_grad->dims(), 0)); + + PD_MEA_CHECK_OVERFLOW(p.gK_strideH, DimStride(key_grad->dims(), 2)); PD_MEA_CHECK_OVERFLOW(p.gK_strideB, DimStride(key_grad->dims(), 0)); + + PD_MEA_CHECK_OVERFLOW(p.gV_strideH, DimStride(value_grad->dims(), 2)); PD_MEA_CHECK_OVERFLOW(p.gV_strideB, DimStride(value_grad->dims(), 0)); + p.gQKV_strideM_multiplier = 1; PADDLE_ENFORCE_EQ(q_dims[2] * q_dims[3], DimStride(query_grad->dims(), 1), diff --git a/test/legacy_test/test_memory_efficient_attention.py b/test/legacy_test/test_memory_efficient_attention.py index 24e1d5115f44f..6298a3100a930 100644 --- a/test/legacy_test/test_memory_efficient_attention.py +++ b/test/legacy_test/test_memory_efficient_attention.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License.
+import logging import os import random import re @@ -378,5 +379,122 @@ def setUp(self): self.seed = 2023 +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11030, + "core is not compiled with CUDA or cuda version is less than 11.3", +) +class TestMemEffAttentionAPIWithStopGradient(unittest.TestCase): + def setUp(self): + self.name = "MemEffAttnQKV_FFF" + self.place = paddle.CUDAPlace(0) + self.shape = (1, 128, 8, 16) + self.dtype = 'float32' + self.dropout = 0.0 + self.training = True + self.attention_bias = None + self.scale = 1.0 / np.sqrt(self.shape[-1]) + self.seed = 2023 + self.q_grad_stop_gradient = True + self.k_grad_stop_gradient = False + self.v_grad_stop_gradient = False + + def test_all(self): + logging.info( + f"Test All case shape {self.shape} dtype {self.dtype} name {self.name}" + ) + + paddle.disable_static() + + query = np.random.random(self.shape) + q = paddle.to_tensor( + query, + place=self.place, + dtype=self.dtype, + stop_gradient=self.q_grad_stop_gradient, + ) + q_ = paddle.to_tensor( + query, + place=self.place, + dtype=self.dtype, + stop_gradient=self.q_grad_stop_gradient, + ) + key = np.random.random(self.shape) + k = paddle.to_tensor( + key, + place=self.place, + dtype=self.dtype, + stop_gradient=self.k_grad_stop_gradient, + ) + k_ = paddle.to_tensor( + key, + place=self.place, + dtype=self.dtype, + stop_gradient=self.k_grad_stop_gradient, + ) + value = np.random.random(self.shape) + v = paddle.to_tensor( + value, + place=self.place, + dtype=self.dtype, + stop_gradient=self.v_grad_stop_gradient, + ) + v_ = paddle.to_tensor( + value, + place=self.place, + dtype=self.dtype, + stop_gradient=self.v_grad_stop_gradient, + ) + + out_ = attention_naive( + q_, k_, v_, self.attention_bias, self.dropout, self.scale, self.seed + ) + + paddle.seed(self.seed) + out = memory_efficient_attention( + q, + k, + v, + self.attention_bias, + self.dropout, + self.scale, + self.training, + ) + + np.testing.assert_allclose(out.numpy(), out_, rtol=5e-03, atol=1e-03) + + out.backward() + out_.backward() + + if q.stop_gradient is not True: + np.testing.assert_allclose( + q.grad.numpy(), q_.grad.numpy(), rtol=5e-03, atol=1e-03 + ) + + if k.stop_gradient is not True: + np.testing.assert_allclose( + k.grad.numpy(), k_.grad.numpy(), rtol=5e-03, atol=1e-03 + ) + if v.stop_gradient is not True: + np.testing.assert_allclose( + v.grad.numpy(), v_.grad.numpy(), rtol=5e-03, atol=1e-03 + ) + + +class TestQKVFTT(TestMemEffAttentionAPIWithStopGradient): + def setUp(self): + self.name = "MemEffAttnQKV_TTT" + self.place = paddle.CUDAPlace(0) + self.shape = (1, 128, 8, 16) + self.dtype = 'float32' + self.dropout = 0.0 + self.training = True + self.attention_bias = None + self.scale = 1.0 / np.sqrt(self.shape[-1]) + self.seed = 2023 + self.q_grad_stop_gradient = False + self.k_grad_stop_gradient = True + self.v_grad_stop_gradient = True + + if __name__ == '__main__': unittest.main() From 901c76bd03794c2d79279779a06ba60ce77b796d Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 23 Feb 2024 15:25:16 +0800 Subject: [PATCH 30/82] [PIR+CINN]Fix Convert0DTo1D Pass bug in CombineOp (#61977) --- .../group_merge/convert_0d_to_1d_pass.cc | 35 +++++++++++++++++++ .../pir/cinn/sub_graphs/test_sub_graph_23.py | 8 ++--- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_0d_to_1d_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_0d_to_1d_pass.cc index
f60878a9e1d99..325421d92abe6 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_0d_to_1d_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/convert_0d_to_1d_pass.cc @@ -58,6 +58,40 @@ class FullOpPattern : public pir::OpRewritePattern<paddle::dialect::FullOp> { } }; +class CombineOpPattern : public pir::OpRewritePattern<pir::CombineOp> { + public: + using pir::OpRewritePattern<pir::CombineOp>::OpRewritePattern; + + bool Match(pir::CombineOp op) const override { + auto out_type = op.result(0).type().dyn_cast<pir::VectorType>(); + for (auto type : out_type.data()) { + if (HasZeroDim(type)) return true; + } + return false; + } + + void Rewrite(pir::CombineOp op, + pir::PatternRewriter &rewriter) const override { + pir::Builder builder(rewriter.ir_context()); + + const std::vector<pir::Type> inputs_type = [&]() { + std::vector<pir::Type> types; + for (auto value : op->operands_source()) { + types.push_back(value.type()); + } + return types; + }(); + op.result(0).set_type(builder.vec_type(inputs_type)); + } + + private: + bool HasZeroDim(pir::Type type) const { + if (!type) return false; + const auto dense_tensor_type = type.dyn_cast<pir::DenseTensorType>(); + return dense_tensor_type && (dense_tensor_type.dims().size() == 0U); + } +}; + class Convert0DTo1DPass : public pir::PatternRewritePass { public: Convert0DTo1DPass() : pir::PatternRewritePass("convert_0D_to_1D", 1) {} @@ -65,6 +99,7 @@ class Convert0DTo1DPass : public pir::PatternRewritePass { pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override { pir::RewritePatternSet ps(context); ps.Add<FullOpPattern>(context); + ps.Add<CombineOpPattern>(context); return ps; } diff --git a/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py b/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py index 0d140fda01484..5f04f7b0f9bd2 100644 --- a/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py +++ b/test/ir/pir/cinn/sub_graphs/test_sub_graph_23.py @@ -31,8 +31,7 @@ def forward( var_0, # (shape: [11, 24, 56, 56], dtype: paddle.float32, stop_gradient: False) var_1, # (shape: [11, 24, 56, 56], dtype: paddle.float32, stop_gradient: False) ): - var_2 = paddle.tensor.attribute.shape(var_0) - var_3 = var_2[0] + var_3 = var_0.shape[0] var_4 = paddle.tensor.random.rand(shape=[var_3, 1, 1, 1]) var_5 = 0.975 + var_4 var_6 = paddle.tensor.ops.floor(var_5) @@ -65,16 +64,15 @@ def train(self, net, to_static, with_prim=False, with_cinn=False): outs = net(*self.inputs) return outs - # NOTE prim + cinn lead to error def test_ast_prim_cinn(self): st_out = self.train(self.net, to_static=True) cinn_out = self.train( - self.net, to_static=True, with_prim=True, with_cinn=False + self.net, to_static=True, with_prim=True, with_cinn=True ) for st, cinn in zip( paddle.utils.flatten(st_out), paddle.utils.flatten(cinn_out) ): - np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-8) + np.testing.assert_allclose(st.numpy(), cinn.numpy(), atol=1e-6) if __name__ == '__main__': From 4c173f64ab76c212e1c810e11881f51655c00a59 Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Fri, 23 Feb 2024 15:30:17 +0800 Subject: [PATCH 31/82] [Prim][PIR] add llama if graph dy shape case (#61986) * add llama if case * fix code --- test/ir/pir/cinn/symbolic/CMakeLists.txt | 12 ++ test/ir/pir/cinn/symbolic/test_llama_if_dy.py | 112 ++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 test/ir/pir/cinn/symbolic/test_llama_if_dy.py diff --git a/test/ir/pir/cinn/symbolic/CMakeLists.txt b/test/ir/pir/cinn/symbolic/CMakeLists.txt index 08e1d9d33d456..8d9463d870fda 100644 --- a/test/ir/pir/cinn/symbolic/CMakeLists.txt +++
b/test/ir/pir/cinn/symbolic/CMakeLists.txt @@ -10,6 +10,7 @@ if(WITH_GPU) test_cinn_reduce_symbolic_demo.py test_if_st.py test_if_dy.py + test_llama_if_dy.py test_sub_graph_for_backend.py test_sub_graph_for_frontend.py test_check_infer_symbolic.py @@ -57,6 +58,17 @@ if(WITH_GPU) WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) set_tests_properties(test_if_dy PROPERTIES LABELS "RUN_TYPE=CINN") + add_test( + NAME test_llama_if_dy + COMMAND + ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_BINARY_DIR}/python/:$ENV{PYTHONPATH} + FLAGS_prim_all=true FLAGS_cinn_bucket_compile=false + FLAGS_pir_apply_shape_optimization_pass=0 FLAGS_enable_pir_api=1 + ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/test_llama_if_dy.py + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + set_tests_properties(test_llama_if_dy PROPERTIES LABELS "RUN_TYPE=CINN") + add_test( NAME test_cinn_reduce_symbolic_demo COMMAND diff --git a/test/ir/pir/cinn/symbolic/test_llama_if_dy.py b/test/ir/pir/cinn/symbolic/test_llama_if_dy.py new file mode 100644 index 0000000000000..7d2c338797260 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_llama_if_dy.py @@ -0,0 +1,112 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + + +def apply_to_static(net, use_cinn, input_spec=None): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static( + net, + input_spec=input_spec, + build_strategy=build_strategy, + full_graph=True, + ) + + +class PrepareDecoderAttentionMask(nn.Layer): + def __init__(self): + super().__init__() + + # [batch_size, src_length] -> [batch_size, 1, tgt_length, src_length] + def _expand_2d_mask(self, mask, target_length): + batch_size, src_length = mask.shape[0], mask.shape[-1] + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, target_length, src_length]) + + return expanded_mask + + def _make_causal_mask(self, input_ids_shape): + batch_size, seq_len = input_ids_shape + + mask = paddle.tril(paddle.ones((seq_len, seq_len), dtype="bool")) + + # [bs, 1, seq_len, seq_len] + return mask[None, None, :, :].expand([batch_size, 1, seq_len, seq_len]) + + def forward(self, input_ids, attention_mask): + input_shape = paddle.shape(input_ids) + + expanded_attn_mask = self._expand_2d_mask( + attention_mask, target_length=input_shape[-1] + ) + combined_attention_mask = self._make_causal_mask(input_shape) + if input_shape[-1] > 1: + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + expanded_attn_mask = paddle.where( + expanded_attn_mask, 0.0, paddle.finfo("float32").min + ).astype("float32") + return expanded_attn_mask + + +class TestPrepareDecoderAttentionMask(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + 
self.prepare_data() + + def prepare_data(self): + self.input_ids = paddle.randint( + low=0, high=2048, shape=[1, 2048], dtype="int64" + ) + self.input_ids.stop_gradient = False + + self.attention_mask = paddle.ones([1, 2048], dtype="bool") + self.attention_mask.stop_gradient = False + + def eval(self, use_cinn=False, mode="static"): + net = PrepareDecoderAttentionMask() + input_spec = [ + InputSpec(shape=[None, None], dtype="int64"), + InputSpec(shape=[None, None], dtype="bool"), + ] + if mode == "static": + net = apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.input_ids, self.attention_mask) + return out + + def test_eval(self): + eager_outs = self.eval(mode="eager") + dy_outs = self.eval(use_cinn=False) + + for cinn_out, dy_out in zip(eager_outs, dy_outs): + np.testing.assert_allclose( + cinn_out.numpy(), dy_out.numpy(), atol=1e-8 + ) + + +if __name__ == '__main__': + unittest.main() From 18df00746ecbb65b60cec6cdd1cf50b6875b6e37 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Fri, 23 Feb 2024 15:31:51 +0800 Subject: [PATCH 32/82] Modify the judgment conditions of the PADDLE_CUDA_INSTALL_REQUIREMENTS option (#61973) * fix * update --- python/setup.py.in | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/setup.py.in b/python/setup.py.in index 520a9f7f7a56c..329f092d44801 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -399,7 +399,7 @@ def get_paddle_extra_install_requirements(): paddle_cuda_install_requirements = os.getenv( "PADDLE_CUDA_INSTALL_REQUIREMENTS", None ) - if paddle_cuda_install_requirements is not None: + if paddle_cuda_install_requirements == "ON": PADDLE_CUDA_INSTALL_REQUIREMENTS = { "V11": ( "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " diff --git a/setup.py b/setup.py index 350c62bdf6301..f19c22f909d07 100644 --- a/setup.py +++ b/setup.py @@ -927,7 +927,7 @@ def get_paddle_extra_install_requirements(): paddle_cuda_install_requirements = os.getenv( "PADDLE_CUDA_INSTALL_REQUIREMENTS", None ) - if paddle_cuda_install_requirements is not None: + if paddle_cuda_install_requirements == "ON": PADDLE_CUDA_INSTALL_REQUIREMENTS = { "V11": ( "nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " From 139db934339a67e57c58ca221cfcafdbcfc2b006 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 15:32:17 +0800 Subject: [PATCH 33/82] Fix some typos(dynamicly, etc) (#61955) --- python/paddle/amp/grad_scaler.py | 36 +++++++-------- python/paddle/autograd/autograd.py | 6 +-- python/paddle/autograd/backward_utils.py | 2 +- python/paddle/autograd/ir_backward.py | 2 +- python/paddle/base/compiler.py | 6 +-- python/paddle/base/core.py | 2 +- python/paddle/base/device_worker.py | 2 +- python/paddle/base/dygraph/base.py | 4 +- python/paddle/base/dygraph/tracer.py | 2 +- python/paddle/base/framework.py | 12 ++--- python/paddle/base/layers/math_op_patch.py | 8 ++-- python/paddle/base/unique_name.py | 2 +- python/paddle/base/variable_index.py | 12 ++--- python/paddle/framework/io.py | 12 ++--- python/paddle/hapi/static_flops.py | 4 +- python/paddle/incubate/asp/asp.py | 30 ++++++------ python/paddle/incubate/autograd/functional.py | 10 ++-- .../distributed/utils/io/save_for_auto.py | 8 ++-- .../transformers/call_transformer.py | 2 +- .../transformers/ifelse_transformer.py | 4 +- .../jit/dy2static/transformers/utils.py | 2 +- python/paddle/jit/sot/symbolic/interpreter.py | 2 +- 
python/paddle/nn/functional/vision.py | 2 +- python/paddle/nn/initializer/Bilinear.py | 2 +- python/paddle/nn/initializer/assign.py | 2 +- python/paddle/nn/initializer/kaiming.py | 6 +-- python/paddle/nn/initializer/normal.py | 2 +- python/paddle/nn/initializer/xavier.py | 2 +- python/paddle/nn/layer/activation.py | 10 ++-- python/paddle/nn/layer/loss.py | 46 +++++++++---------- python/paddle/nn/layer/norm.py | 14 +++--- python/paddle/nn/layer/pooling.py | 6 +-- python/paddle/profiler/profiler.py | 2 +- .../quantization/imperative/fuse_utils.py | 2 +- .../quantization/imperative/ptq_registry.py | 2 +- python/paddle/tensor/manipulation.py | 2 +- 36 files changed, 135 insertions(+), 135 deletions(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 98053bd4d6398..3ba6f28fd4467 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -62,7 +62,7 @@ class AmpScaler: steps with finite gradients. Default is 1000. decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n accumulated steps with nan or inf gradients. Default is 2. - use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. + use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True. Returns: An AmpScaler object. @@ -258,7 +258,7 @@ def minimize(self, optimizer, *args, **kwargs): self._cache_founf_inf = False if self._use_dynamic_loss_scaling: - # uopdate the scale + # update the scale self._update() self._optimizer_states = defaultdict(_refresh_optimizer_state) @@ -412,7 +412,7 @@ def is_use_dynamic_loss_scaling(self): Whether to use dynamic loss scaling. Returns: - bool: if fixed loss_scaling is used return False, if the loss scaling is updated dynamicly return true. + bool: if fixed loss_scaling is used return False, if the loss scaling is updated dynamically return true. """ return self._use_dynamic_loss_scaling @@ -420,7 +420,7 @@ def get_init_loss_scaling(self): """ Return the initial loss scaling factor. - Reurns: + Returns: float: the initial loss scaling factor. """ return self._init_loss_scaling @@ -441,7 +441,7 @@ def get_incr_ratio(self): """ Return the multiplier to use when increasing the loss scaling. - Reurns: + Returns: float: the multiplier to use when increasing the loss scaling. """ return self._incr_ratio @@ -460,7 +460,7 @@ def get_decr_ratio(self): """ Get the less-than-one-multiplier to use when decreasing the loss scaling. - Reurns: + Returns: float: the less-than-one-multiplier to use when decreasing the loss scaling. """ return self._decr_ratio @@ -479,7 +479,7 @@ def get_incr_every_n_steps(self): """ Return the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients. - Reurns: + Returns: int: the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients. """ return self._incr_every_n_steps @@ -497,7 +497,7 @@ def get_decr_every_n_nan_or_inf(self): """ Return the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients. - Reurns: + Returns: int: the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients. 
""" return self._decr_every_n_nan_or_inf @@ -515,7 +515,7 @@ def state_dict(self): """ Returns the state of the scaler as a `dict`, If this instance is not enabled, returns an empty dict. - Reurns: + Returns: A dict of scaler includes: scale (tensor): The loss scaling factor. incr_ratio(float): The multiplier to use when increasing the loss scaling. @@ -524,7 +524,7 @@ def state_dict(self): decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. incr_count(int): The number of recent consecutive unskipped steps. decr_count(int): The number of recent consecutive skipped steps. - use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. + use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True. """ return ( { @@ -597,7 +597,7 @@ class GradScaler(AmpScaler): steps with finite gradients. Default is 2000. decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n accumulated steps with nan or inf gradients. Default is 1. - use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. + use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True. Returns: An GradScaler object. @@ -869,7 +869,7 @@ def is_use_dynamic_loss_scaling(self): Whether to use dynamic loss scaling. Returns: - bool: if fixed loss_scaling is used return False, if the loss scaling is updated dynamicly return true. + bool: if fixed loss_scaling is used return False, if the loss scaling is updated dynamically return true. Examples: .. code-block:: python @@ -895,7 +895,7 @@ def get_init_loss_scaling(self): """ Return the initial loss scaling factor. - Reurns: + Returns: float: the initial loss scaling factor. Examples: @@ -952,7 +952,7 @@ def get_incr_ratio(self): """ Return the multiplier to use when increasing the loss scaling. - Reurns: + Returns: float: the multiplier to use when increasing the loss scaling. Examples: @@ -1009,7 +1009,7 @@ def get_decr_ratio(self): """ Get the less-than-one-multiplier to use when decreasing the loss scaling. - Reurns: + Returns: float: the less-than-one-multiplier to use when decreasing the loss scaling. Examples: @@ -1066,7 +1066,7 @@ def get_incr_every_n_steps(self): """ Return the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients. - Reurns: + Returns: int: the num `n`, `n` represent increases loss scaling every `n` consecutive steps with finite gradients. Examples: @@ -1123,7 +1123,7 @@ def get_decr_every_n_nan_or_inf(self): """ Return the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients. - Reurns: + Returns: int: the num `n`, `n` represent decreases loss scaling every `n` accumulated steps with nan or inf gradients. Examples: @@ -1189,7 +1189,7 @@ def state_dict(self): decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. incr_count(int): The number of recent consecutive unskipped steps. decr_count(int): The number of recent consecutive skipped steps. 
- use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. + use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamically. Default is True. Examples: diff --git a/python/paddle/autograd/autograd.py b/python/paddle/autograd/autograd.py index 93e0a845908b1..1a1d5a8d66611 100644 --- a/python/paddle/autograd/autograd.py +++ b/python/paddle/autograd/autograd.py @@ -48,7 +48,7 @@ class Jacobian: Notes: - Eclipsis index is not supported currently. + Ellipsis index is not supported currently. Args: @@ -495,7 +495,7 @@ def jacobian( Returns: - Union[Tuple[Tuple[Jacobian, ...], ...], Tuple[Jacobian, ...], Jacobian]: Jacobian(s) of ys deriveted from xs. + Union[Tuple[Tuple[Jacobian, ...], ...], Tuple[Jacobian, ...], Jacobian]: Jacobian(s) of ys derived from xs. Examples: @@ -579,7 +579,7 @@ def hessian( Returns: - Union[Tuple[Tuple[Hessian, ...], ...], Tuple[Hessian, ...], Hessian]: Hessian(s) of ys deriveted from xs. + Union[Tuple[Tuple[Hessian, ...], ...], Tuple[Hessian, ...], Hessian]: Hessian(s) of ys derived from xs. Examples: diff --git a/python/paddle/autograd/backward_utils.py b/python/paddle/autograd/backward_utils.py index e3e6326ba61cc..f0d90d08426d3 100644 --- a/python/paddle/autograd/backward_utils.py +++ b/python/paddle/autograd/backward_utils.py @@ -188,7 +188,7 @@ def __init__(self, block): self.opgrad_to_op = collections.defaultdict(list) # only for controlflow # inside_value is sub block value, which will yield to parent block, - # parant block value is outside_value + # parent block value is outside_value self.inside_value_to_outside_value_map = ValueDict() def turn_map(self) -> None: diff --git a/python/paddle/autograd/ir_backward.py b/python/paddle/autograd/ir_backward.py index 9c751f82238fa..042a541eb69f5 100644 --- a/python/paddle/autograd/ir_backward.py +++ b/python/paddle/autograd/ir_backward.py @@ -555,7 +555,7 @@ def append_yield( # there are four patterns: # [builtin.combine , op1] (op1's one input is vectorType, outputs are not vectorType) # [op2 , builtin.split] (op2's inputs are not vectorType, one output is vectorType) - # [builtin.combine , op3 , buitin.split] (op3's one input and one output are vectorType) + # [builtin.combine , op3 , builtin.split] (op3's one input and one output are vectorType) # [op4] (op4's inputs and outputs are not vectorType) # -----------------only for control flow-----------------# diff --git a/python/paddle/base/compiler.py b/python/paddle/base/compiler.py index 79aae31a1b147..7b8646eb00b70 100644 --- a/python/paddle/base/compiler.py +++ b/python/paddle/base/compiler.py @@ -495,7 +495,7 @@ def func_compile(): @staticmethod def patch_program_cache(ipu_strategy): - """Monkey patch ProgramCache discriptor to support dynamic2static in IPU. + """Monkey patch ProgramCache descriptor to support dynamic2static in IPU. Args: ipu_strategy: The ipu_strategy used in dynamic graph. @@ -528,7 +528,7 @@ def patch_getter(self, item): ) if self._caches and not ipu_strategy.need_compile: logging_utils.warn( - "dynamic2static on IPU doesn't support mutiple caches. Please make sure" + "dynamic2static on IPU doesn't support multiple caches. Please make sure" "dynamic inputs is not used." ) concrete_program, _ = self._build_once(item) @@ -751,7 +751,7 @@ def set_graph_config( num_ipus (int, optional): Number of IPU devices. 
Default 1, which means only use 1 IPU. is_training (bool, optional): True is training graph, False is inference graph. Default True, which means is training mode. batch_size (int, optional): The batch-size in the graph. Used to make the graph batch-size fixed, - if the batch-size in the graph is dynamic. Default 1, which means the batch-size would be set 1, if the batch-size is dynamice. + if the batch-size in the graph is dynamic. Default 1, which means the batch-size would be set 1, if the batch-size is dynamic. enable_manual_shard (bool, optional): Enable graph sharding or not. Only if num_ipus > 1, enable_manual_shard is able to be set True. Default False, which means disabled. diff --git a/python/paddle/base/core.py b/python/paddle/base/core.py index fcb1f5605e823..765c63fd2d6d0 100644 --- a/python/paddle/base/core.py +++ b/python/paddle/base/core.py @@ -249,7 +249,7 @@ def to_list(s): # NOTE(zhiqiu): An error may occurs when import paddle in linux platform with glibc < 2.22, # the error message of which is "dlopen: cannot load any more object with static TLS". # This happens when: -# (1) the number of dynamic shared librarys (DSO) loaded > 14, +# (1) the number of dynamic shared libraries (DSO) loaded > 14, # (2) after that, load a dynamic shared library (DSO) with static TLS. # For paddle, the problem is that 'libgomp' is a DSO with static TLS, and it is loaded after 14 DSOs. # So, here is a tricky way to solve the problem by pre load 'libgomp' before 'libpaddle.so'. diff --git a/python/paddle/base/device_worker.py b/python/paddle/base/device_worker.py index c20677f6acd5e..26b351befda2f 100644 --- a/python/paddle/base/device_worker.py +++ b/python/paddle/base/device_worker.py @@ -629,7 +629,7 @@ def _gen_worker_desc(self, trainer_desc): schedule_mode_str = pipeline_opt["schedule_mode"] # F-then-B scheduler which runs Forward phase for all microbatches, # then runs Backward phase for all microbatches. - # 1F1B scheduler, which runs forward phase and backward phase altertively + # 1F1B scheduler, which runs forward phase and backward phase alternatively # after startup phase. assert schedule_mode_str in ["F-then-B", "1F1B"], ( "The schedule mode " "for pipeline must be one of F-then-B or 1F1B" diff --git a/python/paddle/base/dygraph/base.py b/python/paddle/base/dygraph/base.py index 1e20398440bec..4f233cfe4d671 100644 --- a/python/paddle/base/dygraph/base.py +++ b/python/paddle/base/dygraph/base.py @@ -925,11 +925,11 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): framework._current_expected_place(), framework.core.CPUPlace ): # TODO(zhiqiu): we found two problems when enable zero_copy on CPUPlace. - # (1): eigen requires 16-bytes alignments, but the data of numpy array may not statisfy. + # (1): eigen requires 16-bytes alignments, but the data of numpy array may not satisfy. # Details: https://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html # (2): when used in flask framework, it may result in hang. # Details: https://github.com/PaddlePaddle/Paddle/issues/26635 - # So, we temporally diable the zero_copy strategy. + # So, we temporally disable the zero_copy strategy. if zero_copy is True: warnings.warn( "Currently, zero_copy is not supported, and it will be discarded." 
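A side note on the Eigen alignment constraint cited in the dygraph/base.py hunk above: zero-copy from a numpy array into a Paddle tensor is only safe when the numpy buffer already satisfies Eigen's 16-byte alignment, which numpy does not guarantee. Below is a minimal, self-contained Python sketch of such a check; it is illustrative only, and the helper name is made up here rather than taken from Paddle's implementation.

import numpy as np

EIGEN_ALIGNMENT = 16  # bytes, per the comment in dygraph/base.py above

def is_zero_copy_safe(arr: np.ndarray) -> bool:
    # Zero-copy is only safe if the buffer is contiguous and its base
    # address meets the 16-byte alignment Eigen expects.
    return bool(arr.flags['C_CONTIGUOUS']) and arr.ctypes.data % EIGEN_ALIGNMENT == 0

x = np.random.rand(4, 4)
print(is_zero_copy_safe(x))  # may be True or False; numpy gives no alignment guarantee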
diff --git a/python/paddle/base/dygraph/tracer.py b/python/paddle/base/dygraph/tracer.py index 4df9517073c66..966004e5035f4 100644 --- a/python/paddle/base/dygraph/tracer.py +++ b/python/paddle/base/dygraph/tracer.py @@ -142,7 +142,7 @@ def eager_legacy_trace_op( assert out_name in outputs.keys() num_outs = len(outputs[out_name]) arg_to_append = num_outs - # NOTE(dev): For MasterParam/MasterParamOut in optimzer op + # NOTE(dev): For MasterParam/MasterParamOut in optimizer op elif "Var" in arg_name[-3:]: out_name = arg_name[:-3] print(out_name) diff --git a/python/paddle/base/framework.py b/python/paddle/base/framework.py index 8fe13c16bdd0b..84077b768b995 100644 --- a/python/paddle/base/framework.py +++ b/python/paddle/base/framework.py @@ -4430,7 +4430,7 @@ def create_parameter(self, *args, **kwargs): else: param = Parameter(global_block, *args, **kwargs) # NOTE(Aurelius84): we deliver stop_gradient in append_op, so we - # need recorde it state and reset it back after calling this API + # need record it state and reset it back after calling this API stop_gradient = param.stop_gradient if 'initializer' in kwargs: @@ -6588,7 +6588,7 @@ def _prune_with_input(self, feeded_var_names, targets): "Variable or Operator, but received %s." % type(t) ) - # NOTEZ(zhiqiu): For variable to be fed in fetch_list, there two cases: + # NOTE(zhiqiu): For variable to be fed in fetch_list, there two cases: # (1) the variable is leaf, it has no op that generates it; # (2) the variable is not leaf, and we need to prune the op that generates it. # In both cases, wo can just skip target_op of that it. @@ -6810,7 +6810,7 @@ def parse_from_string(binary_str): Args: - binary_str_type (str): the binary prootbuf string. + binary_str_type (str): the binary protobuf string. Returns: Program: A deserialized Program. @@ -7198,7 +7198,7 @@ def all_parameters(self): Get all :ref:`api_guide_parameter_en` from this Program. A list object is returned. Returns: - list[ :ref:`api_guide_parameter_en` ]: The list contians all parameters in this program. + list[ :ref:`api_guide_parameter_en` ]: The list contains all parameters in this program. Examples: .. code-block:: python @@ -7250,7 +7250,7 @@ def state_dict(self, mode='all', scope=None): obtained through 'paddle.static.global_scope()'. Otherwise, value will be set to scope. Default: None - Retruns: + Returns: dict: a dict contains the parameters and persistable buffers. Examples: @@ -7274,7 +7274,7 @@ def state_dict(self, mode='all', scope=None): >>> paddle.save(prog.state_dict(), path) """ # The 'framework' is a low-level module, and 'executor' - # can not be imported at the begainning of this file. + # can not be imported at the beginning of this file. # Therefore, the above two modules are dynamically imported. from .executor import global_scope diff --git a/python/paddle/base/layers/math_op_patch.py b/python/paddle/base/layers/math_op_patch.py index e4b9ed5198a9e..00d0faaedd0dd 100644 --- a/python/paddle/base/layers/math_op_patch.py +++ b/python/paddle/base/layers/math_op_patch.py @@ -258,7 +258,7 @@ def place(self): """ Variable don't have 'place' interface in static graph mode But this interface can greatly facilitate dy2static. - So we give a warnning here and return None. + So we give a warning here and return None. """ warnings.warn( "Variable do not have 'place' interface for static graph mode, try not to use it. None will be returned." 
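A note on the stub pattern in the math_op_patch.py hunks above and below: in static graph mode, Variable imitates dynamic-graph attributes such as `place` by emitting a warning and returning a harmless default instead of raising, which keeps dy2static-converted code running. A minimal sketch of that pattern follows; the class and property names here are invented for illustration and are not Paddle's.

import warnings

class StaticVarStub:
    @property
    def place(self):
        # Same idea as the hunk above: warn on access and return None
        # rather than raising, so converted dynamic-graph code keeps going.
        warnings.warn("'place' is unavailable in static graph mode; returning None.")
        return None

v = StaticVarStub()
print(v.place)  # emits the warning, then prints None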
@@ -269,7 +269,7 @@ def contiguous(self): """ Variable don't have 'contiguous' interface in static graph mode But this interface can greatly facilitate dy2static. - So we give a warnning here and return None. + So we give a warning here and return None. """ warnings.warn( "Variable do not have 'contiguous' interface for static graph mode, try not to use it. self will be returned." @@ -281,7 +281,7 @@ def is_contiguous(self): """ Variable don't have 'is_contiguous' interface in static graph mode But this interface can greatly facilitate dy2static. - So we give a warnning here and return None. + So we give a warning here and return None. """ warnings.warn( "Variable do not have 'is_contiguous' interface for static graph mode, try not to use it. True will be returned." @@ -360,7 +360,7 @@ def append(self, var): """ if not isinstance(var, Variable): if in_to_static_mode(): - """in dy2static mode, x may be tensorable values such as int, float, np.array""" + """In dy2static mode, x may be tensor values such as int, float, np.array""" from paddle.tensor.creation import to_tensor var = to_tensor(var) diff --git a/python/paddle/base/unique_name.py b/python/paddle/base/unique_name.py index 95acd00cc60ea..9541d411078aa 100644 --- a/python/paddle/base/unique_name.py +++ b/python/paddle/base/unique_name.py @@ -211,7 +211,7 @@ def guard(new_generator=None): Args: new_generator(str|bytes, optional): New name of global namespace. Note that str - in Python2 was spilted into str and bytes in Python3, so here are two + in Python2 was splitted into str and bytes in Python3, so here are two types. Default is None. If not None, new_generator will be added into the prefix of unique name generated by :code:`generate()`. diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py index dd202bfb93d13..6ccfe1c6164d2 100644 --- a/python/paddle/base/variable_index.py +++ b/python/paddle/base/variable_index.py @@ -173,12 +173,12 @@ def deal_advanced_index( Transpose origin Tensor and advanced indices to the front. Returns: - transed_tensor (Tensor): transposed tensor, corresbonding with advanced indices - transed_index (List): advanced indices transed to the front + transed_tensor (Tensor): transposed tensor, corresponding with advanced indices + transed_index (List): advanced indices transposed to the front trans_back_dim (List): order of axes to transpose back to original order. Only used in __setitem__. pos_of_new_dim (int): axis of new dim in the result. Only used in __getitem__. rank_of_new_dim (int): rank of new dim in the result. Only used in __getitem__. - transed_value_tensor (Tensor): value tensor transed to the front. Only used in __setitem__. + transed_value_tensor (Tensor): value tensor transposed to the front. Only used in __setitem__. 
""" transed_dim = [] transed_index = [] @@ -771,7 +771,7 @@ def get_tensor_with_basic_indexing( else: stride = attrs['strides'] if use_strided_slice: - # TODO(zoooo0820): suppport strided_slice_array until PIR API is ready + # TODO(zoooo0820): support strided_slice_array until PIR API is ready out = paddle._C_ops.strided_slice(x, axes, st, end, stride) if len(decrease_axes) > 0: @@ -883,7 +883,7 @@ def _getitem_static(x, indices): _, ) = deal_advanced_index(out, advanced_index, False, None) - # TODO(zooooo0820): Replacing gather_nd to another advanded OP for handling of mixed indexes more efficiently + # TODO(zooooo0820): Replacing gather_nd to another advanced OP for handling of mixed indexes more efficiently if len(adjusted_advanced_index) == 1 and adjusted_advanced_index[ 0 ].dtype in (paddle.bool, paddle.base.libpaddle.BOOL): @@ -919,7 +919,7 @@ def _getitem_static(x, indices): def parse_bool_and_broadcast_indices(indices): # deal with multiple Tensors and translating bool tensor to int tensor. - # In static mode, bool-tensor cannot be broadcasted since its corressponding int tensor's shape cannot be infered. + # In static mode, bool-tensor cannot be broadcasted since its corresponding int tensor's shape cannot be infered. for i, indice in enumerate(indices): if ( indice.dtype == paddle.bool diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 8c794b4ff2ef0..c0015f6704a88 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -68,7 +68,7 @@ def async_save(obj, path, protocol=4, sync_other_task=False, **configs): Note: currently only support dygraph mode. Note: - any argument passed through configs will be overrided by default setting. + any argument passed through configs will be overridden by default setting. Args: obj(Object) : The object to be saved. path(str|BytesIO) : The path/buffer of the object to be saved. @@ -76,7 +76,7 @@ def async_save(obj, path, protocol=4, sync_other_task=False, **configs): protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. Default: 4 sync_other_task(bool) : Determine whether to wait other async save task to be finished before this one be put in queue. - **configs(dict, optional): compatible argument to paddle.save, but will be overrided by default setting. + **configs(dict, optional): compatible argument to paddle.save, but will be overridden by default setting. Examples: .. code-block:: python :name: code-example-1 @@ -98,7 +98,7 @@ def async_save(obj, path, protocol=4, sync_other_task=False, **configs): ) if len(configs) > 0: warnings.warn( - "configs are not supported in async mode, will be overided by default settings." + "configs are not supported in async mode, will be overridden by default settings." ) # TODO: make this part async @@ -596,13 +596,13 @@ def tuple_to_tensor(obj): def ndarray_to_tensor(obj): return _ndarray_to_tensor(obj, return_numpy=return_numpy) - # tuple(name, ndarry) was converted from varbase of paddle2.1, - # and all tuple(name, ndarry) are converted to tensor. + # tuple(name, ndarray) was converted from varbase of paddle2.1, + # and all tuple(name, ndarray) are converted to tensor. 
if _contain_x(obj, _transformed_from_varbase): return _parse_every_object( obj, _transformed_from_varbase, tuple_to_tensor ) - # If there is no tuple(name, ndary), it is considered to be saved by paddle2.0 + # If there is no tuple(name, ndarray), it is considered to be saved by paddle2.0 # or converted from LoDTensor, and all ndarrays are converted to tensor. else: return _parse_every_object( diff --git a/python/paddle/hapi/static_flops.py b/python/paddle/hapi/static_flops.py index 6d9209ae76a4a..a627dbb68ea4a 100644 --- a/python/paddle/hapi/static_flops.py +++ b/python/paddle/hapi/static_flops.py @@ -55,7 +55,7 @@ def type(self): def inputs(self, name): """ - Get all the varibales by the input name. + Get all the variables by the input name. """ if name in self._op.input_names: return [ @@ -66,7 +66,7 @@ def inputs(self, name): def outputs(self, name): """ - Get all the varibales by the output name. + Get all the variables by the output name. """ return [self._graph.var(var_name) for var_name in self._op.output(name)] diff --git a/python/paddle/incubate/asp/asp.py b/python/paddle/incubate/asp/asp.py index a1de1937c70cd..fbe1eac9b9d26 100644 --- a/python/paddle/incubate/asp/asp.py +++ b/python/paddle/incubate/asp/asp.py @@ -106,7 +106,7 @@ def set_excluded_layers(param_names, main_program=None): ... prob = my_layer(input_data) ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) ... - ... # Setup exluded layers out from ASP workflow. + ... # Setup excluded layers out from ASP workflow. ... # Please note, excluded_layers must be set before calling optimizer.minimize(). ... paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) ... @@ -126,7 +126,7 @@ def set_excluded_layers(param_names, main_program=None): def reset_excluded_layers(main_program=None): r""" - Reset exculded layers setting corresponding to :attr:`main_program`. If :attr:`main_program` + Reset excluded layers setting corresponding to :attr:`main_program`. If :attr:`main_program` is None, then all configurations of excluded_layers would be cleaned. Args: @@ -196,7 +196,7 @@ def reset_excluded_layers(main_program=None): ... prob = my_layer(input_data) ... loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) ... - ... # Setup exluded layers out from ASP workflow. + ... # Setup excluded layers out from ASP workflow. ... # Please note, excluded_layers must be set before calling optimizer.minimize(). ... paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) ... # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. @@ -216,7 +216,7 @@ def reset_excluded_layers(main_program=None): def decorate(optimizer): r""" Wrap the given optimizer as a OptimizerWithSparsityGuarantee, - If runnig with dynamic graph mode. ASP would creates mask variables for supported parameters. + If running with dynamic graph mode. ASP would creates mask variables for supported parameters. Else if in static graph mode, ASP would creates mask variables and inserts necessary ops when calling minimize() @@ -310,15 +310,15 @@ def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): *Note*: (Static graph mode) If calling this function with :attr:`with_mask`, it should call `OptimizerWithSparsityGuarantee.minimize` and initialization (`exe.run(startup_program`)) before (For successfully obtain mask Variable). 
Typically set `with_mask` as true for training (have called `OptimizerWithSparsityGuarantee.minimize`) and false for - inference only. To obtain OptimizerWithSparsityGuarantee, please see `paddle.incubate.asp.decoreate()`. + inference only. To obtain OptimizerWithSparsityGuarantee, please see `paddle.incubate.asp.decorate()`. Args: model (Program|nn.Layer): Program with model definition and its parameters, or a object of `paddle.nn.Layer`. n (int, optional): n of `n:m` sparse pattern. Default is 2. m (int, optional): m of `n:m` sparse pattern. Default is 4. - mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`. - The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. - with_mask (bool, optional): To prune mask Variables related to parameters or not. True is purning also, False is not. Default is True. + mask_algo (string, optional): The function name to generate sparse mask. Default is `mask_1d`. + The valid inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. + with_mask (bool, optional): To prune mask Variables related to parameters or not. True is pruning also, False is not. Default is True. Returns: dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. Examples: @@ -476,9 +476,9 @@ def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): class ProgramASPInfo: r""" - ProgramASPInfo is a container to keep ASP relevant information of Pragrom. It contains three inner-variables: - 1. __mask_vars (Dictionary): Key is parameter's name and vaule is its corresponding sparse mask Variable object, which is created by `ASPHelper.create_mask_variables`. - 2. __masks (Dictionary): Key is parameter's name and vaule is its corressponding sparse mask Numpy array, which is created by `ASPHelper.prune_model`. + ProgramASPInfo is a container to keep ASP relevant information of Program. It contains three inner-variables: + 1. __mask_vars (Dictionary): Key is parameter's name and value is its corresponding sparse mask Variable object, which is created by `ASPHelper.create_mask_variables`. + 2. __masks (Dictionary): Key is parameter's name and value is its corresponding sparse mask Numpy array, which is created by `ASPHelper.prune_model`. 3. __excluded_layers (List): It stores name of layers which should not involve into ASP workflow. """ @@ -552,7 +552,7 @@ def decorate(optimizer): if paddle.in_dynamic_mode(): # main_prog and startup_prog would be used with paddle.static.program_guard # to create ASP masks. Moreover, main_prog is a key to map paddle.static.Program - # to its own ASP informantion, like ASP mask variables. For dynamic graph, we use + # to its own ASP information, like ASP mask variables. For dynamic graph, we use # default_main_program as the key. main_prog = paddle.static.default_main_program() startup_prog = paddle.static.default_startup_program() @@ -809,7 +809,7 @@ def _minimize( 3. Insert masking ops in the end of parameters update. *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. - (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + (Due to there is a invisible graphs optimization in `Fleet.minimize()` which make training graph cannot be modified anymore.) Args: @@ -849,7 +849,7 @@ def _step(cls, optimizer): 2. Mask parameters with sparse masks. *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. 
- (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + (Due to there is a invisible graphs optimization in `Fleet.minimize()` which make training graph cannot be modified anymore.) Args: @@ -963,7 +963,7 @@ def step(self): 2. Mask parameters with sparse masks. *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. - (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + (Due to there is a invisible graphs optimization in `Fleet.minimize()` which make training graph cannot be modified anymore.) Args: diff --git a/python/paddle/incubate/autograd/functional.py b/python/paddle/incubate/autograd/functional.py index 5f4f5c6a76f17..8bec01b1c39ae 100644 --- a/python/paddle/incubate/autograd/functional.py +++ b/python/paddle/incubate/autograd/functional.py @@ -577,9 +577,9 @@ def _grad(ys, xs, v=None): inputs. """ if framework.in_dygraph_mode(): - # paddle.grad returns a list though the inputs is a signle Tensor. The + # paddle.grad returns a list though the inputs is a single Tensor. The # follow code snippet fixes the problem by return the first element of - # xs_grad when the xs is a signle Tensor. + # xs_grad when the xs is a single Tensor. xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True) if ( isinstance(xs, paddle.base.framework.Variable) @@ -595,12 +595,12 @@ def _grad(ys, xs, v=None): def _separate(xs): """ ``_separate`` separates ``xs`` from the computation graph through ``clone`` - or ``deteach`` . + or ``detach`` . - Interally, ``paddle.grad(xs, ys)`` is stateful API implemented based on + Internally, ``paddle.grad(xs, ys)`` is stateful API implemented based on computional graph, which will reduce gradients along all path from ys to xs. - However, funcional autograd API such as ``vjp``, ``jvp`` is stateless, and + However, functional autograd API such as ``vjp``, ``jvp`` is stateless, and only compute gradients with a given ``func`` . For example, given a ``func`` :math:`y0=f(x0)`, supposing forward path is: diff --git a/python/paddle/incubate/distributed/utils/io/save_for_auto.py b/python/paddle/incubate/distributed/utils/io/save_for_auto.py index 00a1c7feb6c69..4e3658d8ddd10 100644 --- a/python/paddle/incubate/distributed/utils/io/save_for_auto.py +++ b/python/paddle/incubate/distributed/utils/io/save_for_auto.py @@ -40,12 +40,12 @@ def save_for_auto_inference(path_prefix, dist_model, cvt2cpu=False): MoE not supported till MoE is supported in auto parallel mode. Args: - path_prefix: path prefix to save. If `path_preifx` ends with path sepreator, + path_prefix: path prefix to save. If `path_prefix` ends with path separator, the path is processed as a directory and parameters will be saved in it, - automatically named saved_parameters. Otherwisw, the parameters will be saved with name - path_preifx_dist{global_rank}.pdparams and path_preifx_dist{global_rank}.pdattrs. + automatically named saved_parameters. Otherwise, the parameters will be saved with name + path_prefix_dist{global_rank}.pdparams and path_prefix_dist{global_rank}.pdattrs. dist_model: model in distributed model. - cvt2cpu: wheather to move parameters to CPU when using sharding stage 3. + cvt2cpu: whether to move parameters to CPU when using sharding stage 3. The var is invalid if not using sharding stage 3. 
Returns: diff --git a/python/paddle/jit/dy2static/transformers/call_transformer.py b/python/paddle/jit/dy2static/transformers/call_transformer.py index eaa181d48ce02..9e2c73f4cb7fc 100644 --- a/python/paddle/jit/dy2static/transformers/call_transformer.py +++ b/python/paddle/jit/dy2static/transformers/call_transformer.py @@ -69,7 +69,7 @@ def visit_Call(self, node): func_str = ast_to_source_code(node.func).strip() - # NOTE(liym27): Don't convert `pad.set_trace` even if the convertion doesn't work finally, because + # NOTE(liym27): Don't convert `pad.set_trace` even if the conversion doesn't work finally, because # it is clearer to see where it is called from. if PDB_SET in func_str: return node diff --git a/python/paddle/jit/dy2static/transformers/ifelse_transformer.py b/python/paddle/jit/dy2static/transformers/ifelse_transformer.py index 7645c6617769c..fc8ab6cd0bb1a 100644 --- a/python/paddle/jit/dy2static/transformers/ifelse_transformer.py +++ b/python/paddle/jit/dy2static/transformers/ifelse_transformer.py @@ -231,7 +231,7 @@ def visit_Assign(self, node): def visit_FunctionDef(self, node): # NOTE: We skip to visit names of get_args and set_args, because they contains # nonlocal statement such as 'nonlocal x, self' where 'self' should not be - # parsed as returned value in contron flow. + # parsed as returned value in control flow. if ( GET_ARGS_FUNC_PREFIX in node.name or SET_ARGS_FUNC_PREFIX in node.name @@ -343,7 +343,7 @@ def transform_if_else(node, root): nonlocal_names = _valid_nonlocal_names(return_name_ids, nonlocal_names) # TODO(dev): Need a better way to deal this. - # LoopTransformer will create some special vars, which is not visiable by users. so we can sure it's safe to remove them. + # LoopTransformer will create some special vars, which is not visible by users. so we can sure it's safe to remove them. filter_names = [ ARGS_NAME, FOR_ITER_INDEX_PREFIX, diff --git a/python/paddle/jit/dy2static/transformers/utils.py b/python/paddle/jit/dy2static/transformers/utils.py index e74d95e1af9e0..37e5a400e31c4 100644 --- a/python/paddle/jit/dy2static/transformers/utils.py +++ b/python/paddle/jit/dy2static/transformers/utils.py @@ -499,7 +499,7 @@ def pre_func(): def post_func(): """NOTE: why we need merge w_vars and push_pop_vars here ? - because we do ifelse_transformer after loop_transformer. Loops will changed into functioons. but we know this function will be called in if. so we add w_vars to father function scope. + because we do ifelse_transformer after loop_transformer. Loops will changed into functions. but we know this function will be called in if. so we add w_vars to father function scope. 
""" control_flow_function_def = [ WHILE_BODY_PREFIX, diff --git a/python/paddle/jit/sot/symbolic/interpreter.py b/python/paddle/jit/sot/symbolic/interpreter.py index 3179a4c518f82..6b60a2bbbb5fe 100644 --- a/python/paddle/jit/sot/symbolic/interpreter.py +++ b/python/paddle/jit/sot/symbolic/interpreter.py @@ -187,7 +187,7 @@ def wrapper(args): def prepare_state(SIR, inputs): state = {} - # update free vars if exsits + # update free vars if exists if SIRRuntimeCache().has_key(SIR.name): free_var_seeker = SIRRuntimeCache().get_free_vars(SIR.name) if free_var_seeker: diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 7a76f35b9589c..2e5c988ab0c8e 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -148,7 +148,7 @@ def grid_sample( indexing the 5th dimension (in width dimension) of input data x, y is indexing the 4th dimension (in height dimension) and z is indexing the 3rd dimension (in depth dimension) finally results is the bilinear - interpolation or nearest value of 8 nearest cornerpoints. The output + interpolation or nearest value of 8 nearest corner points. The output tensor shape will be [N, C, D, H, W]. diff --git a/python/paddle/nn/initializer/Bilinear.py b/python/paddle/nn/initializer/Bilinear.py index 1da82cbeee970..05ac3641caff2 100644 --- a/python/paddle/nn/initializer/Bilinear.py +++ b/python/paddle/nn/initializer/Bilinear.py @@ -120,7 +120,7 @@ def forward(self, var, block=None): weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) weight = np.reshape(weight, shape) - # to be compatible of fp16 initalizers + # to be compatible of fp16 initializers if var.dtype in [ core.VarDesc.VarType.FP16, core.VarDesc.VarType.BF16, diff --git a/python/paddle/nn/initializer/assign.py b/python/paddle/nn/initializer/assign.py index 3988f9f14859d..0d04cbbb78398 100644 --- a/python/paddle/nn/initializer/assign.py +++ b/python/paddle/nn/initializer/assign.py @@ -66,7 +66,7 @@ def forward(self, var, block=None): ) assert isinstance(block, (framework.Block, paddle.pir.Block)) - # to be compatible of fp16 initalizers + # to be compatible of fp16 initializers if var.dtype in [core.VarDesc.VarType.FP16, core.VarDesc.VarType.BF16]: out_dtype = core.VarDesc.VarType.FP32 np_value = self._value.astype("float32") diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index 39329acaf7da1..efb1fc486d059 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -52,7 +52,7 @@ class MSRAInitializer(Initializer): Args: uniform (bool, optional): whether to use uniform or normal distribution. Default is True. - fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None. + fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automatically. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None. seed (int32, optional): random seed. Default is 0. negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0. nonlinearity(str, optional): the non-linear function. Default is relu. 
@@ -107,7 +107,7 @@ def forward(self, var, block=None): if self._seed == 0: self._seed = block.program.random_seed - # to be compatible of fp16 initalizers + # to be compatible of fp16 initializers if var.dtype == core.VarDesc.VarType.FP16 or ( var.dtype == core.VarDesc.VarType.BF16 and not self._uniform ): @@ -252,7 +252,7 @@ class KaimingNormal(MSRAInitializer): \frac{gain}{\sqrt{{fan\_in}}} Args: - fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automaticly. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None. + fan_in (float32|None, optional): fan_in (in_features) of trainable Tensor, If None, it will be infered automatically. If you don't want to use in_features of the Tensor, you can set the value of 'fan_in' smartly by yourself. Default is None. negative_slope (float, optional): negative_slope (only used with leaky_relu). Default is 0.0. nonlinearity(str, optional): the non-linear function. Default is relu. diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py index 4ca0a0902246c..77ecd855b0556 100644 --- a/python/paddle/nn/initializer/normal.py +++ b/python/paddle/nn/initializer/normal.py @@ -200,7 +200,7 @@ def forward(self, var, block=None): if self._seed == 0: self._seed = block.program.random_seed - # to be compatible of fp16 initalizers + # to be compatible of fp16 initializers if var.dtype in [core.VarDesc.VarType.FP16, core.VarDesc.VarType.BF16]: out_dtype = core.VarDesc.VarType.FP32 out_var = block.create_var( diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index e455ca455cd00..fd47805c22133 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -110,7 +110,7 @@ def forward(self, var, block=None): if (isinstance(var, framework.EagerParamBase) and var.is_dist()) else var.shape ) - # to be compatible of fp16 initalizers + # to be compatible of fp16 initializers if var.dtype == core.VarDesc.VarType.FP16 or ( var.dtype == core.VarDesc.VarType.BF16 and not self._uniform ): diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 60e3a95a20d18..59a9436dadb51 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -134,8 +134,8 @@ class GLU(Layer): For more information, please refer to :ref:`api_guide_Name`. Shape: - - input: Tensor which the size of the given aixs is even. - - output: Tensor which the size of the given aixs is halved. + - input: Tensor which the size of the given axis is even. + - output: Tensor which the size of the given axis is halved. Examples: .. code-block:: python @@ -799,7 +799,7 @@ def extra_repr(self): class Sigmoid(Layer): r""" - this interface is used to construct a callable object of the ``Sigmoid`` class. This layer calcluate the `sigmoid` of input x. + this interface is used to construct a callable object of the ``Sigmoid`` class. This layer calculate the `sigmoid` of input x. .. math:: @@ -842,8 +842,8 @@ def extra_repr(self): class Hardsigmoid(Layer): r""" - ``Hardsigmoid`` Activiation Layers, Construct a callable object of - the ``Hardsigmoid`` class. This layer calcluate the `hardsigmoid` of input x. + ``Hardsigmoid`` Activation Layers, Construct a callable object of + the ``Hardsigmoid`` class. This layer calculate the `hardsigmoid` of input x. 
A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), which is much faster than sigmoid. diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 310e06a35a557..1fd2501698c2f 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -68,7 +68,7 @@ class BCEWithLogitsLoss(Layer): batch element. If given, it has to be a 1D Tensor whose size is `[N, ]`, The data type is float32, float64. Default is ``'None'``. reduction (str, optional): Indicate how to average the loss by batch_size, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. @@ -253,7 +253,7 @@ class CrossEntropyLoss(Layer): value needs to be ignored. Only valid when soft_label = False. Default is ``-100`` . reduction (str, optional): Indicate how to average the loss by batch_size, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. @@ -366,15 +366,15 @@ class CrossEntropyLoss(Layer): >>> reduction='mean' >>> weight = None >>> logits = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) - >>> interger_labels = paddle.randint(low=0, high=C, shape=[N], dtype='int64') - >>> one_hot_labels = paddle.nn.functional.one_hot(interger_labels, C).astype('float32') + >>> integer_labels = paddle.randint(low=0, high=C, shape=[N], dtype='int64') + >>> one_hot_labels = paddle.nn.functional.one_hot(integer_labels, C).astype('float32') >>> cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( ... weight=weight, reduction=reduction, label_smoothing=label_smoothing) >>> # integer labels - >>> interger_label_dy_ret = cross_entropy_loss(logits, interger_labels) - >>> print(interger_label_dy_ret) + >>> integer_label_dy_ret = cross_entropy_loss(logits, integer_labels) + >>> print(integer_label_dy_ret) Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True, 1.10520368) @@ -669,7 +669,7 @@ class L1Loss(Layer): Parameters: reduction (str, optional): Indicate the reduction to apply to the loss, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If `reduction` is ``'none'``, the unreduced loss is returned; If `reduction` is ``'mean'``, the reduced mean loss is returned. If `reduction` is ``'sum'``, the reduced sum loss is returned. @@ -765,7 +765,7 @@ class BCELoss(Layer): batch element. If given, has to be a Tensor of size nbatch and the data type is float32, float64. Default is ``'None'``. reduction (str, optional): Indicate how to average the loss by batch_size, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. @@ -864,10 +864,10 @@ class NLLLoss(Layer): ignore_index (int, optional): Specifies a target value that is ignored and does not contribute to the input gradient. 
reduction (str, optional): Indicate how to average the loss, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. Default is ``'mean'``. + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. Default is ``'mean'``. If `reduction` is ``'mean'``, the reduced mean loss is returned; if `reduction` is ``'sum'``, the reduced sum loss is returned; - if `reduction` is ``'none'``, no reduction will be apllied. + if `reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default is ``'None'``. @@ -959,10 +959,10 @@ class PoissonNLLLoss(Layer): A small value to avoid evaluation of :math:`\log(0)` when ``log_input`` = ``False``. ``epsilon > 0``. Default: 1e-8. reduction (str, optional): - Indicate how to reduce the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + Indicate how to reduce the loss, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If `reduction` is ``'mean'``, the reduced mean loss is returned; if `reduction` is ``'sum'``, the reduced sum loss is returned; - if `reduction` is ``'none'``, no reduction will be apllied. + if `reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -997,7 +997,7 @@ def __init__( ): if epsilon <= 0: raise ValueError( - "The value of `epsilon` in PoissonNLLLoss should be positve, but received %f, which is not allowed" + "The value of `epsilon` in PoissonNLLLoss should be positive, but received %f, which is not allowed" % epsilon ) if reduction not in ['sum', 'mean', 'none']: @@ -1048,11 +1048,11 @@ class KLDivLoss(Layer): Parameters: reduction (str, optional): Indicate how to average the loss, - the candicates are ``'none'`` | ``'batchmean'`` | ``'mean'`` | ``'sum'``. + the candidates are ``'none'`` | ``'batchmean'`` | ``'mean'`` | ``'sum'``. If `reduction` is ``'mean'``, the reduced mean loss is returned; If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned; if `reduction` is ``'sum'``, the reduced sum loss is returned; - if `reduction` is ``'none'``, no reduction will be apllied. + if `reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. Shape: @@ -1132,7 +1132,7 @@ class MarginRankingLoss(Layer): Parameters: margin (float, optional): The margin value to add, default value is 0; - reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``. + reduction (str, optional): Indicate the reduction to apply to the loss, the candidates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
Shape: @@ -1188,11 +1188,11 @@ class CTCLoss(Layer): An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) to compute Connectionist Temporal Classification (CTC) loss. It can be aliased as softmax with CTC, since a native softmax activation - is interated to the Warp-CTC library to normalize values for each row of the input tensor. + is integrated to the Warp-CTC library to normalize values for each row of the input tensor. Parameters: blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0. - reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. + reduction (string, optional): Indicate how to average the loss, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. Shape: - log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type should be float32 or float64. @@ -1364,7 +1364,7 @@ class SmoothL1Loss(Layer): Parameters: reduction (str, optional): Indicate how to average the loss by batch_size, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. @@ -1437,7 +1437,7 @@ class MultiLabelSoftMarginLoss(Layer): If given, has to be a Tensor of size C and the data type is float32, float64. Default is ``'None'`` . reduction (str, optional): Indicate how to average the loss by batch_size, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. @@ -1531,7 +1531,7 @@ class HingeEmbeddingLoss(Layer): hinge_embedding_loss. When label is -1, Input smaller than margin are minimized with hinge_embedding_loss. Default = 1.0 reduction (str, optional): Indicate how to average the loss by batch_size, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. @@ -1733,7 +1733,7 @@ class TripletMarginWithDistanceLoss(Layer): and negative samples) if swap distance smaller than negative distance. Default: ``False``. 
reduction (str, Optional):Indicate how to average the loss by batch_size. - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. @@ -1845,7 +1845,7 @@ class TripletMarginLoss(Layer): Default: ``False``. reduction (str, Optional):Indicate how to average the loss by batch_size. - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'sum'``, the summed loss is returned. diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 5fc885e981c5b..ff64b4dfd3de8 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -57,7 +57,7 @@ class _InstanceNormBase(Layer): """ This class is based class for InstanceNorm1D, 2d, 3d. - See InstaceNorm1D, InstanceNorm2D or InstanceNorm3D for more details. + See InstanceNorm1D, InstanceNorm2D or InstanceNorm3D for more details. """ def __init__( @@ -779,7 +779,7 @@ def __init__( ) self._variance.stop_gradient = True - # TODO(qili93): temporary for ascned npu performance to be removed along with npu_identity op + # TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op if ( _global_flags()['FLAGS_npu_storage_format'] and 'npu' in get_all_custom_device_type() @@ -1018,7 +1018,7 @@ def __init__( ) self._variance.stop_gradient = True - # TODO(qili93): temporary for ascned npu performance to be removed along with npu_identity op + # TODO(qili93): temporary for ascend npu performance to be removed along with npu_identity op if ( _global_flags()['FLAGS_npu_storage_format'] and 'npu' in get_all_custom_device_type() @@ -1157,7 +1157,7 @@ def forward(self, input): class BatchNorm1D(_BatchNormBase): r""" - Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . When use_global_stats = False, the :math:`\mu_{\beta}` and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. @@ -1274,7 +1274,7 @@ def _check_input_dim(self, input): class BatchNorm2D(_BatchNormBase): r""" - Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . When use_global_stats = False, the :math:`\mu_{\beta}` and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. 
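The BatchNorm2D wording fixed above ("a mini-batch of 2D inputs with additional channel dimension") corresponds to a 4D NCHW tensor. A minimal sketch of the expected layout, using the public `paddle.nn.BatchNorm2D` API; the tensor sizes are illustrative:

    import paddle

    # Normalizes each channel C over the N, H, W axes; with
    # use_global_stats=False the mean/variance come from this mini-batch.
    bn = paddle.nn.BatchNorm2D(num_features=16)
    x = paddle.randn([4, 16, 32, 32])  # [N, C, H, W]
    y = bn(x)
    print(y.shape)  # [4, 16, 32, 32]

At inference time the tracked moving statistics are used instead of the mini-batch statistics, which is the distinction the docstring draws via `use_global_stats`.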
@@ -1365,7 +1365,7 @@ def _check_input_dim(self, input): class BatchNorm3D(_BatchNormBase): r""" - Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + Applies Batch Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . When use_global_stats = False, the :math:`\mu_{\beta}` and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. @@ -1539,7 +1539,7 @@ class SyncBatchNorm(_BatchNormBase): epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` - of this layer. If it is set to None or one attribute of ParamAttr, this layerr + of this layer. If it is set to None or one attribute of ParamAttr, this layer will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with ones. If it is set to False, this layer will not have trainable scale parameter. Default: None. diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 38fee16aad1b3..23eaf467d916d 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -1143,7 +1143,7 @@ def extra_repr(self): class MaxUnPool1D(Layer): r""" - This API implements max unpooling 1d opereation. + This API implements max unpooling 1d operation. `max_unpool1d` accepts the output of `max_pool1d` as input, including the indices of the maximum value and calculate the partial inverse. @@ -1231,7 +1231,7 @@ def extra_repr(self): class MaxUnPool2D(Layer): r""" - This API implements max unpooling 2d opereation. + This API implements max unpooling 2d operation. 'max_unpool2d' accepts the output of 'max_unpool2d' as input Including the indices of the maximum value and calculating the partial inverse @@ -1323,7 +1323,7 @@ def extra_repr(self): class MaxUnPool3D(Layer): r""" - This API implements max unpooling 3d opereation. + This API implements max unpooling 3d operation. `max_unpool3d` accepts the output of `max_pool3d` as input, including the indices of the maximum value and calculate the partial inverse. diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 7e35344206c1a..61c44b9ea19b5 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -855,7 +855,7 @@ def summary( views=None, ): r""" - Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary. + Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and user-defined summary. Args: sorted_by( :ref:`SortedKeys ` , optional): how to rank the op table items, default value is SortedKeys.CPUTotal. 
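The `summary` hunk above ranks the operator table by `SortedKeys.CPUTotal` by default. A minimal end-to-end sketch of producing that table, assuming the documented `paddle.profiler` API; the profiled loop is a stand-in workload:

    import paddle
    import paddle.profiler as profiler

    prof = profiler.Profiler(targets=[profiler.ProfilerTarget.CPU])
    prof.start()
    for _ in range(3):
        x = paddle.randn([64, 64])
        y = paddle.matmul(x, x)
        prof.step()  # mark one iteration boundary
    prof.stop()

    # Prints the overview/model/operator/... tables described above.
    prof.summary(sorted_by=profiler.SortedKeys.CPUTotal)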
diff --git a/python/paddle/quantization/imperative/fuse_utils.py b/python/paddle/quantization/imperative/fuse_utils.py index 0f8ad443d43ca..f31a70297893e 100644 --- a/python/paddle/quantization/imperative/fuse_utils.py +++ b/python/paddle/quantization/imperative/fuse_utils.py @@ -92,7 +92,7 @@ def _fuse_layers(model, layers_list): def _fuse_func(layer_list): - '''choose the fuser method and fuse layers''' + '''choose the fuse method and fuse layers''' types = tuple(type(m) for m in layer_list) fusion_method = types_to_fusion_method.get(types, None) new_layers = [None] * len(layer_list) diff --git a/python/paddle/quantization/imperative/ptq_registry.py b/python/paddle/quantization/imperative/ptq_registry.py index 52e4c487e342f..a865bc5d912f8 100644 --- a/python/paddle/quantization/imperative/ptq_registry.py +++ b/python/paddle/quantization/imperative/ptq_registry.py @@ -17,7 +17,7 @@ class LayerInfo: """ - Store the argnames of the inputs and outputs. + Store the arg names of the inputs and outputs. """ def __init__(self, layer, input_names, weight_names, output_names): diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index e6c8cef4f905c..2d2d9375f4a09 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -6758,7 +6758,7 @@ def slice_scatter(x, value, axes, starts, ends, strides, name=None): axes (list|tuple) : the dimensions to insert the value. starts (list|tuple) : the start indices of where to insert. ends (list|tuple) : the stop indices of where to insert. - strids (list|tuple) : the steps for each insert. + strides (list|tuple) : the steps for each insert. name (str, optional): Name for the operation (optional, default is None). Returns: From a845436ef3e753489bf39164c3c49203c4ea1a89 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 23 Feb 2024 15:34:38 +0800 Subject: [PATCH 34/82] fix build of generate_shape_op (#61993) --- paddle/cinn/hlir/dialect/operator/ir/manual_op.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc index 42fdf23664a8d..54299cc2ff7ff 100644 --- a/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc +++ b/paddle/cinn/hlir/dialect/operator/ir/manual_op.cc @@ -267,13 +267,12 @@ void GenerateShapeOp::Build( const std::vector& inputs, const std::vector& output_dim_exprs, const GenerateShapeOp::SymbolBindings& symbol_bindings) { - CHECK(!inputs.empty()) << ". 
output_dim_exprs: " << [&] { - std::stringstream ss; + if (inputs.empty()) { + VLOG(3) << "GenerateShapeOp inputs is empty"; for (const auto& attr : output_dim_exprs) { - ss << attr; + CHECK(attr.isa()); } - return ss.str(); - }(); + } argument.AddInputs(inputs); argument.AddAttribute("output_dim_exprs", builder.array_attr(output_dim_exprs)); From 7aca52c1e5ec2e0b5806bdf0d67969b38c701ddb Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Fri, 23 Feb 2024 16:16:09 +0800 Subject: [PATCH 35/82] fix,test=document_fix (#62018) --- cmake/third_party.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 6c715e8cf7510..17c428660b223 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -46,7 +46,7 @@ if(NOT WITH_SETUP_INSTALL) endif() execute_process( - COMMAND git submodule update --init --recursive --force + COMMAND git submodule update --init --recursive WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} RESULT_VARIABLE result_var) if(NOT result_var EQUAL 0) From 934afd220ead2818c60a77dd4c2742e821548e82 Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Fri, 23 Feb 2024 16:24:40 +0800 Subject: [PATCH 36/82] [PIR][DynamicShape] Add InferSymbolicShape interface for WhileOp (#61939) * add while infer * yield * bug fix --- paddle/fluid/pir/dialect/CMakeLists.txt | 3 +- .../infer_symbolic_shape/cinn_op_infer_sym.cc | 117 ++++++++++-------- .../infer_symbolic_shape/cinn_op_infer_sym.h | 11 +- .../infer_symbolic_shape/infer_sym_utils.cc | 21 ++++ .../infer_symbolic_shape/infer_sym_utils.h | 11 ++ .../paddle_op_infer_sym.cc | 97 ++++++++------- .../same_operands_and_result.h | 4 + .../dialect/operator/ir/control_flow_op.cc | 112 +++++++++++++++++ .../pir/dialect/operator/ir/control_flow_op.h | 9 +- .../pir/transforms/shape_optimization_pass.cc | 51 ++++---- .../pir/transforms/shape_optimization_pass.h | 3 + paddle/pir/CMakeLists.txt | 7 +- .../include/dialect/control_flow/ir/cf_op.h | 7 +- .../pir/src/dialect/control_flow/ir/cf_op.cc | 7 ++ .../shape_dialect/shape_optimization_test.cc | 23 ++-- test/ir/pir/cinn/symbolic/test_if_else_dy.py | 92 ++++++++++++++ test/ir/pir/cinn/symbolic/test_while_dy.py | 10 +- 17 files changed, 443 insertions(+), 142 deletions(-) create mode 100644 test/ir/pir/cinn/symbolic/test_if_else_dy.py diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt index ff3e03d3963dd..535f8cdc7c818 100644 --- a/paddle/fluid/pir/dialect/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -246,7 +246,8 @@ set(op_dialect_srcs ${pir_op_source_file} ${pir_bwd_op_source_file} ${pir_update_op_source_file} - ${api_source_file}) + ${api_source_file} + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/transforms/shape_optimization_pass.cc) if(WITH_MKLDNN) set(op_dialect_srcs ${op_dialect_srcs} ${onednn_op_source_file} diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc index ecb56292e170a..0e8240434e070 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.cc @@ -36,74 +36,32 @@ bool BroadcastOpInferSymbolicShape( return true; } -bool SliceOpInferSymbolicShape(pir::Operation *op, - pir::ShapeConstraintIRAnalysis *shape_analysis) { - // 
TODO(zhangbopd): Not implemented yet, different from the one in paddle - // dialect. And Currently only support start/end/axis with single value. - pir::AttributeMap attributes = op->attributes(); - - auto GetAttrInt64Value = [&](const std::string &name) -> int64_t { - std::vector attr = - attributes[name].dyn_cast().AsVector(); - PADDLE_ENFORCE_GT( - attr.size(), - 0, - phi::errors::PreconditionNotMet( - "Only Support [%s] op len(%s) == 1 , but received %d.", - op->name(), - name, - attr.size())); - return attr[0].dyn_cast().data(); - }; - - const int64_t start = GetAttrInt64Value("starts"); - const int64_t end = GetAttrInt64Value("ends"); - const int64_t axis = GetAttrInt64Value("axes"); - - const pir::Value operand_source = op->operand_source(0); - const auto &operand_shape_or_data = - shape_analysis->GetShapeOrDataForValue(operand_source); - - const auto GetOutDimExprs = [&]() -> symbol::TensorShapeOrDataDimExprs { - std::vector out_sym_shape = operand_shape_or_data.shape(); - if (end == std::numeric_limits::max()) { - out_sym_shape[axis] = out_sym_shape[axis] - start; - } else { - out_sym_shape[axis] = end - start; - } - symbol::TensorShapeOrDataDimExprs shape_dim_expr(out_sym_shape); - if (operand_shape_or_data.data().has_value()) { - std::vector out_data; - for (int64_t i = start; i < end; i++) { - out_data.push_back(operand_shape_or_data.data().value()[i]); - } - shape_dim_expr.SetData(out_data); - } - return shape_dim_expr; - }; - symbol::ShapeOrDataDimExprs shape_data{GetOutDimExprs()}; - - shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); - return true; -} - bool ConcatOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { const auto input_values = op->operands_source(); const auto input_size = input_values.size(); - const int axis = - op->attributes().at("axis").dyn_cast().data(); + int axis = op->attributes().at("axis").dyn_cast().data(); - // TODO(zhangbopd): Need support GetShapeOrDataForValue().data() case. const auto &GetOutDimExprs = [&]() -> std::vector { std::vector out_dims = shape_analysis->GetShapeOrDataForValue(input_values[0]).shape(); + + size_t rank = out_dims.size(); + axis = axis >= 0 ? axis : std::max(int64_t(0), int64_t(axis + rank)); + for (size_t i = 1; i < input_size; ++i) { const auto &operand_shape_or_data = shape_analysis->GetShapeOrDataForValue(input_values[i]); out_dims[axis] = out_dims[axis] + operand_shape_or_data.shape()[axis]; } + + for (size_t i = 1; i < rank; ++i) { + if (i == static_cast(axis)) continue; + paddle::dialect::details::BuildCstrEqForTensorListAlongAxis( + shape_analysis, input_values, i); + } + return out_dims; }; @@ -164,4 +122,55 @@ bool ReshapeOpInferSymbolicShape( return true; } +bool SliceOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis) { + // TODO(zhangbopd): Not implemented yet, different from the one in paddle + // dialect. And Currently only support start/end/axis with single value. 
+ pir::AttributeMap attributes = op->attributes(); + + auto GetAttrInt64Value = [&](const std::string &name) -> int64_t { + std::vector attr = + attributes[name].dyn_cast().AsVector(); + PADDLE_ENFORCE_GT( + attr.size(), + 0, + phi::errors::PreconditionNotMet( + "Only Support [%s] op len(%s) == 1 , but received %d.", + op->name(), + name, + attr.size())); + return attr[0].dyn_cast().data(); + }; + + const int64_t start = GetAttrInt64Value("starts"); + const int64_t end = GetAttrInt64Value("ends"); + const int64_t axis = GetAttrInt64Value("axes"); + + const pir::Value operand_source = op->operand_source(0); + const auto &operand_shape_or_data = + shape_analysis->GetShapeOrDataForValue(operand_source); + + const auto GetOutDimExprs = [&]() -> symbol::TensorShapeOrDataDimExprs { + std::vector out_sym_shape = operand_shape_or_data.shape(); + if (end == std::numeric_limits::max()) { + out_sym_shape[axis] = out_sym_shape[axis] - start; + } else { + out_sym_shape[axis] = end - start; + } + symbol::TensorShapeOrDataDimExprs shape_dim_expr(out_sym_shape); + if (operand_shape_or_data.data().has_value()) { + std::vector out_data; + for (int64_t i = start; i < end; i++) { + out_data.push_back(operand_shape_or_data.data().value()[i]); + } + shape_dim_expr.SetData(out_data); + } + return shape_dim_expr; + }; + symbol::ShapeOrDataDimExprs shape_data{GetOutDimExprs()}; + + shape_analysis->SetShapeOrDataForValue(op->result(0), shape_data); + return true; +} + } // namespace cinn::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h index 896dd44d0b12b..b98f8e02d66e9 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/cinn_op_infer_sym.h @@ -16,23 +16,32 @@ #include "paddle/pir/include/dialect/shape/utils/shape_analysis.h" namespace cinn::dialect { -// using paddle::dialect::ScaleOpInferSymbolicShape; bool BroadcastOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + bool SliceOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + bool ConcatOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + bool ReduceMaxOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + bool ReduceMinOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + bool ReduceProdOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + bool ReduceSumOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); + bool ReshapeOpInferSymbolicShape( pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); +bool SliceOpInferSymbolicShape(pir::Operation *op, + pir::ShapeConstraintIRAnalysis *shape_analysis); + } // namespace cinn::dialect diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc index 6337d125b5610..4e5f5df08732a 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.cc @@ -78,4 +78,25 @@ bool ReduceInferDim(pir::Operation *op, return true; } +void 
BuildCstrEqForTensorListAlongAxis( + pir::ShapeConstraintIRAnalysis *shape_analysis, + const symbol::TensorListShapeOrDataDimExprs &shape_data_list, + int axis) { + for (size_t i = 1; i < shape_data_list.size(); ++i) { + shape_analysis->CreateDimExprBuilder().CstrEq( + shape_data_list[0].shape()[axis], shape_data_list[i].shape()[axis]); + } +} + +void BuildCstrEqForTensorListAlongAxis( + pir::ShapeConstraintIRAnalysis *shape_analysis, + const std::vector &values, + int axis) { + for (size_t i = 1; i < values.size(); ++i) { + shape_analysis->CreateDimExprBuilder().CstrEq( + shape_analysis->GetShapeOrDataForValue(values[0]).shape()[axis], + shape_analysis->GetShapeOrDataForValue(values[i]).shape()[axis]); + } +} + } // namespace paddle::dialect::details diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h index 513636344ea2b..8a14e40e6337a 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h @@ -65,4 +65,15 @@ bool ReduceInferDim(pir::Operation *op, const std::vector &axis, bool keep_dim, bool reduce_all); + +void BuildCstrEqForTensorListAlongAxis( + pir::ShapeConstraintIRAnalysis *shape_analysis, + const symbol::TensorListShapeOrDataDimExprs &shape_data_list, + int axis); + +void BuildCstrEqForTensorListAlongAxis( + pir::ShapeConstraintIRAnalysis *shape_analysis, + const std::vector &values, + int axis); + } // namespace paddle::dialect::details diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc index 092ecc89cb13f..0b1dff55f4c41 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc +++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc @@ -80,16 +80,6 @@ bool ShapeSrOpInferSymbolicShape( return ShapeOpInferSymbolicShape(op, shape_analysis); } -void BuildCstrEqForTensorListAlongAxis( - pir::ShapeConstraintIRAnalysis *shape_analysis, - const symbol::TensorListShapeOrDataDimExprs &shape_data_list, - int axis) { - for (size_t i = 1; i < shape_data_list.size(); ++i) { - shape_analysis->CreateDimExprBuilder().CstrEq( - shape_data_list[0].shape()[axis], shape_data_list[i].shape()[axis]); - } -} - bool StackOpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) { pir::Value operand_source = op->operand_source(0); @@ -119,7 +109,8 @@ bool StackOpInferSymbolicShape(pir::Operation *op, } else { for (int i = 0; i < rank; ++i) { if (i == axis) continue; - BuildCstrEqForTensorListAlongAxis(shape_analysis, shape_data_list, i); + details::BuildCstrEqForTensorListAlongAxis( + shape_analysis, shape_data_list, i); } shape_dim_exprs.insert(shape_dim_exprs.begin() + axis, static_cast(shape_data_list.size())); @@ -194,31 +185,42 @@ bool ReshapeOpInferSymbolicShape( const symbol::ShapeOrDataDimExprs &operand_shape_or_data = shape_analysis->GetShapeOrDataForValue(operand_source_shape); - const std::vector out_dims = [&] { - std::vector out_dims; - out_dims = operand_shape_or_data.data().value(); + const auto &GetProduct = [&](const auto &dim_exprs, const auto &Filter) { + symbol::DimExpr product{1}; + for (const auto &dim_expr : dim_exprs) { + if (Filter(dim_expr)) { + product = product * dim_expr; + } 
+ } + return product; + }; - symbol::DimExpr product = symbol::DimExpr(1); - symbol::DimExpr numel = symbol::DimExpr(1); + const auto &IsNotMinusOne = [&](const symbol::DimExpr &dim_expr) { + if (dim_expr.isa()) { + return dim_expr.dyn_cast() != static_cast(-1); + } + return true; + }; + const std::vector out_dims = [&] { const auto &original_shape = shape_analysis->GetShapeOrDataForValue(op->operand_source(0)).shape(); - for (auto &dim_expr : original_shape) { - numel = numel * dim_expr; - } - for (size_t i = 0; i < out_dims.size(); i++) { - if (out_dims[i].isa()) { - if (out_dims[i].dyn_cast() != static_cast(-1)) { - product = product * out_dims[i]; - } else if (i == out_dims.size() - 1) { - out_dims[i] = numel / product; - } else { - // doing nothing - } - } else { - product = product * out_dims[i]; - } + const auto &numel = + GetProduct(original_shape, [](const auto &) { return true; }); + + const auto &product_exclude_minus_one = + GetProduct(operand_shape_or_data.data().value(), IsNotMinusOne); + + const auto &input_dims = operand_shape_or_data.data().value(); + + std::vector out_dims; + out_dims.reserve(input_dims.size()); + for (const auto &dim_expr : input_dims) { + const auto &out_dim_expr = IsNotMinusOne(dim_expr) + ? dim_expr + : (numel / product_exclude_minus_one); + out_dims.emplace_back(out_dim_expr); } return out_dims; @@ -352,15 +354,20 @@ bool SliceOpInferSymbolicShape(pir::Operation *op, int64_t axis = axes[i]; auto end = IsMaxInt(dim_expr_ends[i]) ? out_shape[axis] : dim_expr_ends[i]; - if ((starts[i] >= 0 && ends[i] >= 0) || - (starts[i] <= 0 && ends[i] <= 0)) { // both negtive or positive. + + bool both_negative_or_positive = + (starts[i] >= 0 && ends[i] >= 0) || (starts[i] <= 0 && ends[i] <= 0); + bool start_negative_end_positive = starts[i] <= 0 && ends[i] >= 0; + bool start_positive_end_negative = starts[i] >= 0 && ends[i] <= 0; + + if (both_negative_or_positive) { out_shape[axis] = end - dim_expr_starts[i]; - } else if (starts[i] <= 0 && - ends[i] >= 0) { // negtive start, positive end + } else if (start_negative_end_positive) { out_shape[axis] = end - dim_expr_starts[i] - out_shape[axis]; - } else if (starts[i] >= 0 && - ends[i] <= 0) { // positive start, negtive end + } else if (start_positive_end_negative) { out_shape[axis] = out_shape[axis] - dim_expr_starts[i] + end; + } else { + LOG(FATAL) << "Dead code"; } } @@ -429,14 +436,14 @@ bool ConcatOpInferSymbolicShape( const std::vector &out_dims = [&] { std::vector out_dims = shape_data_list[0].shape(); - for (size_t i = 1; i < shape_data_list.size(); ++i) { - for (size_t j = 0; j < rank; ++j) { - if (j != static_cast(axis)) { - // This func have bug - BuildCstrEqForTensorListAlongAxis(shape_analysis, shape_data_list, i); - continue; - } - out_dims[axis] = out_dims[axis] + shape_data_list[i].shape()[axis]; + for (size_t i = 0; i < rank; ++i) { + if (i != static_cast(axis)) { + details::BuildCstrEqForTensorListAlongAxis( + shape_analysis, shape_data_list, i); + continue; + } + for (size_t j = 1; j < shape_data_list.size(); ++j) { + out_dims[axis] = out_dims[axis] + shape_data_list[j].shape()[axis]; } } return out_dims; diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h index 706bc500048b5..b72111b1173d5 100644 --- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h +++ 
b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/same_operands_and_result.h @@ -95,3 +95,7 @@ bool Tril_OpInferSymbolicShape(pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis); } // namespace paddle::dialect + +namespace cinn::dialect { +using paddle::dialect::ScaleOpInferSymbolicShape; +} diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc index f1fcb7c7b75bb..7f3929d0b9967 100644 --- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc @@ -24,6 +24,7 @@ paddle::dialect::IfOp, paddle::dialect::WhileOp, paddle::dialect::HasElementsOp, #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/fluid/pir/transforms/shape_optimization_pass.h" #include "paddle/phi/core/enforce.h" #include "paddle/pir/include/core/builder.h" #include "paddle/pir/include/core/builtin_attribute.h" @@ -167,6 +168,7 @@ void IfOp::Print(pir::IrPrinter &printer) { printer.PrintOpResult(op); os << " = pd_op.if"; printer.PrintOpOperands(op); + printer.PrintAttributeMap(op); os << " -> "; printer.PrintOpReturnType(op); os << "{\n"; @@ -306,6 +308,75 @@ std::vector> IfOp::Vjp( return res; } +bool IfOp::InferSymbolicShape(pir::ShapeConstraintIRAnalysis *shape_analysis) { + // infer true block + pir::InferSymExprForBlock(true_block(), shape_analysis); + + // infer false block + pir::InferSymExprForBlock(false_block(), shape_analysis); + + auto GetSymExprForBlockResult = + [shape_analysis](const pir::Operation &op, + uint32_t idx) -> const std::vector & { + const auto &shape_or_data = + shape_analysis->GetShapeOrDataForValue(op.operand_source(idx)); + if (shape_or_data.data().has_value()) { + return shape_or_data.data().value(); + } else { + return shape_or_data.shape(); + } + }; + + // TODO(lanxianghit): for llama, `if` op's result num always > 0, but + // result_num == 0 should be supported in future + if (num_results() > 0) { + for (uint32_t rst_idx = 0; rst_idx < num_results(); rst_idx++) { + const auto &true_dims = + GetSymExprForBlockResult(true_block().back(), rst_idx); + const auto &false_dims = + GetSymExprForBlockResult(false_block().back(), rst_idx); + + // merge shape for true and false block, new symbol will be assigned when + // the dims is not equal in true and false block, even if the dims are all + // constant, since we don't know which will be returned in compile time + // examples: + // true_block false_block return + // [1, 128] [1, 256] [1, S0] + // [1, S0] [1, S1] [1, S2] + // [1, S0] [S1, S2] [S1, S3] + // [1, S0] [1, S0] [1, S0] + + std::vector out_dims = true_dims; + if (false_dims.size() != 0) { + // now only support results of true and false block have same rank. 
+ PADDLE_ENFORCE_EQ(true_dims.size(), + false_dims.size(), + phi::errors::PreconditionNotMet( + "The true and false block should have same rank, " + "but got true_rank(%d) and false_rank(%d)", + true_dims.size(), + false_dims.size())); + for (size_t i = 0; i < true_dims.size(); i++) { + if (true_dims[i] != false_dims[i]) { + out_dims[i] = symbol::DimExpr{shape_analysis->GetNextSymName()}; + } + } + } + + shape_analysis->SetShapeOrDataForValue( + result(rst_idx), + symbol::ShapeOrDataDimExprs{ + symbol::TensorShapeOrDataDimExprs(out_dims)}); + } + + return true; + } else { + PADDLE_THROW( + phi::errors::Unimplemented("IfOp::InferSymbolicShape: now only " + "support num_results() == 1.")); + } +} + void PyLayerOp::Build(pir::Builder &builder, // NOLINT pir::OperationArgument &argument, // NOLINT pir::Value combined_inputs, @@ -649,6 +720,47 @@ std::vector> WhileOp::Vjp( } return res; } + +bool WhileOp::InferSymbolicShape( + pir::ShapeConstraintIRAnalysis *shape_analysis) { + VLOG(3) << "############ WhileOp::InferSymbolicShape start..."; + pir::Program *body_program = body().parent_program(); + VLOG(3) << "##### WhileOp::InferSymbolicShape: sub_program id = " + << body_program->module_op().operation()->id(); + + for (auto &value : block_args()) { + std::vector sym_dims; + const std::vector &dims = + common::vectorize(value.type().dyn_cast().dims()); + + for (auto dim : dims) { + symbol::DimExpr dim_expr; + if (dim == pir::ShapedTypeInterface::kDynamic) { + symbol::DimExpr symbolic_dim_expr(shape_analysis->GetNextSymName()); + dim_expr = symbolic_dim_expr; + } else { + symbol::DimExpr numeric_dim_expr(dim); + dim_expr = numeric_dim_expr; + } + sym_dims.push_back(dim_expr); + } + symbol::ShapeOrDataDimExprs shape_data{ + symbol::TensorShapeOrDataDimExprs(sym_dims)}; + shape_analysis->SetShapeOrDataForValue(value, shape_data); + } + + pir::InferSymExprForBlock(body(), shape_analysis); + + const auto &last_op = body().back(); + for (size_t i = 1; i < last_op.operands_source().size(); ++i) { + shape_analysis->SetShapeOrDataForValue( + result(i - 1), + shape_analysis->GetShapeOrDataForValue(last_op.operand_source(i))); + } + + return true; +} + std::vector> TuplePushOpVjpInterfaceModel::Vjp( pir::Operation *op, const std::vector> &inputs, diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h index d59fd41f77b40..f8a6bbb9f3b0f 100644 --- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h +++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h @@ -15,6 +15,7 @@ #pragma once #include +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h" #include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/interface/vjp.h" #include "paddle/pir/include/core/block.h" @@ -23,7 +24,7 @@ namespace paddle { namespace dialect { -class IfOp : public pir::Op { +class IfOp : public pir::Op { public: using Op::Op; static const char *name() { return "pd_op.if"; } @@ -55,6 +56,8 @@ class IfOp : public pir::Op { const std::vector> &outputs, const std::vector> &out_grads, const std::vector> &stop_gradients); + + bool InferSymbolicShape(pir::ShapeConstraintIRAnalysis *shape_analysis); }; class PyLayerOp : public pir::Op { @@ -94,7 +97,8 @@ class PyLayerOp : public pir::Op { /// cond, outputs = body(outputs) /// } /// -class WhileOp : public pir::Op { +class WhileOp + : public pir::Op { public: using Op::Op; static const char *name() { return 
"pd_op.while"; } @@ -118,6 +122,7 @@ class WhileOp : public pir::Op { const std::vector> &outputs, const std::vector> &out_grads, const std::vector> &stop_gradients); + bool InferSymbolicShape(pir::ShapeConstraintIRAnalysis *shape_analysis); }; struct TuplePushOpVjpInterfaceModel : public VjpInterface::Concept { diff --git a/paddle/fluid/pir/transforms/shape_optimization_pass.cc b/paddle/fluid/pir/transforms/shape_optimization_pass.cc index f2cadd7f1b74d..69377af0d30b5 100644 --- a/paddle/fluid/pir/transforms/shape_optimization_pass.cc +++ b/paddle/fluid/pir/transforms/shape_optimization_pass.cc @@ -85,29 +85,7 @@ void InferSymExprForAllValues(ModuleOp module_op) { shape_analysis.Init(); for (uint32_t i = 0; i < module_op->num_regions(); i++) { for (auto& block : module_op->region(i)) { - for (auto& op : block) { - auto infer_symbolic_shape_interface = - op.dyn_cast(); - if (infer_symbolic_shape_interface) { - VLOG(vlog_level) << op.name() << " has InferSymbolicShapeInterface."; - PADDLE_ENFORCE(infer_symbolic_shape_interface.InferSymbolicShape( - &shape_analysis), - "InferSymbolicShape for %s failed.", - op.name()); - if (op.num_results() > 0) { - // TODO(lanxianghit): deal with the ops which have more than 1 - // ACTUAL results - pir::shape::SetShapeAttrForOp( - &op, shape_analysis.GetShapeOrDataForValue(op.result(0))); - } - } else { - VLOG(vlog_level) << op.name() + - " DOES NOT have InferSymbolicShapeInterface!"; - PADDLE_THROW(phi::errors::Unimplemented( - op.name() + " DOES NOT have InferSymbolicShapeInterface!")); - } - DebugPrintOpInfo(&op, &shape_analysis); - } + InferSymExprForBlock(block, &shape_analysis); } } } @@ -143,6 +121,33 @@ class ShapeOptimizationPass : public pir::Pass { } // namespace +void InferSymExprForBlock(const Block& block, + ShapeConstraintIRAnalysis* shape_analysis) { + for (auto& op : block) { + auto infer_symbolic_shape_interface = + op.dyn_cast(); + if (infer_symbolic_shape_interface) { + VLOG(vlog_level) << op.name() << " has InferSymbolicShapeInterface."; + PADDLE_ENFORCE( + infer_symbolic_shape_interface.InferSymbolicShape(shape_analysis), + "InferSymbolicShape for %s failed.", + op.name()); + if (op.num_results() > 0) { + // TODO(lanxianghit): deal with the ops which have more than 1 + // ACTUAL results + pir::shape::SetShapeAttrForOp( + &op, shape_analysis->GetShapeOrDataForValue(op.result(0))); + } + } else { + VLOG(vlog_level) << op.name() + + " DOES NOT have InferSymbolicShapeInterface!"; + PADDLE_THROW(phi::errors::Unimplemented( + op.name() + " DOES NOT have InferSymbolicShapeInterface!")); + } + DebugPrintOpInfo(&op, shape_analysis); + } +} + std::unique_ptr CreateShapeOptimizationPass() { return std::make_unique(); } diff --git a/paddle/fluid/pir/transforms/shape_optimization_pass.h b/paddle/fluid/pir/transforms/shape_optimization_pass.h index 64658504bbe97..a23de56f35d6e 100644 --- a/paddle/fluid/pir/transforms/shape_optimization_pass.h +++ b/paddle/fluid/pir/transforms/shape_optimization_pass.h @@ -24,4 +24,7 @@ class Pass; IR_API std::unique_ptr CreateShapeOptimizationPass(); +void InferSymExprForBlock(const Block &block, + ShapeConstraintIRAnalysis *shape_analysis); + } // namespace pir diff --git a/paddle/pir/CMakeLists.txt b/paddle/pir/CMakeLists.txt index c50f7cb3d8eb1..0f9adcb74fc2e 100644 --- a/paddle/pir/CMakeLists.txt +++ b/paddle/pir/CMakeLists.txt @@ -1,7 +1,12 @@ add_definitions(-DIR_LIBRARY) set_property(GLOBAL PROPERTY IR_TARGETS "") -file(GLOB_RECURSE PIR_CPP_SOURCES "*.cc") +file( + GLOB_RECURSE + PIR_CPP_SOURCES + "*.cc" + 
${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.cc +) if(WIN32) if(WITH_SHARED_IR) diff --git a/paddle/pir/include/dialect/control_flow/ir/cf_op.h b/paddle/pir/include/dialect/control_flow/ir/cf_op.h index 0883b8e1727a8..ed3e51df121c4 100644 --- a/paddle/pir/include/dialect/control_flow/ir/cf_op.h +++ b/paddle/pir/include/dialect/control_flow/ir/cf_op.h @@ -14,13 +14,16 @@ #pragma once #include +#include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h" #include "paddle/pir/include/core/builder.h" #include "paddle/pir/include/core/op_base.h" #include "paddle/pir/include/core/op_trait.h" #include "paddle/pir/include/dialect/control_flow/ir/cf_interface.h" namespace pir { -class IR_API YieldOp : public Op { +class IR_API YieldOp : public Op { public: using Op::Op; static const char *name() { return "cf.yield"; } @@ -31,6 +34,8 @@ class IR_API YieldOp : public Op { OperationArgument &argument, // NOLINT const std::vector &Value); void VerifySig() {} + + bool InferSymbolicShape(pir::ShapeConstraintIRAnalysis *shape_analysis); }; /// diff --git a/paddle/pir/src/dialect/control_flow/ir/cf_op.cc b/paddle/pir/src/dialect/control_flow/ir/cf_op.cc index bc98389c50a4e..c203fbafb5a02 100644 --- a/paddle/pir/src/dialect/control_flow/ir/cf_op.cc +++ b/paddle/pir/src/dialect/control_flow/ir/cf_op.cc @@ -25,6 +25,13 @@ void YieldOp::Build(Builder &builder, argument.AddInputs(inputs); } +bool YieldOp::InferSymbolicShape( + pir::ShapeConstraintIRAnalysis *shape_analysis) { + VLOG(3) << "############ YieldOp::InferSymbolicShape start..."; + // YieldOp has no output, just return true + return true; +} + void TuplePushOp::Build(Builder &builder, // NOLINT OperationArgument &argument, // NOLINT Value inlet, diff --git a/test/cpp/pir/shape_dialect/shape_optimization_test.cc b/test/cpp/pir/shape_dialect/shape_optimization_test.cc index 7e1e0ff1509dd..b48f84db4d1b8 100644 --- a/test/cpp/pir/shape_dialect/shape_optimization_test.cc +++ b/test/cpp/pir/shape_dialect/shape_optimization_test.cc @@ -116,17 +116,20 @@ TEST(shape_optimization, shape_optimization_pass) { symbol::ShapeOrDataDimExprs subtract_res = shape_analysis.GetShapeOrDataForValue(subtract_op.result(0)); - // TODO(zhangbopd): after shape infer is completed, we can check the results - // EXPECT_EQ(cast_res.shape()[0], 1); - // EXPECT_EQ(cast_res.shape()[1], 64); - // EXPECT_EQ(symbol::ToString(cast_res.shape()[2]) == "Mul(S0, 32)"); - // EXPECT_EQ(cast_res.shape()[3] == 2); - // EXPECT_EQ(symbol::ToString(relu_res.shape()[2]), "Add(S2, -4)"); - // EXPECT_EQ(symbol::ToString(relu_res.shape()[2]), "Add(S3, -4)"); - // EXPECT_EQ(symbol::ToString(relu_res.shape()[2]), "Add(S4, -4)"); - // EXPECT_EQ(symbol::ToString(relu_res.shape()[2]), "Add(S5, -4)"); + EXPECT_EQ(cast_res.shape()[0], 1); + EXPECT_EQ(cast_res.shape()[1], 64); + EXPECT_EQ(symbol::ToString(cast_res.shape()[2]), + "Mul(Mul(Mul(Mul(1, S1), 128), 32), 1 / (128))"); + EXPECT_EQ(cast_res.shape()[3], 2); + + EXPECT_EQ(symbol::ToString(relu_res.shape()[0]), "Add(Add(S2, -2), -2)"); + EXPECT_EQ(symbol::ToString(relu_res.shape()[1]), "Add(Add(S3, -2), -2)"); + EXPECT_EQ(symbol::ToString(relu_res.shape()[2]), "Add(Add(S4, -2), -2)"); + EXPECT_EQ(symbol::ToString(relu_res.shape()[3]), "Add(Add(S5, -2), -2)"); EXPECT_EQ(subtract_res.shape()[0], 1); EXPECT_EQ(subtract_res.shape()[1], 64); - EXPECT_EQ(symbol::ToString(subtract_res.shape()[2]), "Broadcast(S0, -1)"); + 
EXPECT_EQ(symbol::ToString(subtract_res.shape()[2]), + "Broadcast(S0, Mul(Mul(Mul(Mul(1, S1), 128), 32), 1 / (128)))"); + EXPECT_EQ(subtract_res.shape()[3], 2); } diff --git a/test/ir/pir/cinn/symbolic/test_if_else_dy.py b/test/ir/pir/cinn/symbolic/test_if_else_dy.py new file mode 100644 index 0000000000000..c8b2276027898 --- /dev/null +++ b/test/ir/pir/cinn/symbolic/test_if_else_dy.py @@ -0,0 +1,92 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import unittest +from os.path import dirname + +import numpy as np + +import paddle +from paddle import nn +from paddle.static import InputSpec + +sys.path.append(dirname(dirname(__file__))) + +import utils + + +class IfSubgraph(nn.Layer): + def __init__(self): + super().__init__() + + def exp_sub(self, x): + y = paddle.exp(x) + return y - x + + def forward(self, x, y): + if x.shape[-1] > 1: + x = self.exp_sub(x) + else: + y = paddle.abs(y) + x = paddle.nn.functional.relu(x) + y = paddle.logical_not(y) + return x, y + + +class TestIfSubgraph(unittest.TestCase): + def setUp(self): + paddle.seed(2024) + self.prepare_data() + + def prepare_data(self): + self.shape = [1, 2048] + self.x = paddle.randn(self.shape, dtype="float32") + self.x.stop_gradient = False + + self.y_shape = [2, 256] + self.y = paddle.randn(self.y_shape, dtype="float32") + self.y.stop_gradient = False + + def check_jit_kernel_info(self, static_fn): + utils.check_jit_kernel_number(static_fn, 1) + utils.check_jit_kernel_structure(static_fn, {utils.JIT_KERNEL_NAME: 1}) + + def eval(self, use_cinn): + net = IfSubgraph() + input_spec = [ + InputSpec(shape=[None, None], dtype="float32"), + InputSpec(shape=[None, None], dtype="float32"), + ] + net = utils.apply_to_static(net, use_cinn, input_spec) + net.eval() + out = net(self.x, self.y) + if use_cinn: + self.check_jit_kernel_info(net.forward) + return out + + def test_eval(self): + dy_out_x, dy_out_y = self.eval(use_cinn=False) + if utils.unittest_use_cinn(): + cinn_out_x, cinn_out_y = self.eval(use_cinn=True) + np.testing.assert_allclose( + cinn_out_x.numpy(), dy_out_x.numpy(), atol=1e-6, rtol=1e-6 + ) + np.testing.assert_allclose( + cinn_out_y.numpy(), dy_out_y.numpy(), atol=1e-6, rtol=1e-6 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/ir/pir/cinn/symbolic/test_while_dy.py b/test/ir/pir/cinn/symbolic/test_while_dy.py index a8ba57ed39494..627d03ab838c5 100644 --- a/test/ir/pir/cinn/symbolic/test_while_dy.py +++ b/test/ir/pir/cinn/symbolic/test_while_dy.py @@ -32,11 +32,13 @@ def __init__(self): super().__init__() def forward(self, x): - loop_count = 0 - while loop_count < 1: - y = paddle.exp(x) - x = y - x + loop_count = paddle.full([1], 0) + while x.sum() > paddle.full([1], 0) and loop_count < paddle.full( + [1], 1 + ): + x = paddle.exp(x) - x loop_count += 1 + x = paddle.exp(x) return x From c81ab0593908d6ab385a5bc3753e8dc004ab028a Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 17:27:12 +0800 Subject: [PATCH 37/82] 
Update group_sharded_optimizer_stage2.py (#62002) --- .../meta_parallel/sharding/group_sharded_optimizer_stage2.py | 4 ++-- .../fleet/meta_parallel/sharding/group_sharded_stage3.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py index 1bb2a712071b5..2e1086c654cd0 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py @@ -75,7 +75,7 @@ def __init__( group=None, offload=False, device="gpu", - pertrain_sync_models=True, + pretrain_sync_models=True, dp_group=None, **kw ): @@ -178,7 +178,7 @@ def __init__( ), "Not support! when using offload with sharding stage2, please use pure sharding stage2, exclude data parallel." # Synchronous all ranks models - if pertrain_sync_models: + if pretrain_sync_models: self._sync_params_and_buffers() self.param_storages = {} # {dtype: {rank: InternalStorage}} diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 67559a3c7e6ad..628aa9da082f8 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -106,7 +106,7 @@ def __init__( sync_buffers=False, device="gpu", segment_size=2**20, - pertrain_sync_models=True, + pretrain_sync_models=True, offload=False, sync_comm=False, dp_group=None, @@ -213,7 +213,7 @@ def __init__( item["grad_clip"] = self._optim._grad_clip # Synchronous all ranks models - if pertrain_sync_models: + if pretrain_sync_models: self._sync_params_and_buffers() self._segment_rank_params(self._layer) From 47b3ff375f9918f410bdd4a236cb1e1a28fb76c1 Mon Sep 17 00:00:00 2001 From: co63oc Date: Fri, 23 Feb 2024 17:27:33 +0800 Subject: [PATCH 38/82] Update common.py (#62001) --- .../distributed/auto_parallel/static/operators/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/static/operators/common.py b/python/paddle/distributed/auto_parallel/static/operators/common.py index e66a337e90ec9..9f95b049cce3c 100644 --- a/python/paddle/distributed/auto_parallel/static/operators/common.py +++ b/python/paddle/distributed/auto_parallel/static/operators/common.py @@ -75,8 +75,8 @@ class SyncMode: the synchronization mode for communication or auxiliary operator """ - AmpFlagSync = "auto_parallel/amp_flag_synchorization" - GlobalNormSync = "auto_parallel/global_norm_synchorization" + AmpFlagSync = "auto_parallel/amp_flag_synchronization" + GlobalNormSync = "auto_parallel/global_norm_synchronization" def is_elementwise_op(op_type): From 646edcca85a96277fd464715a6b9f294c657f827 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Fri, 23 Feb 2024 17:36:32 +0800 Subject: [PATCH 39/82] support sharding stage 1, 3 (#61926) --- python/paddle/distributed/__init__.py | 4 + .../paddle/distributed/auto_parallel/api.py | 164 ++++++++++++++++++ test/auto_parallel/CMakeLists.txt | 4 + .../hybrid_strategy/CMakeLists.txt | 8 + .../semi_auto_parallel_sharding_stage_1.py | 84 +++++++++ .../semi_auto_parallel_sharding_stage_3.py | 84 +++++++++ ..._auto_parallel_hybrid_sharding_strategy.py | 56 ++++++ 
.../hybrid_strategy/testslist.csv             |   1 +
 .../semi_auto_parallel_sharding_stage_1.py    |  75 ++++++++
 .../semi_auto_parallel_sharding_stage_3.py    |  75 ++++++++
 ...st_semi_auto_parallel_sharding_strategy.py |  56 ++++++
 11 files changed, 611 insertions(+)
 create mode 100644 test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_1.py
 create mode 100644 test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_3.py
 create mode 100644 test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_sharding_strategy.py
 create mode 100644 test/auto_parallel/semi_auto_parallel_sharding_stage_1.py
 create mode 100644 test/auto_parallel/semi_auto_parallel_sharding_stage_3.py
 create mode 100644 test/auto_parallel/test_semi_auto_parallel_sharding_strategy.py

diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index b882b3dad144b..feae03521c84b 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -86,6 +86,8 @@
     shard_layer,
     shard_optimizer,
     shard_scaler,
+    ShardingStage1,
+    ShardingStage3,
     to_static,
     Strategy,
     DistModel,
@@ -171,6 +173,8 @@
     "load_state_dict",
     "shard_optimizer",
     "shard_scaler",
+    "ShardingStage1",
+    "ShardingStage3",
     "to_static",
     "Strategy",
     "DistModel",
diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py
index 7eadcd63f2054..73a69c91b74a4 100644
--- a/python/paddle/distributed/auto_parallel/api.py
+++ b/python/paddle/distributed/auto_parallel/api.py
@@ -551,6 +551,35 @@ def replicate_layer_params_and_buffers(
     )
 
 
+def get_placement_with_sharding(param):
+    shard_axis = -1
+    for placement in param.placements:
+        if isinstance(placement, dist.Shard):
+            # The parameter can't be sharded twice on different meshes for
+            # now; assert here just in case.
+            assert (
+                shard_axis == -1
+            ), "The parameter can't be sharded twice, even on different meshes, for now."
+            shard_axis = placement.get_dim()
+
+    placement_with_sharding = None
+    for dim in range(param.ndim):
+        if dim != shard_axis:
+            placement_with_sharding = dist.Shard(dim)
+
+    new_placements = param.placements
+    for mesh_axis, placement in enumerate(param.placements):
+        # Keep the placement as Replicate if it is beyond the tensor's dims.
+        if (
+            isinstance(placement, dist.Replicate)
+            and placement_with_sharding is not None
+        ):
+            new_placements[mesh_axis] = placement_with_sharding
+            break
+
+    return new_placements
+
+
 class _ShardOptimizer:
     def __init__(self, optimizer, shard_fn=None):
         assert (
@@ -576,6 +605,13 @@ def __init__(self, optimizer, shard_fn=None):
         self._inner_opt = optimizer
         self._shard_fn = shard_fn
 
+        # Invoke the shard_fn immediately to shard parameters if it is a ShardingStage3 instance.
+        if self._shard_fn is not None and isinstance(
+            self._shard_fn, ShardingStage3
+        ):
+            for param in self._inner_opt._parameter_list:
+                self._shard_fn._shard_parameter(param)
+
     def _shard_accumulator(self, param):
         # create the accumulators
         self._inner_opt._create_accumulators(self.target_block, [param])
@@ -733,6 +769,134 @@ def __getattr__(self, item):
         return getattr(self._inner_opt, item)
 
 
+class ShardingStage1:
+    """
+    A built-in shard_fn for the shard_optimizer interface; users can pass it to shard_optimizer to implement stage 1 sharding optimization.
+
+    Examples:
+        .. code-block:: python
+
+            >>> import paddle
+            >>> import paddle.distributed as dist
+
+            >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+
+            >>> class MLP(paddle.nn.Layer):
+            ...     def __init__(self):
+            ...         super().__init__()
+            ...         
self.fc1 = paddle.nn.Linear(8, 8) + ... self.fc2 = paddle.nn.Linear(8, 8) + ... + ... def forward(self, input): + ... return self.fc2(self.fc1(input)) + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> layer = MLP() + >>> batch = paddle.rand(shape=[8, 8]) + >>> opt = paddle.optimizer.AdamW(parameters=layer.parameters()) + >>> opt = dist.shard_optimizer(opt, dist.ShardingStage1()) + >>> for _ in range(5): + >>> loss = layer(batch) + >>> loss.backward() + >>> opt.step() + >>> opt.clear_grad() + >>> # This case need to be executed in multi-card environment + >>> # python -m paddle.distributed.launch --gpus=0,1 {test_case}.py + """ + + def __call__(self, key, param, accumulator): + if param.is_dist(): + # Only deal with momentum in optimizer, beta should be replicated cross param's mesh + if 'beta' not in key: + placements = get_placement_with_sharding(param) + else: + placements = [ + dist.Replicate() + for _ in range(len(param.process_mesh.shape)) + ] + return shard_tensor( + accumulator, + mesh=param.process_mesh, + placements=placements, + ) + return accumulator + + +class ShardingStage3: + """ + A builtin shard_fn for shard_optimizer interface, users can pass it to shard_optimizer to implement sharding optimization with stage 3. + + Args: + mesh(paddle.distributed.ProcessMesh): The `ProcessMesh` object describes the Cartesian topology of the used processes. + + Examples: + .. code-block:: python + + >>> import paddle + >>> import paddle.distributed as dist + + >>> mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + + >>> class MLP(paddle.nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.fc1 = paddle.nn.Linear(8, 8) + ... self.fc2 = paddle.nn.Linear(8, 8) + ... + ... def forward(self, input): + ... return self.fc2(self.fc1(input)) + + >>> # doctest: +REQUIRES(env:DISTRIBUTED) + >>> layer = MLP() + >>> batch = paddle.rand(shape=[8, 8]) + >>> opt = paddle.optimizer.AdamW(parameters=layer.parameters()) + >>> opt = dist.shard_optimizer(opt, dist.ShardingStage3(mesh)) + >>> for _ in range(5): + >>> loss = layer(batch) + >>> loss.backward() + >>> opt.step() + >>> opt.clear_grad() + >>> # This case need to be executed in multi-card environment + >>> # python -m paddle.distributed.launch --gpus=0,1 {test_case}.py + """ + + def __init__(self, mesh): + self._mesh = mesh + + def _shard_parameter(self, param): + # TODO(liyurui): remove this trick dense to dist convert after adding + # dense_tensor.to_dist method. 
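+        # Until then, adding a replicated zero tensor promotes the dense
+        # parameter into a distributed tensor that the reshard below expects.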
+ if param.is_dense(): + zero_dense = paddle.zeros(param.shape) + placements = [] + for _ in range(len(self._mesh.shape)): + placements.append(dist.Replicate()) + zero_dist = dist.shard_tensor(zero_dense, self._mesh, placements) + res = param + zero_dist + + new_placements = get_placement_with_sharding(param) + shard_param = dist.reshard(param, param.process_mesh, new_placements) + # change the holder of param to new shard_param + param.get_tensor()._share_data_with(shard_param.get_tensor()) + + def __call__(self, key, param, accumulator): + if param.is_dist(): + # Only deal with momentum in optimizer, beta should be replicated cross param's mesh + if 'beta' not in key: + placements = param.placements + else: + placements = [ + dist.Replicate() + for _ in range(len(param.process_mesh.shape)) + ] + return shard_tensor( + accumulator, + mesh=param.process_mesh, + placements=placements, + ) + return accumulator + + def shard_optimizer(optimizer, shard_fn=None): """ diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt index 1bc0398fa500f..a72e7831e1a13 100644 --- a/test/auto_parallel/CMakeLists.txt +++ b/test/auto_parallel/CMakeLists.txt @@ -166,6 +166,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU) test_semi_auto_parallel_single_strategy) set_tests_properties(test_semi_auto_parallel_single_strategy PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 400) + py_test_modules(test_semi_auto_parallel_sharding_strategy MODULES + test_semi_auto_parallel_sharding_strategy) + set_tests_properties(test_semi_auto_parallel_sharding_strategy + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 200) py_test_modules(test_semi_auto_parallel_lazy_init MODULES test_semi_auto_parallel_lazy_init) set_tests_properties(test_semi_auto_parallel_lazy_init diff --git a/test/auto_parallel/hybrid_strategy/CMakeLists.txt b/test/auto_parallel/hybrid_strategy/CMakeLists.txt index 3fbd643528480..2d205031a433e 100644 --- a/test/auto_parallel/hybrid_strategy/CMakeLists.txt +++ b/test/auto_parallel/hybrid_strategy/CMakeLists.txt @@ -50,3 +50,11 @@ if((WITH_GPU) AND (LINUX)) set_tests_properties(test_semi_auto_parallel_llama_model_amp PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=HYBRID") endif() +if((WITH_GPU) AND (LINUX)) + py_test_modules( + test_semi_auto_parallel_hybrid_sharding_strategy MODULES + test_semi_auto_parallel_hybrid_sharding_strategy ENVS + "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_semi_auto_parallel_hybrid_sharding_strategy + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=HYBRID") +endif() diff --git a/test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_1.py b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_1.py new file mode 100644 index 0000000000000..10b53fa0f443c --- /dev/null +++ b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_1.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
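+# Checks sharding stage 1 combined with model parallelism ("mp") on a 2D
+# process mesh: after 5 AdamW steps the sharded result must match a
+# single-card baseline within the test's tolerances.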
+ +import os + +import numpy as np + +import paddle +import paddle.distributed as dist + + +class TestSemiAutoParallelShardingStage1: + def __init__(self): + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) + + def check_tensor_eq(self, a, b, rtol=1e-05, atol=0, verbose=True): + np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, verbose=verbose) + + def shard_layer_fn(self, layer_name, layer, process_mesh): + layer.weight = dist.shard_tensor( + layer.weight, process_mesh, [dist.Shard(1)] + ) + layer.bias = dist.shard_tensor( + layer.bias, process_mesh, [dist.Shard(0)] + ) + + def get_single_card_rst(self): + paddle.seed(self._seed) + linear = paddle.nn.Linear(10, 10) + batch = paddle.rand(shape=[10, 10]) + opt = paddle.optimizer.AdamW(parameters=linear.parameters()) + for _ in range(5): + loss = linear(batch) + loss.backward() + opt.step() + opt.clear_grad() + self.weight = linear.weight.numpy() + self.bias = linear.bias.numpy() + + def test_sharding_stage_1_with_mp(self): + paddle.seed(self._seed) + linear = paddle.nn.Linear(10, 10) + linear = dist.shard_layer(linear, self._mesh, self.shard_layer_fn) + batch = paddle.rand(shape=[10, 10]) + # shard the input by sharding degree + batch = dist.shard_tensor(batch, self._mesh, [dist.Shard(0)]) + # shard optimizer with stage 1 fn + opt = paddle.optimizer.AdamW(parameters=linear.parameters()) + opt = dist.shard_optimizer(opt, dist.ShardingStage1()) + for _ in range(5): + loss = linear(batch) + loss.backward() + opt.step() + opt.clear_grad() + self.check_tensor_eq(self.weight, linear.weight.numpy()) + self.check_tensor_eq(self.bias, linear.bias.numpy()) + + def run_test_case(self): + if self._backend == "cpu": + paddle.set_device("cpu") + elif self._backend == "gpu": + paddle.set_device("gpu:" + str(dist.get_rank())) + else: + raise ValueError("Only support cpu or gpu backend.") + + self.get_single_card_rst() + self.test_sharding_stage_1_with_mp() + + +if __name__ == '__main__': + TestSemiAutoParallelShardingStage1().run_test_case() diff --git a/test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_3.py b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_3.py new file mode 100644 index 0000000000000..143e1963c5041 --- /dev/null +++ b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_sharding_stage_3.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
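+# Same setup as the stage 1 case above, but with dist.ShardingStage3, which
+# also shards the parameters themselves instead of only the optimizer
+# accumulators.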
+
+import os
+
+import numpy as np
+
+import paddle
+import paddle.distributed as dist
+
+
+class TestSemiAutoParallelShardingStage3:
+    def __init__(self):
+        self._backend = os.getenv("backend")
+        self._seed = eval(os.getenv("seed"))
+        self._mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"])
+
+    def check_tensor_eq(self, a, b, rtol=1e-05, atol=0, verbose=True):
+        np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, verbose=verbose)
+
+    def shard_layer_fn(self, layer_name, layer, process_mesh):
+        layer.weight = dist.shard_tensor(
+            layer.weight, process_mesh, [dist.Shard(1)]
+        )
+        layer.bias = dist.shard_tensor(
+            layer.bias, process_mesh, [dist.Shard(0)]
+        )
+
+    def get_single_card_rst(self):
+        paddle.seed(self._seed)
+        linear = paddle.nn.Linear(10, 10)
+        batch = paddle.rand(shape=[10, 10])
+        opt = paddle.optimizer.AdamW(parameters=linear.parameters())
+        for _ in range(5):
+            loss = linear(batch)
+            loss.backward()
+            opt.step()
+            opt.clear_grad()
+        self.weight = linear.weight.numpy()
+        self.bias = linear.bias.numpy()
+
+    def test_sharding_stage_3_with_mp(self):
+        paddle.seed(self._seed)
+        linear = paddle.nn.Linear(10, 10)
+        linear = dist.shard_layer(linear, self._mesh, self.shard_layer_fn)
+        batch = paddle.rand(shape=[10, 10])
+        # shard the input by sharding degree
+        batch = dist.shard_tensor(batch, self._mesh, [dist.Shard(0)])
+        # shard optimizer with stage 3 fn
+        opt = paddle.optimizer.AdamW(parameters=linear.parameters())
+        opt = dist.shard_optimizer(opt, dist.ShardingStage3(self._mesh))
+        for _ in range(5):
+            loss = linear(batch)
+            loss.backward()
+            opt.step()
+            opt.clear_grad()
+        self.check_tensor_eq(self.weight, linear.weight.numpy())
+        self.check_tensor_eq(self.bias, linear.bias.numpy())
+
+    def run_test_case(self):
+        if self._backend == "cpu":
+            paddle.set_device("cpu")
+        elif self._backend == "gpu":
+            paddle.set_device("gpu:" + str(dist.get_rank()))
+        else:
+            raise ValueError("Only support cpu or gpu backend.")
+
+        self.get_single_card_rst()
+        self.test_sharding_stage_3_with_mp()
+
+
+if __name__ == '__main__':
+    TestSemiAutoParallelShardingStage3().run_test_case()
diff --git a/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_sharding_strategy.py b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_sharding_strategy.py
new file mode 100644
index 0000000000000..e358c18ba2a21
--- /dev/null
+++ b/test/auto_parallel/hybrid_strategy/test_semi_auto_parallel_hybrid_sharding_strategy.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
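+# Launcher for the two hybrid sharding cases above; each case runs on 4
+# devices (a 2x2 mesh) through the collective communication test base.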
+ +import unittest + +import collective.test_communication_api_base as test_base + + +class TestSemiAutoParallelInShardingStrategy( + test_base.CommunicationTestDistBase +): + def setUp(self): + super().setUp( + num_of_devices=4, + timeout=120, + ) + self._default_envs = { + "dtype": "float32", + "seed": "2023", + } + self._changeable_envs = {"backend": ["gpu"]} + + def test_sharding_stage_1_strategy(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_sharding_stage_1.py", + user_defined_envs=envs, + ) + + def test_sharding_stage_3_strategy(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_sharding_stage_3.py", + user_defined_envs=envs, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/auto_parallel/hybrid_strategy/testslist.csv b/test/auto_parallel/hybrid_strategy/testslist.csv index 29ae9dd9dce18..7b64e2d93ea6b 100644 --- a/test/auto_parallel/hybrid_strategy/testslist.csv +++ b/test/auto_parallel/hybrid_strategy/testslist.csv @@ -5,3 +5,4 @@ test_save_load_state_dict,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;http test_semi_auto_parallel_c_cross_entropy,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_cross_mesh_reshard,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_semi_auto_parallel_llama_model_amp,LINUX,GPU,180,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_semi_auto_parallel_hybrid_sharding_strategy,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., diff --git a/test/auto_parallel/semi_auto_parallel_sharding_stage_1.py b/test/auto_parallel/semi_auto_parallel_sharding_stage_1.py new file mode 100644 index 0000000000000..ffe1d5725f1d1 --- /dev/null +++ b/test/auto_parallel/semi_auto_parallel_sharding_stage_1.py @@ -0,0 +1,75 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
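+# Pure sharding stage 1 on a 1D mesh of 2 devices: only the optimizer
+# accumulators are sharded, and the result is checked against a single-card
+# baseline.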
+ +import os + +import numpy as np + +import paddle +import paddle.distributed as dist + + +class TestSemiAutoParallelShardingStage1: + def __init__(self): + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + + def check_tensor_eq(self, a, b, rtol=1e-05, atol=0, verbose=True): + np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, verbose=verbose) + + def get_single_card_rst(self): + paddle.seed(self._seed) + linear = paddle.nn.Linear(10, 10) + batch = paddle.rand(shape=[10, 10]) + opt = paddle.optimizer.AdamW(parameters=linear.parameters()) + for _ in range(5): + loss = linear(batch) + loss.backward() + opt.step() + opt.clear_grad() + self.weight = linear.weight.numpy() + self.bias = linear.bias.numpy() + + def test_pure_sharding_stage_1(self): + paddle.seed(self._seed) + linear = paddle.nn.Linear(10, 10) + batch = paddle.rand(shape=[10, 10]) + # shard the input by sharding degree + batch = dist.shard_tensor(batch, self._mesh, [dist.Shard(0)]) + # shard optimizer with stage 1 fn + opt = paddle.optimizer.AdamW(parameters=linear.parameters()) + opt = dist.shard_optimizer(opt, dist.ShardingStage1()) + for _ in range(5): + loss = linear(batch) + loss.backward() + opt.step() + opt.clear_grad() + self.check_tensor_eq(self.weight, linear.weight.numpy()) + self.check_tensor_eq(self.bias, linear.bias.numpy()) + + def run_test_case(self): + if self._backend == "cpu": + paddle.set_device("cpu") + elif self._backend == "gpu": + paddle.set_device("gpu:" + str(dist.get_rank())) + else: + raise ValueError("Only support cpu or gpu backend.") + + self.get_single_card_rst() + self.test_pure_sharding_stage_1() + + +if __name__ == '__main__': + TestSemiAutoParallelShardingStage1().run_test_case() diff --git a/test/auto_parallel/semi_auto_parallel_sharding_stage_3.py b/test/auto_parallel/semi_auto_parallel_sharding_stage_3.py new file mode 100644 index 0000000000000..f391ca9ef54f2 --- /dev/null +++ b/test/auto_parallel/semi_auto_parallel_sharding_stage_3.py @@ -0,0 +1,75 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
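+# Pure sharding stage 3 on a 1D mesh of 2 devices: dist.ShardingStage3(mesh)
+# shards the parameters as well as the optimizer accumulators.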
+
+import os
+
+import numpy as np
+
+import paddle
+import paddle.distributed as dist
+
+
+class TestSemiAutoParallelShardingStage3:
+    def __init__(self):
+        self._backend = os.getenv("backend")
+        self._seed = eval(os.getenv("seed"))
+        self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+
+    def check_tensor_eq(self, a, b, rtol=1e-05, atol=0, verbose=True):
+        np.testing.assert_allclose(a, b, rtol=rtol, atol=atol, verbose=verbose)
+
+    def get_single_card_rst(self):
+        paddle.seed(self._seed)
+        linear = paddle.nn.Linear(10, 10)
+        batch = paddle.rand(shape=[10, 10])
+        opt = paddle.optimizer.AdamW(parameters=linear.parameters())
+        for _ in range(5):
+            loss = linear(batch)
+            loss.backward()
+            opt.step()
+            opt.clear_grad()
+        self.weight = linear.weight.numpy()
+        self.bias = linear.bias.numpy()
+
+    def test_pure_sharding_stage_3(self):
+        paddle.seed(self._seed)
+        linear = paddle.nn.Linear(10, 10)
+        batch = paddle.rand(shape=[10, 10])
+        # shard the input by sharding degree
+        batch = dist.shard_tensor(batch, self._mesh, [dist.Shard(0)])
+        # shard optimizer with stage 3 fn
+        opt = paddle.optimizer.AdamW(parameters=linear.parameters())
+        opt = dist.shard_optimizer(opt, dist.ShardingStage3(self._mesh))
+        for _ in range(5):
+            loss = linear(batch)
+            loss.backward()
+            opt.step()
+            opt.clear_grad()
+        self.check_tensor_eq(self.weight, linear.weight.numpy())
+        self.check_tensor_eq(self.bias, linear.bias.numpy())
+
+    def run_test_case(self):
+        if self._backend == "cpu":
+            paddle.set_device("cpu")
+        elif self._backend == "gpu":
+            paddle.set_device("gpu:" + str(dist.get_rank()))
+        else:
+            raise ValueError("Only support cpu or gpu backend.")
+
+        self.get_single_card_rst()
+        self.test_pure_sharding_stage_3()
+
+
+if __name__ == '__main__':
+    TestSemiAutoParallelShardingStage3().run_test_case()
diff --git a/test/auto_parallel/test_semi_auto_parallel_sharding_strategy.py b/test/auto_parallel/test_semi_auto_parallel_sharding_strategy.py
new file mode 100644
index 0000000000000..489cba334c1b0
--- /dev/null
+++ b/test/auto_parallel/test_semi_auto_parallel_sharding_strategy.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
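+# Launcher for the two pure sharding cases above, each run on 2 devices.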
+ +import unittest + +import collective.test_communication_api_base as test_base + + +class TestSemiAutoParallelInShardingStrategy( + test_base.CommunicationTestDistBase +): + def setUp(self): + super().setUp( + num_of_devices=2, + timeout=120, + ) + self._default_envs = { + "dtype": "float32", + "seed": "2023", + } + self._changeable_envs = {"backend": ["gpu"]} + + def test_sharding_stage_1_strategy(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_sharding_stage_1.py", + user_defined_envs=envs, + ) + + def test_sharding_stage_3_strategy(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_sharding_stage_3.py", + user_defined_envs=envs, + ) + + +if __name__ == "__main__": + unittest.main() From 23fdbd170a0fb6944c6a0f404fb8610b20c563ba Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Fri, 23 Feb 2024 17:39:59 +0800 Subject: [PATCH 40/82] [XPU] AdamW support multi_precision (#61694) * [XPU] AdamW support multi_precision * [XPU] use xdnn api adamw_v2 * update for KL2 --- cmake/external/xpu.cmake | 2 +- paddle/phi/backends/xpu/xpu2_op_list.cc | 8 +- paddle/phi/backends/xpu/xpu3_op_list.cc | 3 +- paddle/phi/kernels/xpu/adamw_kernel.cc | 383 ++++++++++++++++-- .../kernels/xpu/reduce_mean_grad_kernel.cc | 8 +- python/paddle/optimizer/adamw.py | 4 +- test/xpu/test_adamw_op_xpu.py | 198 ++++++++- test/xpu/test_flash_attention_op_xpu.py | 2 +- 8 files changed, 566 insertions(+), 42 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 73210ac9fbc56..bd2471e0f7e1d 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -29,7 +29,7 @@ if(NOT DEFINED XPU_BASE_DATE) set(XPU_BASE_DATE "20240104") endif() if(NOT DEFINED XPU_XHPC_BASE_DATE) - set(XPU_XHPC_BASE_DATE "20240218") + set(XPU_XHPC_BASE_DATE "20240222") endif() set(XPU_XCCL_BASE_VERSION "1.1.8.1") if(NOT DEFINED XPU_XFT_BASE_VERSION) diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 491b47442725a..55aae9f24c1a6 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -36,7 +36,10 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"accuracy", XPUKernelSet({phi::DataType::FLOAT32})}, {"adadelta", XPUKernelSet({phi::DataType::FLOAT32})}, - {"adamw", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"adamw", + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::BFLOAT16})}, {"adam", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"adam_dense_param_sparse_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, @@ -723,7 +726,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::INT32, phi::DataType::INT64, phi::DataType::FLOAT16})}, - {"reduce_mean_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"reduce_mean_grad", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"reduce_mean", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16, diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc index 1d3d6001bca9c..39e79ba0c4934 100644 --- a/paddle/phi/backends/xpu/xpu3_op_list.cc +++ b/paddle/phi/backends/xpu/xpu3_op_list.cc @@ -721,7 +721,8 @@ XPUOpMap& 
get_kl3_ops() {
                     phi::DataType::BFLOAT16,
                     phi::DataType::INT32,
                     phi::DataType::INT64})},
-    {"reduce_mean_grad", XPUKernelSet({phi::DataType::FLOAT32})},
+    {"reduce_mean_grad",
+     XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"reduce_mean",
      XPUKernelSet({phi::DataType::FLOAT32,
                    phi::DataType::FLOAT16,
diff --git a/paddle/phi/kernels/xpu/adamw_kernel.cc b/paddle/phi/kernels/xpu/adamw_kernel.cc
index 4df7ab633ab4e..ca39a9932a609 100644
--- a/paddle/phi/kernels/xpu/adamw_kernel.cc
+++ b/paddle/phi/kernels/xpu/adamw_kernel.cc
@@ -24,6 +24,8 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 
+#include "paddle/phi/common/amp_type_traits.h"
+
 namespace phi {
 
 template <typename T, typename Context>
@@ -44,6 +46,234 @@ float GetAbsMax(const Context& dev_ctx,
   return *std::max_element(buffer_cpu.begin(), buffer_cpu.end());
 }
 
+template <typename T, typename Context>
+void AdamwDenseKernelKL3(const Context& dev_ctx,
+                         const DenseTensor& param,
+                         const DenseTensor& grad,
+                         const DenseTensor& learning_rate,
+                         const DenseTensor& moment1,
+                         const DenseTensor& moment2,
+                         const DenseTensor& beta1_pow,
+                         const DenseTensor& beta2_pow,
+                         const paddle::optional<DenseTensor>& master_param,
+                         const paddle::optional<DenseTensor>& skip_update,
+                         const Scalar& beta1,
+                         const Scalar& beta2,
+                         const Scalar& epsilon,
+                         float lr_ratio,
+                         float coeff,
+                         bool with_decay,
+                         bool lazy_mode,
+                         int64_t min_row_size_to_use_multithread,
+                         bool multi_precision,
+                         bool use_global_beta_pow,
+                         DenseTensor* param_out,
+                         DenseTensor* moment1_out,
+                         DenseTensor* moment2_out,
+                         DenseTensor* beta1_pow_out,
+                         DenseTensor* beta2_pow_out,
+                         DenseTensor* master_param_outs) {
+  // TODO(houj04): once KL3 is stable and KL1/KL2 no longer need to be
+  // supported, replace AdamwDenseKernel with this AdamwDenseKernelKL3.
+  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
+  using XPUType = typename XPUTypeTrait<T>::Type;
+
+  const auto grad_type = grad.dtype();
+
+  VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
+
+  MPDType coeff_ = static_cast<MPDType>(coeff);
+  MPDType lr_ratio_ = static_cast<MPDType>(lr_ratio);
+
+  bool skip_update_ = false;
+  if (skip_update.is_initialized()) {
+    PADDLE_ENFORCE_EQ(
+        skip_update->numel(),
+        1,
+        errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
+                                skip_update->numel()));
+    std::vector<bool> skip_update_vec;
+    phi::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
+    skip_update_ = skip_update_vec[0];
+  }
+
+  // skip_update=true, just copy input to output
+  if (skip_update_) {
+    VLOG(4) << "Adamw skip update";
+    phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out);
+    phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out);
+    phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out);
+    if (!use_global_beta_pow) {
+      phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, beta1_pow_out);
+      phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, beta2_pow_out);
+    }
+    return;
+  }
+
+  // if with_decay = false, coeff = 0
+  if (!with_decay) {
+    coeff_ = static_cast<MPDType>(0.0);
+  }
+
+  MPDType beta1_ = beta1.to<MPDType>();
+  MPDType beta2_ = beta2.to<MPDType>();
+  MPDType epsilon_ = epsilon.to<MPDType>();
+  VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel()
+          << "beta2_pow.numel() : " << beta2_pow.numel();
+  VLOG(3) << "param.numel(): " << param.numel();
+  PADDLE_ENFORCE_EQ(
+      beta1_pow_out->numel(),
+      1,
+      errors::InvalidArgument("beta1 pow output size should be 1, but received "
+                              "value is:%d.",
+                              beta1_pow_out->numel()));
+
+  PADDLE_ENFORCE_EQ(
+      beta2_pow_out->numel(),
+      1,
+      errors::InvalidArgument("beta2 pow output size should be 1, but received "
+                              "value is:%d.",
+
beta2_pow_out->numel())); + + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision ? dev_ctx.template Alloc(master_param_outs) + : nullptr; + // template DLL_EXPORT int + // adamw_v2(Context* ctx, MT beta1, MT beta2, MT epsilon, MT coeff, MT + // lr_ratio, const MT* beta1_pow, MT* beta1_pow_out, const MT* beta2_pow, MT* + // beta2_pow_out, const MT* moment1, MT* moment1_out, const MT* moment2, MT* + // moment2_out, const MT* lr, const TG* grad, const T* param, T* param_out, + // const MT* master_param, MT* master_param_out, int64_t n); + + if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { + DenseTensor xpu_beta1_pow; + DenseTensor xpu_beta2_pow; + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, &xpu_beta1_pow); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, &xpu_beta2_pow); + dev_ctx.Wait(); + const MPDType* beta1_pow_ptr = xpu_beta1_pow.data(); + const MPDType* beta2_pow_ptr = xpu_beta2_pow.data(); + + if (grad_type == phi::DataType::FLOAT32) { + int r = xpu::adamw_v2( + dev_ctx.x_context(), + beta1_, + beta2_, + epsilon_, + coeff_, + lr_ratio_, + beta1_pow_ptr, + nullptr, + beta2_pow_ptr, + nullptr, + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad.data(), + reinterpret_cast(param.data()), + reinterpret_cast(dev_ctx.template Alloc(param_out)), + master_in_data, + master_out_data, + param.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + } else { + int r = xpu::adamw_v2( + dev_ctx.x_context(), + beta1_, + beta2_, + epsilon_, + coeff_, + lr_ratio_, + beta1_pow_ptr, + nullptr, + beta2_pow_ptr, + nullptr, + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + reinterpret_cast(grad.data()), + reinterpret_cast(param.data()), + reinterpret_cast(dev_ctx.template Alloc(param_out)), + master_in_data, + master_out_data, + param.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + } + if (!use_global_beta_pow) { + // Cpu update + dev_ctx.template HostAlloc(beta1_pow_out)[0] = + beta1_ * beta1_pow.data()[0]; + dev_ctx.template HostAlloc(beta2_pow_out)[0] = + beta2_ * beta2_pow.data()[0]; + } + } else { + MPDType* beta1_pow_out_ptr = nullptr; + MPDType* beta2_pow_out_ptr = nullptr; + + if (!use_global_beta_pow) { + beta1_pow_out_ptr = dev_ctx.template Alloc(beta1_pow_out); + beta2_pow_out_ptr = dev_ctx.template Alloc(beta2_pow_out); + } + + if (grad_type == phi::DataType::FLOAT32) { + int r = xpu::adamw_v2( + dev_ctx.x_context(), + beta1_, + beta2_, + epsilon_, + coeff_, + lr_ratio_, + beta1_pow.data(), + beta1_pow_out_ptr, + beta2_pow.data(), + beta2_pow_out_ptr, + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + grad.data(), + reinterpret_cast(param.data()), + reinterpret_cast(dev_ctx.template Alloc(param_out)), + master_in_data, + master_out_data, + param.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + } else { + int r = xpu::adamw_v2( + dev_ctx.x_context(), + beta1_, + beta2_, + epsilon_, + coeff_, + lr_ratio_, + beta1_pow.data(), + beta1_pow_out_ptr, + beta2_pow.data(), + beta2_pow_out_ptr, + moment1.data(), + dev_ctx.template Alloc(moment1_out), + moment2.data(), + dev_ctx.template Alloc(moment2_out), + learning_rate.data(), + reinterpret_cast(grad.data()), + reinterpret_cast(param.data()), + 
reinterpret_cast(dev_ctx.template Alloc(param_out)), + master_in_data, + master_out_data, + param.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + } + } + return; +} + template void AdamwDenseKernel(const Context& dev_ctx, const DenseTensor& param, @@ -71,6 +301,38 @@ void AdamwDenseKernel(const Context& dev_ctx, DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { + auto dev_version = + phi::backends::xpu::get_xpu_version(dev_ctx.GetPlace().GetDeviceId()); + if (dev_version == phi::backends::xpu::XPUVersion::XPU3) { + AdamwDenseKernelKL3(dev_ctx, + param, + grad, + learning_rate, + moment1, + moment2, + beta1_pow, + beta2_pow, + master_param, + skip_update, + beta1, + beta2, + epsilon, + lr_ratio, + coeff, + with_decay, + lazy_mode, + min_row_size_to_use_multithread, + multi_precision, + use_global_beta_pow, + param_out, + moment1_out, + moment2_out, + beta1_pow_out, + beta2_pow_out, + master_param_outs); + return; + } + // check moment_dtype auto moment1_dtype = moment1.dtype(); auto moment2_dtype = moment2.dtype(); @@ -228,30 +490,85 @@ void AdamwDenseKernel(const Context& dev_ctx, 0.0f); PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); - // int adamw(Context* ctx, const T* g, const float* mom1, const float* mom2, - // const T* param, const float* beta1_pow, const float* beta2_pow, const - // float* lr, float* moment1_out, float* moment2_out, T* param_out, float - // beta1, float beta2, float epsilon, float coeff, int64_t n); - r = xpu::adamw( - dev_ctx.x_context(), - reinterpret_cast(grad.template data()), - moment_in_fp16 ? moment1_input_for_xdnn : moment1.template data(), - moment_in_fp16 ? moment2_input_for_xdnn : moment2.template data(), - reinterpret_cast(param.template data()), - beta1_pow_ptr, - beta2_pow_ptr, - new_lr, - moment_in_fp16 ? moment1_output_for_xdnn - : dev_ctx.template Alloc(moment1_out), - moment_in_fp16 ? moment2_output_for_xdnn - : dev_ctx.template Alloc(moment2_out), - reinterpret_cast(dev_ctx.template Alloc(param_out)), - beta1_, - beta2_, - epsilon_, - coeff, - param.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + if (multi_precision) { + const float* master_param_in_data = master_param->data(); + float* master_param_out_data = + dev_ctx.template Alloc(master_param_outs); + // convert grad to float if necessary + float* grad_fp32 = nullptr; + const auto grad_type = grad.dtype(); + if (grad_type != phi::DataType::FLOAT32) { + grad_fp32 = RAII_GUARD.alloc_l3_or_gm(grad.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(grad_fp32); + // int cast(Context* ctx, const TX* x, TY* y, int64_t len); + int r = xpu::cast( + dev_ctx.x_context(), + reinterpret_cast(grad.template data()), + grad_fp32, + grad.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + } + // int adamw(Context* ctx, const T* g, const float* mom1, const float* mom2, + // const T* param, const float* beta1_pow, const float* beta2_pow, const + // float* lr, float* moment1_out, float* moment2_out, T* param_out, float + // beta1, float beta2, float epsilon, float coeff, int64_t n); + r = xpu::adamw( + dev_ctx.x_context(), + (grad_type == phi::DataType::FLOAT32) ? grad.data() : grad_fp32, + moment_in_fp16 ? moment1_input_for_xdnn + : moment1.template data(), + moment_in_fp16 ? moment2_input_for_xdnn + : moment2.template data(), + master_param_in_data, + beta1_pow_ptr, + beta2_pow_ptr, + new_lr, + moment_in_fp16 ? moment1_output_for_xdnn + : dev_ctx.template Alloc(moment1_out), + moment_in_fp16 ? 
moment2_output_for_xdnn + : dev_ctx.template Alloc(moment2_out), + master_param_out_data, + beta1_, + beta2_, + epsilon_, + coeff, + param.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + // convert master_param_out(fp32) to param_out(T) + r = xpu::cast( + dev_ctx.x_context(), + master_param_out_data, + reinterpret_cast(dev_ctx.template Alloc(param_out)), + param_out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + } else { + // int adamw(Context* ctx, const T* g, const float* mom1, const float* mom2, + // const T* param, const float* beta1_pow, const float* beta2_pow, const + // float* lr, float* moment1_out, float* moment2_out, T* param_out, float + // beta1, float beta2, float epsilon, float coeff, int64_t n); + r = xpu::adamw( + dev_ctx.x_context(), + reinterpret_cast(grad.template data()), + moment_in_fp16 ? moment1_input_for_xdnn + : moment1.template data(), + moment_in_fp16 ? moment2_input_for_xdnn + : moment2.template data(), + reinterpret_cast(param.template data()), + beta1_pow_ptr, + beta2_pow_ptr, + new_lr, + moment_in_fp16 ? moment1_output_for_xdnn + : dev_ctx.template Alloc(moment1_out), + moment_in_fp16 ? moment2_output_for_xdnn + : dev_ctx.template Alloc(moment2_out), + reinterpret_cast(dev_ctx.template Alloc(param_out)), + beta1_, + beta2_, + epsilon_, + coeff, + param.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + } if (moment_in_fp16) { int r = 0; @@ -369,11 +686,15 @@ PD_REGISTER_KERNEL(adamw, kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); - // Skip beta1_pow, beta2_pow, skip_update data transform - kernel->OutputAt(3) - .SetBackend(phi::Backend::UNDEFINED) - .SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(4) - .SetBackend(phi::Backend::UNDEFINED) - .SetDataType(phi::DataType::FLOAT32); + + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); + } + kernel->OutputAt(3).SetBackend(phi::Backend::UNDEFINED); + kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); } diff --git a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc index c5b0950552629..37ace904b2b80 100644 --- a/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_mean_grad_kernel.cc @@ -84,5 +84,9 @@ void ReduceMeanGradKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - mean_grad, XPU, ALL_LAYOUT, phi::ReduceMeanGradKernel, float) {} +PD_REGISTER_KERNEL(mean_grad, + XPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + float, + phi::dtype::float16) {} diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index b14f8603be89e..f3a23ce846bf1 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -177,9 +177,9 @@ def __init__( assert epsilon is not None if not isinstance(beta1, Value) and not 0 <= beta1 < 1: raise ValueError("Invalid value of beta1, expect beta1 in [0,1).") - if not isinstance(beta1, Value) and not 0 <= beta2 < 1: + if not isinstance(beta2, Value) and not 0 <= beta2 < 1: raise ValueError("Invalid value of beta2, expect beta2 in [0,1).") - if not 
isinstance(beta1, Value) and not 0 <= epsilon: + if not isinstance(epsilon, Value) and not 0 <= epsilon: raise ValueError("Invalid value of epsilon, expect epsilon >= 0.") if not isinstance(weight_decay, float) and not isinstance( weight_decay, (framework.Variable, Value) diff --git a/test/xpu/test_adamw_op_xpu.py b/test/xpu/test_adamw_op_xpu.py index b9120779c40f6..f8e0b7cd545bf 100644 --- a/test/xpu/test_adamw_op_xpu.py +++ b/test/xpu/test_adamw_op_xpu.py @@ -59,8 +59,8 @@ def adamw_step(inputs, attributes): moment1_out = beta1 * moment1 + (1 - beta1) * grad moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) - lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) - param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon)) + denom = (np.sqrt(moment2_out) / np.sqrt(1.0 - beta2_pow)) + epsilon + param_out = param + ((moment1_out / denom) * (-(lr / (1.0 - beta1_pow)))) return param_out, moment1_out, moment2_out @@ -650,6 +650,200 @@ def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t): paddle.disable_static() +class TestAdamWOpMultiPrecisonWithMainGrad(unittest.TestCase): + def _test_adamw_op_dygraph_place_amp_with_maingrad( + self, place, shape, use_main_grad + ): + paddle.disable_static() + paddle.seed(10) + paddle.set_device(place) + + found_inf = None + + _weight_decay = 0.1 + with_decay = True + _lazy_mode = False + find_master = True + + _epsilon = 1e-8 + + _beta1 = 0.9 + _beta2 = 0.99 + lr_ratio_ = 1.0 + + lr_rate = 1e-8 + + param = paddle.randn(shape).astype(paddle.bfloat16) + master_weight = param.astype(paddle.float32) + grad = paddle.randn(shape).astype(paddle.bfloat16) + main_grad = grad.astype(paddle.float32) + moment1 = paddle.randn(shape).astype(paddle.float32) + moment2 = paddle.randn(shape).astype(paddle.float32).abs() + lr = paddle.zeros([1]).astype(paddle.float32) + lr[0] = lr_rate + beta1_pow_acc = paddle.ones([1]).astype(paddle.float32) + beta1_pow_acc[0] = _beta1**10 + beta2_pow_acc = paddle.ones([1]).astype(paddle.float32) + beta2_pow_acc[0] = _beta2**10 + + ref_param = param.astype(paddle.float32) + ref_beta1_pow_acc = beta1_pow_acc.astype(paddle.float32) + ref_beta2_pow_acc = beta2_pow_acc.astype(paddle.float32) + ref_moment_1 = moment1.astype(paddle.float32) + ref_moment_2 = moment2.astype(paddle.float32) + + # reference code + _, _, _, _, _, _ = paddle._C_ops.adamw_( + ref_param, + main_grad, + lr, + ref_moment_1, + ref_moment_2, + ref_beta1_pow_acc, + ref_beta2_pow_acc, + master_weight, + found_inf, + _beta1, + _beta2, + _epsilon, + lr_ratio_, + _weight_decay, + with_decay, + _lazy_mode, + 1000, + False, + False, + ) + + if use_main_grad: + _, _, _, _, _, _ = paddle._C_ops.adamw_( + param, + main_grad, + lr, + moment1, + moment2, + beta1_pow_acc, + beta2_pow_acc, + master_weight, + found_inf, + _beta1, + _beta2, + _epsilon, + lr_ratio_, + _weight_decay, + with_decay, + _lazy_mode, + 1000, + find_master, + False, + ) + np.testing.assert_allclose( + param.astype("float32").numpy(), ref_param.numpy(), rtol=1e-2 + ) + np.testing.assert_allclose( + master_weight.numpy(), ref_param.numpy(), rtol=1e-6 + ) + else: + _, _, _, _, _, _ = paddle._C_ops.adamw_( + param, + grad, + lr, + moment1, + moment2, + beta1_pow_acc, + beta2_pow_acc, + master_weight, + found_inf, + _beta1, + _beta2, + _epsilon, + lr_ratio_, + _weight_decay, + with_decay, + _lazy_mode, + 1000, + find_master, + False, + ) + np.testing.assert_allclose( + param.astype("float32").numpy(), ref_param.numpy(), rtol=1e-2 + ) + np.testing.assert_allclose( + 
master_weight.numpy(), ref_param.numpy(), rtol=1e-6 + ) + + def _get_places(self): + places = [] + if paddle.is_compiled_with_xpu(): + places.append('xpu') + return places + + def test_main(self): + for _ in range(1): + shape = paddle.randint(1, 1024, [2]) + for place in self._get_places(): + use_main_grad_list = [True, False] + for use_main_grad in use_main_grad_list: + self._test_adamw_op_dygraph_place_amp_with_maingrad( + place, shape, use_main_grad + ) + + +class TestAdamWOpMultiPrecison(unittest.TestCase): + def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False): + paddle.disable_static() + paddle.seed(10) + paddle.set_device(place) + + input = paddle.randn((5, 5)) + + model = paddle.nn.Linear(5, 5) + + optimizer = paddle.optimizer.AdamW( + parameters=[ + { + 'params': model.parameters(), + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99, + } + ], + multi_precision=use_amp, + ) + + for idx in range(2): + if place == 'xpu' and use_amp: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + if place == 'xpu' and use_amp: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + def _get_places(self): + places = ['cpu'] + if paddle.is_compiled_with_xpu(): + places.append('xpu') + return places + + def test_main(self): + for place in self._get_places(): + use_amp_list = [True, False] + for use_amp in use_amp_list: + self._test_adamw_op_dygraph_place_amp(place, use_amp) + + support_types = get_xpu_op_support_types('adamw') for stype in support_types: create_test_class(globals(), XPUTestAdamwOp1, stype) diff --git a/test/xpu/test_flash_attention_op_xpu.py b/test/xpu/test_flash_attention_op_xpu.py index 8aadadfc40ecc..372a2ee91f1dd 100644 --- a/test/xpu/test_flash_attention_op_xpu.py +++ b/test/xpu/test_flash_attention_op_xpu.py @@ -79,7 +79,7 @@ def setUp(self): def test_all(self): self.run_case(dtype="float32", tolerance=5e-4, tolerance_dv=5e-4) self.run_case(dtype="float16", tolerance=5e-4, tolerance_dv=1e-3) - self.run_case(dtype="bfloat16", tolerance=5e-3, tolerance_dv=1e-2) + self.run_case(dtype="bfloat16", tolerance=6e-3, tolerance_dv=1e-2) def run_case(self, dtype, tolerance, tolerance_dv): # TODO(houj04) remove debug codes after correctness check From bd3639b80740dacbe6c754952e1677c4714b1acd Mon Sep 17 00:00:00 2001 From: skywalker2012 <108259496+skywalker2012@users.noreply.github.com> Date: Fri, 23 Feb 2024 18:16:19 +0800 Subject: [PATCH 41/82] [XPU] add support for local path compilation (#61970) * [XPU] add support for local path compilation * [XPU] pre-commit change * [XPU] remove os dependency --- tools/xpu/check_xpu_dependence.sh | 13 +++ tools/xpu/pack_paddle_depence.sh | 135 ++++++++++++++++++++++-------- 2 files changed, 111 insertions(+), 37 deletions(-) diff --git a/tools/xpu/check_xpu_dependence.sh b/tools/xpu/check_xpu_dependence.sh index 3091205f12ea3..2dffe5fea6824 100644 --- a/tools/xpu/check_xpu_dependence.sh +++ b/tools/xpu/check_xpu_dependence.sh @@ -23,6 +23,7 @@ fi xpu_base_url=$1 xccl_base_url=$2 +BOS_PATTERN="https://baidu-kunlun-product.su.bcebos.com" echo "xpu_base_url: $xpu_base_url" echo "xccl_base_url: $xccl_base_url" @@ -73,6 +74,18 @@ function check_files() { rm -rf ./$local_dir } +# check xpu_base_url type +if [[ 
$xpu_base_url != *"$BOS_PATTERN"* ]]; then + echo "The xpu_base_url does not contain bos url, assume it is local path" + if [[ ! -d $xpu_base_url ]]; then + echo "The xpu_base_url does not exist, please check it" + exit 1 + fi + exit 0 +else + echo "The URL is a bos url, will follow default download & compile logic" +fi + # XRE xre_tar_file_names=("xre-kylin_aarch64" "xre-bdcentos_x86_64" "xre-ubuntu_x86_64" "xre-centos7_x86_64") xre_inner_file_names=("include/xpu/runtime.h" "so/libxpurt.so") diff --git a/tools/xpu/pack_paddle_depence.sh b/tools/xpu/pack_paddle_depence.sh index 2df7c98cbf48a..25fe0bd1c51b9 100644 --- a/tools/xpu/pack_paddle_depence.sh +++ b/tools/xpu/pack_paddle_depence.sh @@ -17,12 +17,6 @@ set -e set -x -if [[ $# -eq 8 ]]; then - echo "Compiling Paddle with XHPC" - XHPC_URL=$7 - XHPC_DIR_NAME=$8 -fi - XRE_URL=$1 XRE_DIR_NAME=$2 @@ -32,41 +26,108 @@ XDNN_DIR_NAME=$4 XCCL_URL=$5 XCCL_DIR_NAME=$6 -wget --no-check-certificate ${XRE_URL} -q -O xre.tar.gz -tar xvf xre.tar.gz - -wget --no-check-certificate ${XDNN_URL} -q -O xdnn.tar.gz -tar xvf xdnn.tar.gz +if [[ $# -eq 8 ]]; then + echo "Compiling Paddle with XHPC" + XHPC_URL=$7 + XHPC_DIR_NAME=$8 +elif [[ $# -eq 7 ]]; then + XHPC_DIR_NAME=$7 +fi -wget --no-check-certificate ${XCCL_URL} -q -O xccl.tar.gz -tar xvf xccl.tar.gz +BOS_PATTERN="https://baidu-kunlun-product.su.bcebos.com" +mkdir -p xpu/include/xhpc/xblas +mkdir -p xpu/include/xhpc/xfa mkdir -p xpu/include/xpu mkdir -p xpu/lib -if ! [ -z ${XHPC_URL} ]; then - echo "Compiling Paddle with XHPC" - echo "XHPC_URL: ${XHPC_URL}" - wget --no-check-certificate ${XHPC_URL} -q -O xhpc.tar.gz - tar xvf xhpc.tar.gz - - mkdir -p xpu/include/xhpc/xblas - mkdir -p xpu/include/xhpc/xfa - - cp -r ${XHPC_DIR_NAME}/xblas/include/* xpu/include/xhpc/xblas - cp -r ${XHPC_DIR_NAME}/xblas/so/* xpu/lib/ - - cp -r ${XHPC_DIR_NAME}/xdnn/include/* xpu/include/ - cp -r ${XHPC_DIR_NAME}/xdnn/so/* xpu/lib - - cp -r ${XHPC_DIR_NAME}/xfa/include/* xpu/include/xhpc/xfa - cp -r ${XHPC_DIR_NAME}/xfa/so/* xpu/lib/ +function download_from_bos() { + wget --no-check-certificate ${XRE_URL} -q -O xre.tar.gz + tar xvf xre.tar.gz + + wget --no-check-certificate ${XDNN_URL} -q -O xdnn.tar.gz + tar xvf xdnn.tar.gz + + wget --no-check-certificate ${XCCL_URL} -q -O xccl.tar.gz + tar xvf xccl.tar.gz +} + +function xhpc_prepare() { + if ! [ -z ${XHPC_URL} ]; then + echo "XHPC_URL: ${XHPC_URL}" + wget --no-check-certificate ${XHPC_URL} -q -O xhpc.tar.gz + tar xvf xhpc.tar.gz + + cp -r ${XHPC_DIR_NAME}/xblas/include/* xpu/include/xhpc/xblas + cp -r ${XHPC_DIR_NAME}/xblas/so/* xpu/lib/ + + cp -r ${XHPC_DIR_NAME}/xdnn/include/* xpu/include/ + cp -r ${XHPC_DIR_NAME}/xdnn/so/* xpu/lib + + cp -r ${XHPC_DIR_NAME}/xfa/include/* xpu/include/xhpc/xfa + cp -r ${XHPC_DIR_NAME}/xfa/so/* xpu/lib/ + else + cp -r ${XDNN_DIR_NAME}/include/xpu/* xpu/include/xpu/ + cp -r ${XDNN_DIR_NAME}/so/* xpu/lib/ + fi +} + +function local_prepare() { + # xre prepare + if [[ ! -d ${LOCAL_PATH}/${XRE_DIR_NAME} ]]; then + XRE_TAR_NAME=${XRE_DIR_NAME}.tar.gz + tar -zxf ${LOCAL_PATH}/${XRE_TAR_NAME} -C ${LOCAL_PATH} + fi + + # xccl prepare + if [[ ! -d ${LOCAL_PATH}/${XCCL_DIR_NAME} ]]; then + XCCL_TAR_NAME=${XCCL_DIR_NAME}.tar.gz + tar -zxf ${LOCAL_PATH}/${XCCL_TAR_NAME} -C ${LOCAL_PATH} + fi + + # xhpc prepare + if [[ ! 
-d ${LOCAL_PATH}/${XHPC_DIR_NAME} ]]; then + XHPC_TAR_NAME=${XHPC_DIR_NAME}.tar.gz + tar -zxf ${LOCAL_PATH}/${XHPC_TAR_NAME} -C ${LOCAL_PATH} + fi +} + +function local_assemble() { + # xre assemble + cp -r ${LOCAL_PATH}/$XRE_DIR_NAME/include/xpu/* xpu/include/xpu/ + cp -r ${LOCAL_PATH}/$XRE_DIR_NAME/so/libxpurt* xpu/lib/ + + # xccl assemble + cp -r ${LOCAL_PATH}/$XCCL_DIR_NAME/include/* xpu/include/xpu/ + cp -r ${LOCAL_PATH}/$XCCL_DIR_NAME/so/* xpu/lib/ + + # xhpc assemble + cp -r ${LOCAL_PATH}/${XHPC_DIR_NAME}/xblas/include/* xpu/include/xhpc/xblas + cp -r ${LOCAL_PATH}/${XHPC_DIR_NAME}/xblas/so/* xpu/lib/ + + cp -r ${LOCAL_PATH}/${XHPC_DIR_NAME}/xdnn/include/* xpu/include/ + cp -r ${LOCAL_PATH}/${XHPC_DIR_NAME}/xdnn/so/* xpu/lib + + cp -r ${LOCAL_PATH}/${XHPC_DIR_NAME}/xfa/include/* xpu/include/xhpc/xfa + cp -r ${LOCAL_PATH}/${XHPC_DIR_NAME}/xfa/so/* xpu/lib/ +} + +if [[ $XRE_URL != *"$BOS_PATTERN"* ]]; then + # below is local way + build_from="local" + LOCAL_PATH=$(dirname "$XRE_URL") + echo "LOCAL_PATH: ${LOCAL_PATH}" + + local_prepare + local_assemble else - cp -r $XDNN_DIR_NAME/include/xpu/* xpu/include/xpu/ - cp -r $XDNN_DIR_NAME/so/* xpu/lib/ + # below is default way + build_from="bos" + download_from_bos + xhpc_prepare + + cp -r $XRE_DIR_NAME/include/xpu/* xpu/include/xpu/ + cp -r $XRE_DIR_NAME/so/libxpurt* xpu/lib/ + cp -r $XCCL_DIR_NAME/include/* xpu/include/xpu/ + cp -r $XCCL_DIR_NAME/so/* xpu/lib/ fi - -cp -r $XRE_DIR_NAME/include/xpu/* xpu/include/xpu/ -cp -r $XRE_DIR_NAME/so/libxpurt* xpu/lib/ -cp -r $XCCL_DIR_NAME/include/* xpu/include/xpu/ -cp -r $XCCL_DIR_NAME/so/* xpu/lib/ From 8de4febee77c74f5be3a549780ba64738fd1f902 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Fri, 23 Feb 2024 18:45:19 +0800 Subject: [PATCH 42/82] fix (#61934) --- python/setup_cinn.py.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup_cinn.py.in b/python/setup_cinn.py.in index f2fcd3029d231..cbdef191c4cd3 100644 --- a/python/setup_cinn.py.in +++ b/python/setup_cinn.py.in @@ -185,7 +185,7 @@ if platform.system() == 'Linux' and platform.machine() == 'x86_64': paddle_cuda_install_requirements = os.getenv( "PADDLE_CUDA_INSTALL_REQUIREMENTS", None ) - if paddle_cuda_install_requirements is not None: + if paddle_cuda_install_requirements == "ON": PADDLE_CUDA_INSTALL_REQUIREMENTS = { "V11": ( "nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " From a68d9ccbd68492e0070c890ecb2a3eaee5ba36b9 Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Fri, 23 Feb 2024 19:51:13 +0800 Subject: [PATCH 43/82] [PIR]Fix infershape error when infershape function has MetaConfig (#62010) --- .../pir/dialect/op_generator/op_build_gen.py | 110 ++++++++++++++++-- paddle/phi/infermeta/unary.cc | 1 + 2 files changed, 101 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py index 7b079605a2460..3365421990f1b 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py @@ -14,20 +14,110 @@ # generator build function _INFERMETA_NEED_META_CONFIG = { - 'SplitInferMeta', - 'SumInferMeta', - 'SplitWithNumInferMeta', + # binary.h + 'AllValueCompareInferMeta', + 'KLDivInferMeta', + 'ArrayWriteInferMeta', + 'ArrayReadInferMeta', + 'BCELossInferMeta', + 'BinomialInferMeta', + 'ConvInferMeta', + 'Conv3DInferMeta', + 
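One detail in the setup_cinn.py.in hunk of the previous commit deserves a note: os.getenv returns the variable's string value whenever it is set at all, so the old `is not None` test treated PADDLE_CUDA_INSTALL_REQUIREMENTS=OFF exactly like ON. A minimal standalone sketch of the difference (plain Python, not Paddle build code):

    import os

    # Simulate a user who exported the flag but meant to disable it.
    os.environ["PADDLE_CUDA_INSTALL_REQUIREMENTS"] = "OFF"
    flag = os.getenv("PADDLE_CUDA_INSTALL_REQUIREMENTS", None)

    print(flag is not None)  # True  -- old check: "OFF" still pulled in the deps
    print(flag == "ON")      # False -- new check: only an explicit "ON" does
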
'ConvTransposeInferMeta', + 'Conv2dTransposeInferMeta', + 'CrossEntropyWithSoftmaxInferMeta', + 'CSoftmaxWithCrossEntropyInferMeta', + 'DepthwiseConvInferMeta', + 'DistributeFpnProposalsInferMeta', + 'ElementwiseRawInferMeta', + 'GridSampleBaseInferMeta', + 'HuberLossInferMeta', + 'IndexSampleInferMeta', + 'LogLossInferMeta', + 'MarginCrossEntropyInferMeta', + 'MatrixNMSInferMeta', + 'PReluInferMeta', + 'SegmentPoolInferMeta', + 'YoloBoxInferMeta', + 'ValueCompareInferMeta', + 'UnpoolInferMeta', + 'Unpool3dInferMeta', + # fusion.h + 'FusedAttentionInferMeta', + 'BNActXPUInferMeta', + 'FusedFCElementwiseLayerNormInferMeta', + 'FusedConv2dAddActInferMeta', + # multiary.h + 'AddNTensorArrayInferMeta', + 'AddNInferMeta', + 'AucInferMeta', + 'BatchNormInferMeta', + 'BatchNormInferInferMeta', + 'BilinearInferMeta', + 'CoalesceTensorInferMeta', + 'CheckMemoryContinueInferMeta', 'ConcatInferMeta', + 'DeformableConvInferMeta', + 'FusedBiasActInferMeta', + 'InterpolateInferMeta', + 'NceInferMeta', + 'SigmoidCrossEntropyWithLogitsInferMeta', + 'StackInferMeta', + 'FusedConvInferMeta', + # nullary.h + 'EyeInferMeta', + # ternary.h + 'AccuracyInferMeta', + 'BoxCoderInferMeta', + 'InstanceNormInferMeta', + 'LayerNormInferMeta', + 'MatchMatrixTensorInferMeta', + 'MultiClassNMSInferMeta', + 'NllLossRawInferMeta', + 'RoiAlignInferMeta', + 'SpectralNormInferMeta', + 'ViterbiDecodeInferMeta', + 'TdmSamplerInferMeta', + # unary.h + 'GetSplitAxisValue', + 'ArgMinMaxInferMeta', + 'ArrayToTensorInferMeta', + 'CropInferMeta', + 'EigvalsInferMeta', + 'FractionalMaxPoolInferMeta', + 'MaxPoolWithIndexInferMeta', + 'MaxPoolV2InferMeta', + 'MultinomialInferMeta', + 'OverlapAddInferMeta', + 'PadInferMeta', + 'Pad3dInferMeta', + 'PoolInferMeta', + 'Pool2DInferMeta', + 'ReduceIntArrayAxisInferMetaBase', 'ReduceIntArrayAxisInferMeta', + 'ReshapeInferMeta', 'ReshapeWithXShapeInferMeta', + 'ReverseInferMeta', + 'ReverseArrayInferMeta', + 'ShardIndexInferMeta', + 'SliceArrayInferMeta', + 'SliceArrayDenseInferMeta', 'SliceRawInferMeta', - 'StackInferMeta', - 'Conv2dTransposeInferMeta', - 'FusedConv2dAddActInferMeta', - 'InterpolateInferMeta', - 'DeformableConvInferMeta', - 'MatrixNMSInferMeta', - 'IndexSampleInferMeta', + 'SplitInferMeta', + 'SplitWithNumInferMeta', + 'SqueezeInferMeta', + 'SqueezeWithXShapeInferMeta', + 'StridedSliceRawInferMeta', + 'StridedSliceInferMeta', + 'SumInferMeta', + 'SumRawInferMeta', + 'TemporalShiftInferMeta', + 'TileInferMeta', + 'TopKInferMeta', + 'UnfoldInferMeta', + 'UnsqueezeInferMeta', + 'UnsqueezeWithXShapeInferMeta', + 'ArrayPopInferMeta', } _PREPARE_DATA_WITH_VECTOR_INT64_MTTABLE_ATTRIBUTE = {'FrobeniusNormOp'} diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 3b47085eee9b1..42eaa2670a0b5 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3595,6 +3595,7 @@ void ReshapeInferMeta(const MetaTensor& x, if (!config.is_runtime && shape.FromTensor()) { out->set_dims(common::make_ddim(shape_data)); out->share_lod(x); + out->set_dtype(x.dtype()); return; } InferMetaFromVecValue(x, shape_data, out); From c3074d6061a4f5017363ed121ec0436d53a5fe83 Mon Sep 17 00:00:00 2001 From: diadestiny <44188454+diadestiny@users.noreply.github.com> Date: Fri, 23 Feb 2024 21:22:52 +0800 Subject: [PATCH 44/82] [SOT][3.12] Support `END_FOR` opcode by skiping `END_FOR` in `FOR_ITER` in Python 3.12 (#62008) --- .../jit/sot/opcode_translator/executor/opcode_executor.py | 2 ++ test/sot/skip_files_py312 | 4 ---- 2 files changed, 2 insertions(+), 4 
deletions(-) diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py index ccfae0a888f02..e9a985e5b728c 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@ -2100,6 +2100,8 @@ def FOR_ITER(self, instr): self._inline_call_for_loop(iterator, instr) self._lasti = self.indexof(instr.jump_to) + next_instr = self._instructions[self._lasti] + self._lasti += int(next_instr.opname == 'END_FOR') except BreakGraphError as e: log(3, f"[BreakGraph] FOR_ITER sim for loop failed for: {e}\n") if backup_iter_idx: diff --git a/test/sot/skip_files_py312 b/test/sot/skip_files_py312 index d79956533e2d3..815f3a9e68b49 100644 --- a/test/sot/skip_files_py312 +++ b/test/sot/skip_files_py312 @@ -3,14 +3,10 @@ ./test_12_for_loop.py ./test_14_operators.py ./test_15_slice.py -./test_17_paddle_layer.py ./test_21_global.py ./test_analysis_inputs.py ./test_break_graph.py -./test_builtin_map.py -./test_builtin_range.py ./test_builtin_zip.py -./test_enumerate.py ./test_guard_user_defined_fn.py ./test_inplace_api.py ./test_min_graph_size.py From 4f42c2b6cb26a7f2d6a0f7502f870578b48ddf44 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Fri, 23 Feb 2024 22:06:57 +0800 Subject: [PATCH 45/82] [Dy2St] Remove internal API `to_variable` (#61952) --- python/paddle/base/dygraph/__init__.py | 1 - python/paddle/base/dygraph/base.py | 121 ------------------ python/paddle/framework/__init__.py | 2 +- ...ransformer.py => name_load_transformer.py} | 71 ---------- .../jit/dy2static/transformers/transform.py | 7 +- test/dygraph_to_static/test_se_resnet.py | 5 +- 6 files changed, 8 insertions(+), 199 deletions(-) rename python/paddle/jit/dy2static/transformers/{basic_api_transformer.py => name_load_transformer.py} (64%) diff --git a/python/paddle/base/dygraph/__init__.py b/python/paddle/base/dygraph/__init__.py index fc77a5367c3fc..28a94ba061a0a 100644 --- a/python/paddle/base/dygraph/__init__.py +++ b/python/paddle/base/dygraph/__init__.py @@ -24,7 +24,6 @@ guard, no_grad, no_grad_, - to_variable, ) from .tracer import Tracer # noqa: F401 diff --git a/python/paddle/base/dygraph/base.py b/python/paddle/base/dygraph/base.py index 4f233cfe4d671..27b4e4ae675cb 100644 --- a/python/paddle/base/dygraph/base.py +++ b/python/paddle/base/dygraph/base.py @@ -16,14 +16,12 @@ import warnings import decorator -import numpy as np import paddle from paddle.base import core, framework from paddle.base.framework import global_var from paddle.base.multiprocess_utils import CleanupFuncRegistrar -from ..data_feeder import convert_dtype from ..framework import _get_paddle_place from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator from .tracer import Tracer @@ -837,122 +835,3 @@ def check_in_out(in_out_list, name): allow_unused, no_grad_vars, ) - - -@framework.dygraph_only -def to_variable(value, name=None, zero_copy=None, dtype=None): - r""" - :api_attr: imperative - - The API will create a ``Variable`` object from - tuple, list, numpy\.ndarray or Variable object. - - Parameters: - value(tuple|list|ndarray|Variable|Tensor): Initial data. - Can be a list, tuple, NumPy ndarray, Variable, Tensor. - The shape can be multi-dimensional. The data type is one of - numpy\.{float16, float32, float64, int16, int32, int64, - uint8, uint16, complex64, complex128}. - name(str, optional): The default value is None. 
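A note on the END_FOR fix in the SOT patch above: CPython 3.12 compiles every `for` loop with a trailing END_FOR instruction as the jump target of an exhausted FOR_ITER, which is why the executor must advance `_lasti` one extra instruction after simulating the loop. A small sketch that makes the new opcode visible (standard library only):

    import dis
    import sys

    def loop():
        for i in range(3):
            pass

    opnames = [ins.opname for ins in dis.get_instructions(loop)]
    # True on CPython 3.12; 3.11 and earlier do not emit this opcode.
    print(sys.version_info[:2], "END_FOR" in opnames)
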
Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . - zero_copy(bool, optional): Whether to share memory with the input numpy - array. This parameter only works with CPUPlace and will be set to - True when it is None. Default: None. (Note: zero_copy is discarded temporally for some reason.) - dtype(str, optional): The desired data type of returned ``Variable`` . - Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , - 'int32' , 'int64' , 'uint8' . Default: None. - - Returns: - Variable : If ``value`` is a tuple/list/numpy\.ndarray object, - return ``Tensor`` created from the corresponding numpy\.ndarray object, which has - same data type and shape with ``value``. - - - Examples: - - .. code-block:: python - - >>> import numpy as np - >>> import paddle.base as base - - >>> with base.dygraph.guard(base.CPUPlace()): - ... x = np.ones([2, 2], np.float32) - ... y = base.dygraph.to_variable(x, zero_copy=False) - ... x[0][0] = -1 - ... print(y[0][0].numpy()) - ... y = base.dygraph.to_variable(x) - ... x[0][0] = 0 - ... print(y[0][0].numpy()) - ... c = np.array([2+1j, 2]) - ... z = base.dygraph.to_variable(c) - ... print(z.numpy()) - ... print(z.dtype) - ... - ... y = base.dygraph.to_variable([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]) - ... print(y.shape) - ... - ... y = base.dygraph.to_variable(((0.1, 1.2), (2.2, 3.1), (4.9, 5.2)), dtype='int32') - ... print(y.shape) - 1 - -1 - [2.+1.j, 2.+0.j] - paddle.complex128 - [3, 2] - [3, 2] - """ - support_type = ( - list, - tuple, - np.ndarray, - core.eager.Tensor, - framework.Variable, - core.Tensor, - core.LoDTensor, - ) - if not isinstance(value, support_type): - raise TypeError( - "The type of 'value' in base.dygraph.to_variable must be {}, but received {}.".format( - support_type, type(value) - ) - ) - if isinstance(value, (core.eager.Tensor, framework.Variable)): - return value - elif isinstance(value, (core.Tensor, core.LoDTensor)): - return core.eager.Tensor(value) - else: - if isinstance( - framework._current_expected_place(), framework.core.CPUPlace - ): - # TODO(zhiqiu): we found two problems when enable zero_copy on CPUPlace. - # (1): eigen requires 16-bytes alignments, but the data of numpy array may not satisfy. - # Details: https://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html - # (2): when used in flask framework, it may result in hang. - # Details: https://github.com/PaddlePaddle/Paddle/issues/26635 - # So, we temporally disable the zero_copy strategy. - if zero_copy is True: - warnings.warn( - "Currently, zero_copy is not supported, and it will be discarded." 
- ) - zero_copy = False - else: - assert ( - not zero_copy - ), "zero_copy mode can only be used with CPUPlace" - - if not isinstance(value, np.ndarray): - value = np.array(value) - - if dtype is not None: - dtype = convert_dtype(dtype) - if value.dtype != dtype: - value = value.astype(dtype) - - return core.eager.Tensor( - value, - framework._current_expected_place(), - False, - zero_copy, - name if name else None, - True, - ) diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 4a6019705b0da..445ad10cd1f91 100755 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -23,7 +23,7 @@ IPUPlace, XPUPlace, ) -from ..base.dygraph import base, to_variable # noqa: F401 +from ..base.dygraph import base # noqa: F401 from ..base.dygraph.base import ( # noqa: F401 disable_dygraph as enable_static, enable_dygraph as disable_static, diff --git a/python/paddle/jit/dy2static/transformers/basic_api_transformer.py b/python/paddle/jit/dy2static/transformers/name_load_transformer.py similarity index 64% rename from python/paddle/jit/dy2static/transformers/basic_api_transformer.py rename to python/paddle/jit/dy2static/transformers/name_load_transformer.py index f3a2bdc1ab995..8e24cec812870 100644 --- a/python/paddle/jit/dy2static/transformers/basic_api_transformer.py +++ b/python/paddle/jit/dy2static/transformers/name_load_transformer.py @@ -21,46 +21,6 @@ __all__ = [] -class BasicApiTransformer(BaseTransformer): - """ - Class to transform basic API from dygraph to static graph. - """ - - def __init__(self, root): - self.root = root - - def transform(self): - to_tensor_transformer = ToTensorTransformer(self.root) - to_tensor_transformer.transform() - attribute_transformer = AttributeJstTransformer(self.root) - attribute_transformer.transform() - self.visit(self.root) - return self.root - - -class ToTensorTransformer(BaseTransformer): - """ - Class to transform paddle.to_tensor and paddle.to_variable to paddle.assign - """ - - def __init__(self, node): - assert isinstance( - node, gast.AST - ), "Input non-gast.AST node for the initialization of ToTensorTransformer." - self.root = node - - def transform(self): - self.visit(self.root) - return self.root - - def visit_Call(self, node): - assert isinstance(node, gast.Call) - if is_to_variable(node): - node = to_assign_node(node) - self.generic_visit(node) - return node - - class NameloadJstTransformer(BaseTransformer): """ change name and attribute load to __jst.Ld(name) pattern. @@ -168,34 +128,3 @@ def visit_Attribute(self, node): ) self.generic_visit(node) return node - - -def is_to_variable(node): - assert isinstance(node, gast.Call) - api_name = ast_to_source_code(node.func).strip() - - return api_name.split(".")[-1] == "to_variable" - - -def to_assign_node(node): - # Transform dygraph api `base.dygraph.to_variable` alias `paddle.to_tensor` to static api `paddle.assign`. - # NOTE: - # 1. Api `to_variable` supports data type {float16, float32, float64, int16, int32, int64, uint8, uint16}, - # but api `assign` only supports {float32, float64, int32, int64, bool}; - # 2. If the input of api `assign` is numpy.ndarray, its size cannot be greater than 1024 * 1024. 
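For downstream code that still calls the removed API, the migration is mechanical: paddle.to_tensor covers the eager-mode use documented in the deleted docstring, and the static-graph rewrite that `to_assign_node` performed corresponds to paddle.assign. A hedged sketch of the replacement (eager mode assumed):

    import numpy as np
    import paddle

    x = np.ones([2, 2], np.float32)

    # Removed internal API:  y = paddle.base.dygraph.to_variable(x)
    y = paddle.to_tensor(x)  # public replacement, same eager Tensor result

    # Roughly what the deleted AST rewrite used to emit for static graphs:
    z = paddle.assign(y)

    print(y.dtype, z.shape)  # paddle.float32 [2, 2]
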
- - assert isinstance(node, gast.Call) - assign_api = gast.parse('paddle.assign').body[0].value - node.func = assign_api - - if node.args: - node.args = [node.args[0]] - node.keywords = [] - else: - for idx, kw in enumerate(node.keywords): - if kw.arg == 'value' or kw.arg == 'data': - node.keywords[idx].arg = 'x' - node.keywords = [node.keywords[idx]] - node.args = [] - break - return node diff --git a/python/paddle/jit/dy2static/transformers/transform.py b/python/paddle/jit/dy2static/transformers/transform.py index b07f416a1af6e..9ae5edb3fb68e 100644 --- a/python/paddle/jit/dy2static/transformers/transform.py +++ b/python/paddle/jit/dy2static/transformers/transform.py @@ -23,7 +23,6 @@ from ..utils import ast_to_source_code from .assert_transformer import AssertTransformer from .base import BaseTransformer -from .basic_api_transformer import BasicApiTransformer, NameloadJstTransformer from .break_continue_transformer import ( BreakContinueTransformer, BreakTransformOptimizer, @@ -36,6 +35,10 @@ from .ifelse_transformer import IfElseTransformer from .logical_transformer import LogicalTransformer from .loop_transformer import LoopTransformer +from .name_load_transformer import ( + AttributeJstTransformer, + NameloadJstTransformer, +) from .return_transformer import ReturnTransformer from .tensor_shape_transformer import TensorShapeTransformer from .tensorhook_transformer import RegisterHookTransformer @@ -91,7 +94,7 @@ def transfer_from_node_type(self, node): transformers = [ RegisterHookTransformer, EarlyReturnTransformer, - BasicApiTransformer, # Basic Api + AttributeJstTransformer, # Tensor.size -> Tensor.size(), it's unnecessary in PIR mode TensorShapeTransformer, # Tensor.shape -> paddle.shape(Tensor) BreakContinueTransformer, # break/continue in loops ReturnTransformer, # return in functions diff --git a/test/dygraph_to_static/test_se_resnet.py b/test/dygraph_to_static/test_se_resnet.py index a9d11b2959994..113dde8dde3d3 100644 --- a/test/dygraph_to_static/test_se_resnet.py +++ b/test/dygraph_to_static/test_se_resnet.py @@ -29,7 +29,6 @@ import paddle from paddle import base -from paddle.base.dygraph.base import to_variable from paddle.jit.api import to_static from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn import BatchNorm, Linear @@ -403,8 +402,8 @@ def train(self, train_reader, to_static): .reshape(BATCH_SIZE, 1) ) - img = to_variable(dy_x_data) - label = to_variable(y_data) + img = paddle.to_tensor(dy_x_data) + label = paddle.to_tensor(y_data) label.stop_gradient = True pred, avg_loss, acc_top1, acc_top5 = se_resnext(img, label) From dc6071aa709b2f5b339e3f39a17f9d5b72c8d8d6 Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Sat, 24 Feb 2024 00:01:42 +0800 Subject: [PATCH 46/82] [PIR] fix keyword argument bug in executor. 
(#62006) --- paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index f82ec0cbcdf1d..469ab96a3c0cb 100644 --- a/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -2541,8 +2541,8 @@ void ProcessBlock( auto new_arg = new_block->AddKwarg(keyword, arg.type()); (*map_value_pair)[arg] = new_arg; if (auto dense_tensor_type = arg.type().dyn_cast()) { - new_arg.set_type(AllocatedDenseTensorType::get( - ctx, phi::CPUPlace(), dense_tensor_type)); + new_arg.set_type( + AllocatedDenseTensorType::get(ctx, phi::Place(), dense_tensor_type)); } } if (platform::is_gpu_place(place)) { From 9cc5bafa0dc8e032ce4adc5c6ee4f4547fd8883e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Sat, 24 Feb 2024 18:51:41 +0800 Subject: [PATCH 47/82] Add some empty cmakelist.txt to avoid upcoming conflicts (#62033) * add cmakelist * Update CMakeLists.txt * Update CMakeLists.txt --- test/cpp/fluid/CMakeLists.txt | 1 + test/cpp/fluid/platform/CMakeLists.txt | 2 ++ test/cpp/fluid/platform/device/CMakeLists.txt | 1 + test/cpp/fluid/platform/device/custom/CMakeLists.txt | 1 + test/cpp/fluid/platform/profiler/CMakeLists.txt | 1 + 5 files changed, 6 insertions(+) create mode 100644 test/cpp/fluid/platform/CMakeLists.txt create mode 100644 test/cpp/fluid/platform/device/CMakeLists.txt create mode 100644 test/cpp/fluid/platform/device/custom/CMakeLists.txt create mode 100644 test/cpp/fluid/platform/profiler/CMakeLists.txt diff --git a/test/cpp/fluid/CMakeLists.txt b/test/cpp/fluid/CMakeLists.txt index 6e006b16ad6ef..f49eefb4354d0 100644 --- a/test/cpp/fluid/CMakeLists.txt +++ b/test/cpp/fluid/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(memory) add_subdirectory(benchmark) add_subdirectory(framework) +add_subdirectory(platform) if(WITH_CINN) add_subdirectory(cinn) diff --git a/test/cpp/fluid/platform/CMakeLists.txt b/test/cpp/fluid/platform/CMakeLists.txt new file mode 100644 index 0000000000000..d57ed923b4a63 --- /dev/null +++ b/test/cpp/fluid/platform/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(device) +add_subdirectory(profiler) diff --git a/test/cpp/fluid/platform/device/CMakeLists.txt b/test/cpp/fluid/platform/device/CMakeLists.txt new file mode 100644 index 0000000000000..33311abc3d279 --- /dev/null +++ b/test/cpp/fluid/platform/device/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(custom) diff --git a/test/cpp/fluid/platform/device/custom/CMakeLists.txt b/test/cpp/fluid/platform/device/custom/CMakeLists.txt new file mode 100644 index 0000000000000..0a95e9a292a4d --- /dev/null +++ b/test/cpp/fluid/platform/device/custom/CMakeLists.txt @@ -0,0 +1 @@ +# Note(Liyulingyue): create an empty cmake file to avoid conflict diff --git a/test/cpp/fluid/platform/profiler/CMakeLists.txt b/test/cpp/fluid/platform/profiler/CMakeLists.txt new file mode 100644 index 0000000000000..0a95e9a292a4d --- /dev/null +++ b/test/cpp/fluid/platform/profiler/CMakeLists.txt @@ -0,0 +1 @@ +# Note(Liyulingyue): create an empty cmake file to avoid conflict From 7efc5235b34fdbd2bd74d8e3294c43c54a45c22e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Sun, 25 Feb 2024 17:58:06 +0800 Subject: [PATCH 48/82] =?UTF-8?q?=E3=80=90paddle=5Ftest=20No.36=E3=80=91re?= 
=?UTF-8?q?place=20cc=5Ftest=20with=20paddle=5Ftest=20(#62036)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * mv floder * cc2paddle --- paddle/fluid/platform/device/custom/CMakeLists.txt | 4 ---- test/cpp/fluid/platform/device/custom/CMakeLists.txt | 4 +++- .../cpp}/fluid/platform/device/custom/custom_device_test.cc | 0 3 files changed, 3 insertions(+), 5 deletions(-) rename {paddle => test/cpp}/fluid/platform/device/custom/custom_device_test.cc (100%) diff --git a/paddle/fluid/platform/device/custom/CMakeLists.txt b/paddle/fluid/platform/device/custom/CMakeLists.txt index c01bead7b03e6..023efe02e08bf 100644 --- a/paddle/fluid/platform/device/custom/CMakeLists.txt +++ b/paddle/fluid/platform/device/custom/CMakeLists.txt @@ -3,8 +3,4 @@ if(WITH_CUSTOM_DEVICE) custom_device_resource_pool SRCS custom_device_resource_pool.cc DEPS phi common glog enforce monitor) - cc_test( - custom_device_test - SRCS custom_device_test.cc - DEPS phi common gradient_accumulator) endif() diff --git a/test/cpp/fluid/platform/device/custom/CMakeLists.txt b/test/cpp/fluid/platform/device/custom/CMakeLists.txt index 0a95e9a292a4d..87f4a0ec50251 100644 --- a/test/cpp/fluid/platform/device/custom/CMakeLists.txt +++ b/test/cpp/fluid/platform/device/custom/CMakeLists.txt @@ -1 +1,3 @@ -# Note(Liyulingyue): create an empty cmake file to avoid conflict +if(WITH_CUSTOM_DEVICE) + paddle_test(custom_device_test SRCS custom_device_test.cc) +endif() diff --git a/paddle/fluid/platform/device/custom/custom_device_test.cc b/test/cpp/fluid/platform/device/custom/custom_device_test.cc similarity index 100% rename from paddle/fluid/platform/device/custom/custom_device_test.cc rename to test/cpp/fluid/platform/device/custom/custom_device_test.cc From e213188465e9d3b89ed29fedf98dbe7b846c9576 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Sun, 25 Feb 2024 23:18:08 +0800 Subject: [PATCH 49/82] add overlap p2p (#61935) --- .../framework/distributed_strategy.proto | 1 + .../fleet/meta_parallel/pipeline_parallel.py | 359 ++++++++++++----- .../pp_utils/p2p_communication.py | 360 +++++++++++++----- 3 files changed, 516 insertions(+), 204 deletions(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 2042a313c41e6..58460fcf9064b 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -82,6 +82,7 @@ message PpConfig { optional bool sharding_comm_overlap = 4 [ default = false ]; optional bool profiling = 5 [ default = false ]; optional bool release_gradients = 6 [ default = false ]; + optional bool overlap_p2p_comm = 7 [default = false]; } message DygraphShardingConfig { diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 384d89b4d9c12..e5233c87a199b 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -216,6 +216,12 @@ def __init__(self, layers, hcg, strategy): "sharding_configs" ].split_param + self._overlap_p2p_comm = self._strategy.hybrid_configs[ + "pp_configs" + ].overlap_p2p_comm + + self._batch_p2p_comm = not self._overlap_p2p_comm + logger.info( f"dp_comm_overlap {self._dp_comm_overlap}; \ sharding_comm_overlap {self._sharding_comm_overlap}; \ @@ -1229,12 +1235,21 @@ def _process_bwd_buffer(step_id, tensor): if not static_scheduler: 
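How the new proto field reaches user code: the pipeline scheduler reads it as hybrid_configs["pp_configs"].overlap_p2p_comm, so switching it on from a training script would look roughly like the sketch below. The exact nesting of the hybrid_configs dict is an assumption inferred from the reads in this diff, not a documented contract:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": 1,
        "mp_degree": 1,
        "pp_degree": 4,
        # assumed key layout, mirroring the pp_configs reads in this patch
        "pp_configs": {"overlap_p2p_comm": True},
    }
    fleet.init(is_collective=True, strategy=strategy)
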
self.input_tensors[0].append( self._p2p_helper.recv_forward( - self.is_pipeline_first_stage(), sync_recv=False + self.is_pipeline_first_stage(), + sync_recv=False, + batch_p2p_comm=self._batch_p2p_comm, ) ) + fwd_wait_handles = None + bwd_wait_handles = None + # run startup steps for micro_step in range(startup_steps): + if fwd_wait_handles is not None: + for req in fwd_wait_handles: + req.wait() + if static_scheduler: virtual_pp_rank = self._get_virtual_pp_rank( micro_step, forward=True @@ -1270,39 +1285,77 @@ def _process_bwd_buffer(step_id, tensor): if self.is_pipeline_last_stage(ignore_virtual=True): output_tensor = _process_fwd_buffer(micro_step, output_tensor) - # prepare for the first steady step - if ( - micro_step == (startup_steps - 1) - and (not forward_only) - and steady_steps - ): - input_tensor_grad = None - recv_next = True - if self.is_pipeline_last_stage(ignore_virtual=True): - recv_next = False + if not self._overlap_p2p_comm: + # prepare for the first steady step + if ( + micro_step == (startup_steps - 1) + and (not forward_only) + and steady_steps + ): + input_tensor_grad = None + recv_next = True + if self.is_pipeline_last_stage(ignore_virtual=True): + recv_next = False - # the last startup step needs on four direction comm to set up for steady 1f1b + # the last startup step needs on four direction comm to set up for steady 1f1b + ( + input_tensor, + output_tensor_grad, + ) = self._p2p_helper.send_forward_backward_recv_forward_backward( + output_tensor, + input_tensor_grad, + recv_prev=recv_prev, + recv_next=recv_next, + batch_p2p_comm=self._batch_p2p_comm, + ) + # output_tensor_grad is not none if recv_next + # append output_tensor_grad no matter none or not + self.output_tensor_grads[self.num_model_chunks - 1].append( + output_tensor_grad + ) + else: + input_tensor = self._p2p_helper.send_forward_recv_forward( + output_tensor, + recv_prev=recv_prev, + batch_p2p_comm=self._batch_p2p_comm, + ) + # append input_tensor no matter none or not + self.input_tensors[next_virtual_pp_rank].append(input_tensor) + else: ( input_tensor, - output_tensor_grad, - ) = self._p2p_helper.send_forward_backward_recv_forward_backward( + fwd_wait_handles, + ) = self._p2p_helper.send_forward_recv_forward( output_tensor, - input_tensor_grad, recv_prev=recv_prev, - recv_next=recv_next, + batch_p2p_comm=self._batch_p2p_comm, + overlap_p2p_comm=True, ) - # output_tensor_grad is not none if recv_next - # append output_tensor_grad no matter none or not - self.output_tensor_grads[self.num_model_chunks - 1].append( - output_tensor_grad - ) - else: - input_tensor = self._p2p_helper.send_forward_recv_forward( - output_tensor, recv_prev=recv_prev - ) - # append input_tensor no matter none or not - self.input_tensors[next_virtual_pp_rank].append(input_tensor) + if ( + micro_step == (startup_steps - 1) + and (not forward_only) + and steady_steps + ): + input_tensor_grad = None + recv_next = True + if self.is_pipeline_last_stage(ignore_virtual=True): + recv_next = False + ( + output_tensor_grad, + bwd_wait_handles, + ) = self._p2p_helper.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + batch_p2p_comm=self._batch_p2p_comm, + overlap_p2p_comm=True, + ) + self.output_tensor_grads[self.num_model_chunks - 1].append( + output_tensor_grad + ) + + # append input_tensor no matter none or not + self.input_tensors[next_virtual_pp_rank].append(input_tensor) self._release_output(output_tensor) # run 1f1b steady steps @@ -1339,85 +1392,186 @@ def _process_bwd_buffer(step_id, tensor): continue # 
forward forward_micro_step_id = micro_step + startup_steps - self._record_stamp("F", forward_micro_step_id, '"B"', forward=True) - output_tensor = self._forward_step_helper( - micro_dataset, forward_micro_step_id - ) - self._record_stamp("F", forward_micro_step_id, '"E"', forward=True) - # backward - backward_micro_step_id = micro_step - self._record_stamp( - "B", backward_micro_step_id, '"B"', forward=False - ) - input_tensor_grad = self._backward_step_helper( - backward_micro_step_id - ) - self._record_stamp( - "B", backward_micro_step_id, '"E"', forward=False - ) + if self._overlap_p2p_comm: + if fwd_wait_handles is not None: + for req in fwd_wait_handles: + req.wait() - # four directions comm - # send output tensor to downstream - # send input tensor grad to upstream - # recv input tensor from upstream - # recv output tensor grad from downstream + self._release_output(output_tensor) + output_tensor = self._forward_step_helper( + micro_dataset, forward_micro_step_id + ) - # last stage doesn't send rst to downstream - forward_virtual_pp_rank = self._get_virtual_pp_rank( - forward_micro_step_id, forward=True - ) - self.set_virtual_pipeline_rank(forward_virtual_pp_rank) - if self.is_pipeline_last_stage(ignore_virtual=True): - output_tensor = _process_fwd_buffer( - forward_micro_step_id, output_tensor + forward_virtual_pp_rank = self._get_virtual_pp_rank( + forward_micro_step_id, forward=True ) + self.set_virtual_pipeline_rank(forward_virtual_pp_rank) + if self.is_pipeline_last_stage(ignore_virtual=True): + output_tensor = _process_fwd_buffer( + forward_micro_step_id, output_tensor + ) - # first stage doesn't send grad to upstream - backward_virtual_pp_rank = self._get_virtual_pp_rank( - backward_micro_step_id, forward=False - ) - self.set_virtual_pipeline_rank(backward_virtual_pp_rank) - if self.is_pipeline_first_stage(ignore_virtual=True): - input_tensor_grad = _process_bwd_buffer( - backward_micro_step_id, input_tensor_grad + # determine whether to recv input tensor from upstream + recv_prev = True + if self.is_pipeline_first_stage(ignore_virtual=True): + next_forward_virtual_pp_rank = self._get_virtual_pp_rank( + forward_micro_step_id + 1, forward=True + ) + if next_forward_virtual_pp_rank == 0: + # next chunk is the first chunk, not need to pre recv an input tensor + recv_prev = False + else: + next_forward_virtual_pp_rank = self._get_virtual_pp_rank( + forward_micro_step_id + 1, forward=True + ) + + # last iteration doesn't need recv from upstream + if micro_step == (steady_steps - 1): + recv_prev = False + + # Send activation tensor to the next stage and receive activation tensor from the + # previous stage + ( + input_tensor, + fwd_wait_handles, + ) = self._p2p_helper.send_forward_recv_forward( + output_tensor, + recv_prev=recv_prev, + batch_p2p_comm=self._batch_p2p_comm, + overlap_p2p_comm=True, ) - # determine whether to recv input tensor from upstream - recv_prev = True - next_forward_virtual_pp_rank = self._get_virtual_pp_rank( - forward_micro_step_id + 1, forward=True - ) - if self.is_pipeline_first_stage(ignore_virtual=True) and ( - next_forward_virtual_pp_rank == 0 - ): - # first pp stage and first virtual stage - recv_prev = False + if bwd_wait_handles is not None: + for req in bwd_wait_handles: + req.wait() - # last iteration doesn't need recv from upstream - if micro_step == (steady_steps - 1): - recv_prev = False + # backward pass + backward_micro_step_id = micro_step + input_tensor_grad = self._backward_step_helper( + backward_micro_step_id + ) - # determine whether to 
recv grad from downstream - recv_next = True - next_backward_virtual_pp_rank = self._get_virtual_pp_rank( - backward_micro_step_id + 1, forward=False - ) - if self.is_pipeline_last_stage(ignore_virtual=True) and ( - next_backward_virtual_pp_rank == (self.num_model_chunks - 1) - ): - # last pp stage and last virtual stage - recv_next = False - - ( - input_tensor, - output_tensor_grad, - ) = self._p2p_helper.send_forward_backward_recv_forward_backward( - output_tensor, - input_tensor_grad, - recv_prev=recv_prev, - recv_next=recv_next, - ) + # first stage doesn't send grad to upstream + backward_virtual_pp_rank = self._get_virtual_pp_rank( + backward_micro_step_id, forward=False + ) + self.set_virtual_pipeline_rank(backward_virtual_pp_rank) + if self.is_pipeline_first_stage(ignore_virtual=True): + input_tensor_grad = _process_bwd_buffer( + backward_micro_step_id, input_tensor_grad + ) + + recv_next = True + if self.is_pipeline_last_stage(ignore_virtual=True): + next_backward_virtual_pp_rank = self._get_virtual_pp_rank( + backward_micro_step_id + 1, + forward=False, + ) + if next_backward_virtual_pp_rank == ( + self.num_model_chunks - 1 + ): + # next chunk is the last chunk, not need to pre recv an output tensor grad + recv_next = False + else: + next_backward_virtual_pp_rank = self._get_virtual_pp_rank( + backward_micro_step_id + 1, forward=False + ) + + ( + output_tensor_grad, + bwd_wait_handles, + ) = self._p2p_helper.send_backward_recv_backward( + input_tensor_grad, + recv_next=recv_next, + batch_p2p_comm=self._batch_p2p_comm, + overlap_p2p_comm=True, + ) + else: + self._record_stamp( + "F", forward_micro_step_id, '"B"', forward=True + ) + output_tensor = self._forward_step_helper( + micro_dataset, forward_micro_step_id + ) + self._record_stamp( + "F", forward_micro_step_id, '"E"', forward=True + ) + + # backward + backward_micro_step_id = micro_step + self._record_stamp( + "B", backward_micro_step_id, '"B"', forward=False + ) + input_tensor_grad = self._backward_step_helper( + backward_micro_step_id + ) + self._record_stamp( + "B", backward_micro_step_id, '"E"', forward=False + ) + + # four directions comm + # send output tensor to downstream + # send input tensor grad to upstream + # recv input tensor from upstream + # recv output tensor grad from downstream + + # last stage doesn't send rst to downstream + forward_virtual_pp_rank = self._get_virtual_pp_rank( + forward_micro_step_id, forward=True + ) + self.set_virtual_pipeline_rank(forward_virtual_pp_rank) + if self.is_pipeline_last_stage(ignore_virtual=True): + output_tensor = _process_fwd_buffer( + forward_micro_step_id, output_tensor + ) + + # first stage doesn't send grad to upstream + backward_virtual_pp_rank = self._get_virtual_pp_rank( + backward_micro_step_id, forward=False + ) + self.set_virtual_pipeline_rank(backward_virtual_pp_rank) + if self.is_pipeline_first_stage(ignore_virtual=True): + input_tensor_grad = _process_bwd_buffer( + backward_micro_step_id, input_tensor_grad + ) + + # determine whether to recv input tensor from upstream + recv_prev = True + next_forward_virtual_pp_rank = self._get_virtual_pp_rank( + forward_micro_step_id + 1, forward=True + ) + if self.is_pipeline_first_stage(ignore_virtual=True) and ( + next_forward_virtual_pp_rank == 0 + ): + # first pp stage and first virtual stage + recv_prev = False + + # last iteration doesn't need recv from upstream + if micro_step == (steady_steps - 1): + recv_prev = False + + # determine whether to recv grad from downstream + recv_next = True + 
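The scheduling idiom in this steady-state loop is the classic communication/computation overlap: post the sends and receives without waiting, run the next forward or backward step while the transfer is in flight, and only drain the wait handles right before the received tensor is consumed. Stripped of the pipeline bookkeeping, one step has roughly this shape (illustrative names, not the real Paddle API):

    def steady_step(comm_start, compute, pending):
        # Drain the previous step's transfer only now, just before its
        # received buffer is about to be used.
        if pending is not None:
            for req in pending:
                req.wait()
        recv_buf, handles = comm_start()  # non-blocking, returns wait handles
        out = compute()                   # overlaps with the in-flight transfer
        return out, recv_buf, handles     # handles are drained on the next call
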
next_backward_virtual_pp_rank = self._get_virtual_pp_rank( + backward_micro_step_id + 1, forward=False + ) + if self.is_pipeline_last_stage(ignore_virtual=True) and ( + next_backward_virtual_pp_rank == (self.num_model_chunks - 1) + ): + # last pp stage and last virtual stage + recv_next = False + + ( + input_tensor, + output_tensor_grad, + ) = self._p2p_helper.send_forward_backward_recv_forward_backward( + output_tensor, + input_tensor_grad, + recv_prev=recv_prev, + recv_next=recv_next, + batch_p2p_comm=self._batch_p2p_comm, + ) # append input_tensor no matter none or not self.input_tensors[next_forward_virtual_pp_rank].append( input_tensor @@ -1434,10 +1588,15 @@ def _process_bwd_buffer(step_id, tensor): # remaining backward steps if not forward_only: + if self._overlap_p2p_comm and bwd_wait_handles is not None: + for wait_handles in bwd_wait_handles: + wait_handles.wait() + # no steady steps, which only occurs when accumulate_step == num_stage if not steady_steps: output_tensor_grad = p2p.recv_backward( - self.is_pipeline_last_stage() + self.is_pipeline_last_stage(), + batch_p2p_comm=self._batch_p2p_comm, ) self.output_tensor_grads[self.num_model_chunks - 1].append( output_tensor_grad @@ -1482,7 +1641,9 @@ def _process_bwd_buffer(step_id, tensor): # append output_tensor_grad no matter none or not self.output_tensor_grads[next_backward_virtual_pp_rank].append( self._p2p_helper.send_backward_recv_backward( - input_tensor_grad, recv_next=recv_next + input_tensor_grad, + recv_next=recv_next, + batch_p2p_comm=self._batch_p2p_comm, ) ) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index 4566f89290fc0..6d470d541f66b 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -292,91 +292,33 @@ def batch_send_recv_on_calc_stream(p2p_op_list): op(tensor, comm_group, peer, nranks, rank_id) -def _process_p2p_tuple_or_tensor( +def _batch_p2p_tuple_or_tensor( tensors, p2p_func, pp_rank, pp_group, mp_degree=1, mp_rank=0 ): - ops = [] - if isinstance(tensors, tuple): - for tensor in tensors: - op = P2PonCalcStream( - p2p_func, tensor, pp_rank, pp_group, mp_degree, mp_rank - ) - ops.append(op) - else: - op = P2PonCalcStream( - p2p_func, tensors, pp_rank, pp_group, mp_degree, mp_rank - ) - ops.append(op) + if not isinstance(tensors, tuple): + tensors = (tensors,) + ops = [ + P2PonCalcStream(p2p_func, tensor, pp_rank, pp_group, mp_degree, mp_rank) + for tensor in tensors + ] return ops -def _p2p_helper( - tensor_send_next, - tensor_send_prev, - recv_prev, - recv_next, - sync_recv=True, - send_recv_meta=None, +def _batched_p2p_ops( + tensor_send_prev, tensor_recv_prev, tensor_send_next, tensor_recv_next, hcg ): - global _hcg - - tensor_recv_prev = None - tensor_recv_next = None - - # send / recv message - assert send_recv_meta is not None, "send_recv_meta should not be None" - recv_shape_msg = send_recv_meta.recv_shape_message - recv_dtype_msg = send_recv_meta.recv_dtype_message - recv_stop_gradient = send_recv_meta.recv_stop_gradient - - send_shape_msg = send_recv_meta.send_shape_message - send_dtype_msg = send_recv_meta.send_dtype_message - - # model parallel message - mp_group = _hcg.get_model_parallel_group() - mp_degree = _hcg.get_model_parallel_world_size() - mp_rank = _hcg.get_model_parallel_rank() - - if recv_prev: - if isinstance(recv_shape_msg, tuple): - 
tensor_recv_prev = [] - for idx, shape in enumerate(recv_shape_msg): - tmp = paddle.empty( - shape=shape, dtype=number_2_dtype(recv_dtype_msg[idx]) - ) - tmp.stop_gradient = recv_stop_gradient[idx] - tensor_recv_prev.append(tmp) - tensor_recv_prev = tuple(tensor_recv_prev) - else: - tensor_recv_prev = paddle.empty( - shape=recv_shape_msg, dtype=number_2_dtype(recv_dtype_msg) - ) - tensor_recv_prev.stop_gradient = recv_stop_gradient - - if recv_next: - if isinstance(send_shape_msg, tuple): - tensor_recv_next = [] - for idx, shape in enumerate(send_shape_msg): - tensor_recv_next.append( - paddle.empty( - shape=shape, dtype=number_2_dtype(send_dtype_msg[idx]) - ) - ) - tensor_recv_next = tuple(tensor_recv_next) - else: - tensor_recv_next = paddle.empty( - shape=send_shape_msg, dtype=number_2_dtype(send_dtype_msg) - ) - ops = [] - pipe_group = _hcg.get_pipe_parallel_group() + pipe_group = hcg.get_pipe_parallel_group() + mp_degree = hcg.get_model_parallel_world_size() + mp_rank = hcg.get_model_parallel_rank() + mp_group = hcg.get_model_parallel_group() # start to p2p communicate if not _sync_send: if tensor_send_prev is not None: - src_rank = _hcg._get_p2p_prev_rank() + src_rank = hcg._get_p2p_prev_rank() ops.extend( - _process_p2p_tuple_or_tensor( + _batch_p2p_tuple_or_tensor( tensor_send_prev, _send_on_calc_stream, src_rank, @@ -386,9 +328,9 @@ def _p2p_helper( ) ) if tensor_recv_prev is not None: - dst_rank = _hcg._get_p2p_prev_rank() + dst_rank = hcg._get_p2p_prev_rank() ops.extend( - _process_p2p_tuple_or_tensor( + _batch_p2p_tuple_or_tensor( tensor_recv_prev, _recv_on_calc_stream, dst_rank, @@ -398,9 +340,9 @@ def _p2p_helper( ) ) if tensor_send_next is not None: - src_rank = _hcg._get_p2p_next_rank() + src_rank = hcg._get_p2p_next_rank() ops.extend( - _process_p2p_tuple_or_tensor( + _batch_p2p_tuple_or_tensor( tensor_send_next, _send_on_calc_stream, src_rank, @@ -410,9 +352,9 @@ def _p2p_helper( ) ) if tensor_recv_next is not None: - dst_rank = _hcg._get_p2p_next_rank() + dst_rank = hcg._get_p2p_next_rank() ops.extend( - _process_p2p_tuple_or_tensor( + _batch_p2p_tuple_or_tensor( tensor_recv_next, _recv_on_calc_stream, dst_rank, @@ -427,9 +369,9 @@ def _p2p_helper( # When using this order, the environment variable # 'PADDLE_P2P_SYNC_SEND' should be set True if tensor_recv_prev is not None: - dst_rank = _hcg._get_p2p_prev_rank() + dst_rank = hcg._get_p2p_prev_rank() ops.extend( - _process_p2p_tuple_or_tensor( + _batch_p2p_tuple_or_tensor( tensor_recv_prev, _recv_on_calc_stream, dst_rank, @@ -439,9 +381,9 @@ def _p2p_helper( ) ) if tensor_send_next is not None: - src_rank = _hcg._get_p2p_next_rank() + src_rank = hcg._get_p2p_next_rank() ops.extend( - _process_p2p_tuple_or_tensor( + _batch_p2p_tuple_or_tensor( tensor_send_next, _send_on_calc_stream, src_rank, @@ -451,9 +393,9 @@ def _p2p_helper( ) ) if tensor_recv_next is not None: - dst_rank = _hcg._get_p2p_next_rank() + dst_rank = hcg._get_p2p_next_rank() ops.extend( - _process_p2p_tuple_or_tensor( + _batch_p2p_tuple_or_tensor( tensor_recv_next, _recv_on_calc_stream, dst_rank, @@ -463,9 +405,9 @@ def _p2p_helper( ) ) if tensor_send_prev is not None: - src_rank = _hcg._get_p2p_prev_rank() + src_rank = hcg._get_p2p_prev_rank() ops.extend( - _process_p2p_tuple_or_tensor( + _batch_p2p_tuple_or_tensor( tensor_send_prev, _send_on_calc_stream, src_rank, @@ -477,7 +419,6 @@ def _p2p_helper( if len(ops) > 0: batch_send_recv_on_calc_stream(ops) - if distutils.util.strtobool( os.getenv('FLAGS_p2p_device_synchronize', '0') ): @@ -506,7 +447,176 @@ 
def _p2p_helper( use_calc_stream=True, ) - return tensor_recv_prev, tensor_recv_next + +def _p2p_ops_tuple_or_tensor(tensors, p2p_func, pp_rank, pp_group): + if not isinstance(tensors, tuple): + tensors = (tensors,) + reqs = [] + for tensor in tensors: + reqs.append(p2p_func(tensor, pp_rank, pp_group)) + return reqs + + +def _p2p_ops( + tensor_send_prev, tensor_recv_prev, tensor_send_next, tensor_recv_next, hcg +): + reqs = [] + group = hcg.get_pipe_parallel_group() + if hcg.get_stage_id() % 2 == 0: + if tensor_send_next is not None: + reqs.extend( + _p2p_ops_tuple_or_tensor( + tensor_send_next, + paddle.distributed.isend, + hcg._get_p2p_next_rank(), + group, + ) + ) + if tensor_recv_prev is not None: + reqs.extend( + _p2p_ops_tuple_or_tensor( + tensor_recv_prev, + paddle.distributed.irecv, + hcg._get_p2p_prev_rank(), + group, + ) + ) + + if tensor_send_prev is not None: + reqs.extend( + _p2p_ops_tuple_or_tensor( + tensor_send_prev, + paddle.distributed.isend, + _hcg._get_p2p_prev_rank(), + group, + ) + ) + + if tensor_recv_next is not None: + reqs.extend( + _p2p_ops_tuple_or_tensor( + tensor_recv_next, + paddle.distributed.irecv, + hcg._get_p2p_next_rank(), + group, + ) + ) + else: + if tensor_recv_prev is not None: + reqs.extend( + _p2p_ops_tuple_or_tensor( + tensor_recv_prev, + paddle.distributed.irecv, + hcg._get_p2p_prev_rank(), + group, + ) + ) + if tensor_send_next is not None: + reqs.extend( + _p2p_ops_tuple_or_tensor( + tensor_send_next, + paddle.distributed.isend, + hcg._get_p2p_next_rank(), + group, + ) + ) + if tensor_recv_next is not None: + reqs.extend( + _p2p_ops_tuple_or_tensor( + tensor_recv_next, + paddle.distributed.irecv, + hcg._get_p2p_next_rank(), + group, + ) + ) + if tensor_send_prev is not None: + reqs.extend( + _p2p_ops_tuple_or_tensor( + tensor_send_prev, + paddle.distributed.isend, + hcg._get_p2p_prev_rank(), + group, + ) + ) + return reqs + + +def _p2p_helper( + tensor_send_next, + tensor_send_prev, + recv_prev, + recv_next, + sync_recv=True, + send_recv_meta=None, + batch_p2p_comm=True, + wait_on_reqs=True, +): + global _hcg + + tensor_recv_prev = None + tensor_recv_next = None + + # send / recv message + assert send_recv_meta is not None, "send_recv_meta should not be None" + recv_shape_msg = send_recv_meta.recv_shape_message + recv_dtype_msg = send_recv_meta.recv_dtype_message + recv_stop_gradient = send_recv_meta.recv_stop_gradient + + send_shape_msg = send_recv_meta.send_shape_message + send_dtype_msg = send_recv_meta.send_dtype_message + + # model parallel message + mp_group = _hcg.get_model_parallel_group() + mp_degree = _hcg.get_model_parallel_world_size() + mp_rank = _hcg.get_model_parallel_rank() + + if recv_prev: + if isinstance(recv_shape_msg, tuple): + tensor_recv_prev = [] + for idx, shape in enumerate(recv_shape_msg): + tmp = paddle.empty( + shape=shape, dtype=number_2_dtype(recv_dtype_msg[idx]) + ) + tmp.stop_gradient = recv_stop_gradient[idx] + tensor_recv_prev.append(tmp) + tensor_recv_prev = tuple(tensor_recv_prev) + else: + tensor_recv_prev = paddle.empty( + shape=recv_shape_msg, dtype=number_2_dtype(recv_dtype_msg) + ) + tensor_recv_prev.stop_gradient = recv_stop_gradient + + if recv_next: + if isinstance(send_shape_msg, tuple): + tensor_recv_next = [] + for idx, shape in enumerate(send_shape_msg): + tensor_recv_next.append( + paddle.empty( + shape=shape, dtype=number_2_dtype(send_dtype_msg[idx]) + ) + ) + tensor_recv_next = tuple(tensor_recv_next) + else: + tensor_recv_next = paddle.empty( + shape=send_shape_msg, 
dtype=number_2_dtype(send_dtype_msg) + ) + + p2p_func = _batched_p2p_ops if batch_p2p_comm else _p2p_ops + reqs = p2p_func( + tensor_send_prev, + tensor_recv_prev, + tensor_send_next, + tensor_recv_next, + _hcg, + ) + + # NOTE(shenliang03): batch_p2p_comm no need wait because of using calculate stream + if wait_on_reqs and not batch_p2p_comm and len(reqs) > 0: + for req in reqs: + req.wait() + reqs = None + + return tensor_recv_prev, tensor_recv_next, reqs class P2pHelper: @@ -527,7 +637,7 @@ def _recv_meta(self): self._send_recv_meta.recv_meta(_hcg.get_pipe_parallel_group()) self._send_recv_meta.has_recv_meta = self._use_cache - def recv_forward(self, pp_first_stage, sync_recv=True): + def recv_forward(self, pp_first_stage, sync_recv=True, batch_p2p_comm=True): global _timers if _timers is not None: _timers("recv_forward").start() @@ -536,38 +646,40 @@ def recv_forward(self, pp_first_stage, sync_recv=True): else: self._recv_meta() - input_tensor, _ = _p2p_helper( + input_tensor, _, _ = _p2p_helper( tensor_send_next=None, tensor_send_prev=None, recv_prev=True, recv_next=False, sync_recv=sync_recv, send_recv_meta=self._send_recv_meta, + batch_p2p_comm=batch_p2p_comm, ) if _timers is not None: _timers("recv_forward").stop() return input_tensor - def recv_backward(self, pp_last_stage, sync_recv=True): + def recv_backward(self, pp_last_stage, sync_recv=True, batch_p2p_comm=True): global _timers if _timers is not None: _timers("recv_backward").start() if pp_last_stage: output_tensor_grad = None else: - _, output_tensor_grad = _p2p_helper( + _, output_tensor_grad, _ = _p2p_helper( tensor_send_next=None, tensor_send_prev=None, recv_prev=False, recv_next=True, sync_recv=sync_recv, send_recv_meta=self._send_recv_meta, + batch_p2p_comm=batch_p2p_comm, ) if _timers is not None: _timers("recv_backward").stop() return output_tensor_grad - def send_forward(self, output_tensor, pp_last_stage): + def send_forward(self, output_tensor, pp_last_stage, batch_p2p_comm=True): global _timers if _timers is not None: _timers("send_forward").start() @@ -580,11 +692,14 @@ def send_forward(self, output_tensor, pp_last_stage): recv_prev=False, recv_next=False, send_recv_meta=self._send_recv_meta, + batch_p2p_comm=batch_p2p_comm, ) if _timers is not None: _timers("send_forward").stop() - def send_backward(self, input_tensor_grad, pp_first_stage): + def send_backward( + self, input_tensor_grad, pp_first_stage, batch_p2p_comm=True + ): global _timers if _timers is not None: _timers("send_backward").start() @@ -595,48 +710,60 @@ def send_backward(self, input_tensor_grad, pp_first_stage): recv_prev=False, recv_next=False, send_recv_meta=self._send_recv_meta, + batch_p2p_comm=batch_p2p_comm, ) if _timers is not None: _timers("send_backward").stop() - def send_forward_recv_backward(self, output_tensor, pp_last_stage): + def send_forward_recv_backward( + self, output_tensor, pp_last_stage, batch_p2p_comm=True + ): global _timers if _timers is not None: _timers("send_forward_recv_backward").start() if pp_last_stage: output_tensor_grad = None else: - _, output_tensor_grad = _p2p_helper( + _, output_tensor_grad, _ = _p2p_helper( tensor_send_next=output_tensor, tensor_send_prev=None, recv_prev=False, recv_next=True, send_recv_meta=self._send_recv_meta, + batch_p2p_comm=batch_p2p_comm, ) if _timers is not None: _timers("send_forward_recv_backward").stop() return output_tensor_grad - def send_backward_recv_forward(self, input_tensor_grad, pp_first_stage): + def send_backward_recv_forward( + self, input_tensor_grad, pp_first_stage, 
batch_p2p_comm=True + ): global _timers if _timers is not None: _timers("send_backward_recv_forward").start() if pp_first_stage: input_tensor = None else: - input_tensor, _ = _p2p_helper( + input_tensor, _, _ = _p2p_helper( tensor_send_next=None, tensor_send_prev=input_tensor_grad, recv_prev=True, recv_next=False, send_recv_meta=self._send_recv_meta, + batch_p2p_comm=batch_p2p_comm, ) if _timers is not None: _timers("send_backward_recv_forward").stop() return input_tensor def send_forward_backward_recv_forward_backward( - self, output_tensor, input_tensor_grad, recv_prev, recv_next + self, + output_tensor, + input_tensor_grad, + recv_prev, + recv_next, + batch_p2p_comm=True, ): # always have to send dtype info to downstream global _timers @@ -648,19 +775,26 @@ def send_forward_backward_recv_forward_backward( if recv_prev: self._recv_meta() - input_tensor, output_tensor_grad = _p2p_helper( + input_tensor, output_tensor_grad, _ = _p2p_helper( tensor_send_next=output_tensor, tensor_send_prev=input_tensor_grad, recv_prev=recv_prev, recv_next=recv_next, sync_recv=False, send_recv_meta=self._send_recv_meta, + batch_p2p_comm=batch_p2p_comm, ) if _timers is not None: _timers("send_forward_backward_recv_forward_backward").stop() return input_tensor, output_tensor_grad - def send_forward_recv_forward(self, output_tensor, recv_prev): + def send_forward_recv_forward( + self, + output_tensor, + recv_prev, + batch_p2p_comm=True, + overlap_p2p_comm=False, + ): # always have to send dtype info to downstream global _timers if _timers is not None: @@ -672,32 +806,48 @@ def send_forward_recv_forward(self, output_tensor, recv_prev): if recv_prev: self._recv_meta() - input_tensor, _ = _p2p_helper( + input_tensor, _, wait_handles = _p2p_helper( tensor_send_next=output_tensor, tensor_send_prev=None, recv_prev=recv_prev, recv_next=False, sync_recv=False, send_recv_meta=self._send_recv_meta, + batch_p2p_comm=batch_p2p_comm, + wait_on_reqs=(not overlap_p2p_comm), ) if _timers is not None: _timers("send_forward_recv_forward").stop() + + if overlap_p2p_comm: + return input_tensor, wait_handles return input_tensor - def send_backward_recv_backward(self, input_tensor_grad, recv_next): + def send_backward_recv_backward( + self, + input_tensor_grad, + recv_next, + batch_p2p_comm=True, + overlap_p2p_comm=False, + ): global _timers if _timers is not None: _timers("send_backward_recv_backward").start() - _, output_tensor_grad = _p2p_helper( + _, output_tensor_grad, wait_handles = _p2p_helper( tensor_send_next=None, tensor_send_prev=input_tensor_grad, recv_prev=False, recv_next=recv_next, sync_recv=False, send_recv_meta=self._send_recv_meta, + batch_p2p_comm=batch_p2p_comm, + wait_on_reqs=(not overlap_p2p_comm), ) if _timers is not None: _timers("send_backward_recv_backward").stop() + + if overlap_p2p_comm: + return output_tensor_grad, wait_handles return output_tensor_grad def __repr__(self): From 89902e0db4ee704f29815d40bc6d151c87abfa71 Mon Sep 17 00:00:00 2001 From: lanxianghit <47554610+lanxianghit@users.noreply.github.com> Date: Mon, 26 Feb 2024 10:10:53 +0800 Subject: [PATCH 50/82] [PIR][DynamicShape] Add InferSymbolicShape for if op (#61937) * Add InferSymbolicShape for if op --- paddle/fluid/pir/transforms/shape_optimization_pass.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pir/transforms/shape_optimization_pass.cc b/paddle/fluid/pir/transforms/shape_optimization_pass.cc index 69377af0d30b5..80d56f75ae12b 100644 --- a/paddle/fluid/pir/transforms/shape_optimization_pass.cc +++ 
b/paddle/fluid/pir/transforms/shape_optimization_pass.cc @@ -128,8 +128,9 @@ void InferSymExprForBlock(const Block& block, op.dyn_cast(); if (infer_symbolic_shape_interface) { VLOG(vlog_level) << op.name() << " has InferSymbolicShapeInterface."; - PADDLE_ENFORCE( + PADDLE_ENFORCE_EQ( infer_symbolic_shape_interface.InferSymbolicShape(shape_analysis), + true, "InferSymbolicShape for %s failed.", op.name()); if (op.num_results() > 0) { From 60405397acf3e9b7de3d94556f06db7f97d7f19d Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 26 Feb 2024 10:23:46 +0800 Subject: [PATCH 51/82] Fix GetErrorSumaryString (#61997) --- paddle/cinn/utils/error.h | 8 ++++---- paddle/common/enforce.h | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/cinn/utils/error.h b/paddle/cinn/utils/error.h index b0cd09f0c2b0e..7b5af324d7081 100644 --- a/paddle/cinn/utils/error.h +++ b/paddle/cinn/utils/error.h @@ -55,9 +55,9 @@ inline std::string demangle(std::string name) { inline std::string demangle(std::string name) { return name; } #endif -static std::string GetErrorSumaryString(const std::string& what, - const char* file, - int line) { +static std::string GetErrorSummaryString(const std::string& what, + const char* file, + int line) { std::ostringstream sout; sout << "\n----------------------\nError Message " "Summary:\n----------------------\n"; @@ -99,7 +99,7 @@ static std::string GetCurrentTraceBackString() { static std::string GetTraceBackString(const std::string& what, const char* file, int line) { - return GetCurrentTraceBackString() + GetErrorSumaryString(what, file, line); + return GetCurrentTraceBackString() + GetErrorSummaryString(what, file, line); } struct EnforceNotMet : public std::exception { diff --git a/paddle/common/enforce.h b/paddle/common/enforce.h index e7584ee98be23..856cf28d0221a 100644 --- a/paddle/common/enforce.h +++ b/paddle/common/enforce.h @@ -118,9 +118,9 @@ TEST_API int GetCallStackLevel(); TEST_API std::string SimplifyErrorTypeFormat(const std::string& str); TEST_API std::string GetCurrentTraceBackString(bool for_signal = false); template -static std::string GetErrorSumaryString(StrType&& what, - const char* file, - int line) { +static std::string GetErrorSummaryString(StrType&& what, + const char* file, + int line) { std::ostringstream sout; if (GetCallStackLevel() > 1) { sout << "\n----------------------\nError Message " @@ -139,9 +139,9 @@ static std::string GetTraceBackString(StrType&& what, if (GetCallStackLevel() > 1) { // FLAGS_call_stack_level>1 means showing c++ call stack return ::common::enforce::GetCurrentTraceBackString() + - GetErrorSumaryString(what, file, line); + GetErrorSummaryString(what, file, line); } else { - return GetErrorSumaryString(what, file, line); + return GetErrorSummaryString(what, file, line); } } From 4daec8acf8bbb052bb7ece2aa8f4051dcbaac723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 26 Feb 2024 10:27:30 +0800 Subject: [PATCH 52/82] =?UTF-8?q?=E3=80=90paddle=5Ftest=20No.40=E3=80=91re?= =?UTF-8?q?place=20of=20cc=5Ftest=20with=20paddle=5Ftest=20(#61945)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update CMakeLists.txt * mv cc file * add TEST_API * delete use_op_itself * Update test_reference_count_pass_last_lived_ops.cc * Update CMakeLists.txt --- .../ir/memory_optimize_pass/CMakeLists.txt | 11 - ...est_reference_count_pass_last_lived_ops.cc | 7 - 
paddle/fluid/framework/parallel_executor.h | 20 +- test/cpp/fluid/framework/CMakeLists.txt | 2 + test/cpp/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/memory_optimize_pass/CMakeLists.txt | 14 ++ .../share_varinfo_into_cinn_pass_test.cc | 154 ++++++++++++ ...est_reference_count_pass_last_lived_ops.cc | 228 ++++++++++++++++++ 8 files changed, 409 insertions(+), 28 deletions(-) create mode 100644 test/cpp/fluid/framework/ir/CMakeLists.txt create mode 100644 test/cpp/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt create mode 100644 test/cpp/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc create mode 100644 test/cpp/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index 85923aafc23a7..222fef33c5ea6 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -41,11 +41,6 @@ if(WITH_CINN) share_varinfo_into_cinn_pass SRCS share_varinfo_into_cinn_pass.cc DEPS pass enforce common graph_helper) - cc_test( - share_varinfo_into_cinn_pass_test - SRCS share_varinfo_into_cinn_pass_test.cc - DEPS share_varinfo_into_cinn_pass parallel_executor elementwise_add_op - mul_op cinn_launch_op) list(APPEND EAGER_DELETETION_PASS_DEPS share_varinfo_into_cinn_pass) endif() @@ -72,9 +67,3 @@ cc_library( inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass) - -cc_test( - test_reference_count_pass_last_lived_ops - SRCS test_reference_count_pass_last_lived_ops.cc - DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op phi - common) diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index fc2173f36316d..eeec6fd8788d4 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -23,13 +23,6 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/phi/core/kernel_registry.h" -USE_OP_ITSELF(scale); -USE_OP_ITSELF(elementwise_mul); -USE_OP_ITSELF(elementwise_add); -USE_OP_ITSELF(elementwise_add_grad); - -PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); - COMMON_DECLARE_double(eager_delete_tensor_gb); namespace paddle { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 32514089763c6..29df757d17c8a 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -52,14 +52,14 @@ class ParallelExecutor { DISABLE_COPY_AND_ASSIGN(ParallelExecutor); public: - explicit ParallelExecutor(const std::vector &places, - const std::vector &bcast_vars, - const std::string &loss_var_name, - Scope *scope, - const std::vector &local_scopes, - const ExecutionStrategy &exec_strategy, - const BuildStrategy &build_strategy, - ir::Graph *graph); + TEST_API explicit ParallelExecutor(const std::vector &places, + const std::vector &bcast_vars, + const std::string &loss_var_name, + Scope *scope, + const std::vector &local_scopes, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + ir::Graph *graph); // NOTE(Aurelius84): Construct a PE running on single device for @to_static explicit 
ParallelExecutor(const platform::Place &place, @@ -68,7 +68,7 @@ class ParallelExecutor { const BuildStrategy &build_strategy, ir::Graph *graph); - ~ParallelExecutor(); + TEST_API ~ParallelExecutor(); size_t DeviceCount() const; @@ -98,7 +98,7 @@ class ParallelExecutor { void ResetOpHandleScopeMapOfGraphs( const std::unordered_map &scope_map); - const ir::Graph &Graph() const; + TEST_API const ir::Graph &Graph() const; void PrepareVariables(Scope *scope); void SkipMemoryReuse(size_t scope_idx, diff --git a/test/cpp/fluid/framework/CMakeLists.txt b/test/cpp/fluid/framework/CMakeLists.txt index 5e0e7404f6999..8e1686b242993 100644 --- a/test/cpp/fluid/framework/CMakeLists.txt +++ b/test/cpp/fluid/framework/CMakeLists.txt @@ -346,3 +346,5 @@ cc_test( workqueue_test SRCS new_executor/workqueue_test.cc DEPS standalone_executor) + +add_subdirectory(ir) diff --git a/test/cpp/fluid/framework/ir/CMakeLists.txt b/test/cpp/fluid/framework/ir/CMakeLists.txt new file mode 100644 index 0000000000000..81a68ccb22f83 --- /dev/null +++ b/test/cpp/fluid/framework/ir/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(memory_optimize_pass) diff --git a/test/cpp/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/test/cpp/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt new file mode 100644 index 0000000000000..841ebd7c0fcc0 --- /dev/null +++ b/test/cpp/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -0,0 +1,14 @@ +if(WITH_CINN) + paddle_test(share_varinfo_into_cinn_pass_test SRCS + share_varinfo_into_cinn_pass_test.cc) + list(APPEND EAGER_DELETETION_PASS_DEPS share_varinfo_into_cinn_pass) +endif() + +paddle_test(test_reference_count_pass_last_lived_ops SRCS + test_reference_count_pass_last_lived_ops.cc DEPS common) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some C++ tests on Windows, since the test will + # be built only in CI, so we assume the generator on Windows is Ninja. + copy_onnx(test_reference_count_pass_last_lived_ops) +endif() diff --git a/test/cpp/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc b/test/cpp/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc new file mode 100644 index 0000000000000..1f78e293a21a3 --- /dev/null +++ b/test/cpp/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass_test.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
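+ +// Checks that share_varinfo_into_cinn_pass shares MemOptVarInfo between the +// main graph and the subgraph held by CinnCompiler: var infos for the +// cinn_launch op's inputs/outputs are exported into the subgraph, and the +// subgraph's own var infos pick up parent holders from the main graph.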
+ +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/program_desc.h" + +USE_OP_ITSELF(mul); +USE_OP_ITSELF(elementwise_add); + +USE_OP_ITSELF(cinn_launch); +PD_DECLARE_KERNEL(cinn_launch, CPU, ALL_LAYOUT); +#ifdef PADDLE_WITH_CUDA +PD_DECLARE_KERNEL(cinn_launch, GPU, ALL_LAYOUT); +#endif + +namespace paddle::framework { + +using Name2VarInfoMap = + std::unordered_map>; + +static ProgramDesc BuildProgramInsideCinnLaunchOp() { + ProgramDesc program; + auto* block = program.MutableBlock(0); + block->Var("var1"); + block->Var("var2"); + block->Var("var3"); + block->Var("var4"); + block->Var("var5"); + + auto add_op = + std::unique_ptr(new OpDesc("elementwise_add", + {{"X", {"var1"}}, {"Y", {"var2"}}}, + {{"Out", {"var3"}}}, + {})); + block->AppendAllocatedOp(std::move(add_op)); + auto mul_op = std::unique_ptr(new OpDesc( + "mul", {{"X", {"var3"}}, {"Y", {"var4"}}}, {{"Out", {"var5"}}}, {})); + block->AppendAllocatedOp(std::move(mul_op)); + return program; +} + +static ProgramDesc BuildProgramWithCinnLaunchOp(int64_t compilation_key) { + // create a cinn_launch op + ProgramDesc program; + auto* block = program.MutableBlock(0); + block->Var("var1"); + block->Var("var2"); + block->Var("var4"); + block->Var("var5"); + + auto cinn_launch_op = std::unique_ptr( + new OpDesc("cinn_launch", + {{"X", {"var1", "var2", "var4"}}}, + {{"Out", {"var5"}}}, + {{"compilation_key", compilation_key}})); + block->AppendAllocatedOp(std::move(cinn_launch_op)); + return program; +} + +struct TestPassContext { + explicit TestPassContext(const ProgramDesc& program) { + graph = std::make_unique(program); + details::BuildStrategy build_strategy; + details::ExecutionStrategy exec_strategy; + exec_strategy.use_device_ = paddle::platform::kCUDA; + executor.reset(new ParallelExecutor(platform::CUDAPlace(0), + &scope, + exec_strategy, + build_strategy, + graph.get())); + } + + Scope scope; + std::unique_ptr graph; + std::unique_ptr executor; +}; + +TEST(ShareMemInfoToSubGraphPassTest, test_main_graph_share_varinfo) { + // add a subgraph to CinnCompiler + auto subgraph = std::make_unique(BuildProgramInsideCinnLaunchOp()); + subgraph->GetOrInit( + paddle2cinn::kMemOptVarInfoFromMainGraph); + auto compilation_key = + paddle2cinn::CinnCompiler::GetInstance()->AddGraph(std::move(subgraph)); + + // build test data and apply pass + auto context = std::make_unique( + BuildProgramWithCinnLaunchOp(compilation_key)); + + // check result + const ir::Graph& result_subgraph = + paddle2cinn::CinnCompiler::GetInstance()->FindGraph(compilation_key); + const auto& dst_varinfo_map = result_subgraph.Get( + paddle2cinn::kMemOptVarInfoFromMainGraph); + ASSERT_EQ(dst_varinfo_map.size(), 4); + EXPECT_EQ(dst_varinfo_map.count("var1"), 1); + EXPECT_EQ(dst_varinfo_map.count("var5"), 1); + EXPECT_EQ(dst_varinfo_map.at("var1").use_count(), 2); + EXPECT_EQ(dst_varinfo_map.at("var5").use_count(), 2); +} + +TEST(ShareMemInfoToSubGraphPassTest, test_subgraph_take_varinfo) { + // build test data and apply pass + auto context = + 
std::make_unique(BuildProgramInsideCinnLaunchOp()); + auto& varinfo_map_shared = context->graph->GetOrInit( + paddle2cinn::kMemOptVarInfoFromMainGraph); + varinfo_map_shared = { + {"var1", std::make_shared("var1", 1)}, + {"var2", std::make_shared("var2", 2)}, + }; + + ir::MemOptVarInfoMapList varinfo_maps(1); + auto& dst_varinfo_map = varinfo_maps.front(); + dst_varinfo_map = {{"var1", std::make_shared("var1", 1)}, + {"var2", std::make_shared("var2", 1)}, + {"var3", std::make_shared("var3", 1)}, + {"var4", std::make_shared("var4", 1)}, + {"var5", std::make_shared("var5", 1)}}; + auto share_pass = + ir::PassRegistry::Instance().Get("share_varinfo_into_cinn_pass"); + share_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &varinfo_maps); + share_pass->Apply(context->graph.get()); + + // check result + ASSERT_NE(dst_varinfo_map.at("var1")->ParentHolder(), nullptr); + ASSERT_NE(dst_varinfo_map.at("var2")->ParentHolder(), nullptr); + ASSERT_EQ(dst_varinfo_map.at("var3")->ParentHolder(), nullptr); + ASSERT_EQ(dst_varinfo_map.at("var4")->ParentHolder(), nullptr); + ASSERT_EQ(dst_varinfo_map.at("var5")->ParentHolder(), nullptr); +} + +} // namespace paddle::framework diff --git a/test/cpp/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/test/cpp/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc new file mode 100644 index 0000000000000..eeec6fd8788d4 --- /dev/null +++ b/test/cpp/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -0,0 +1,228 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
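+ +// Exercises reference_count_pass, which records for every variable the last +// operators that touch it (ir::kLastLiveOpsOfVars); the eager-deletion pass +// relies on this to free each variable's memory as early as possible. The +// assertions below pin down the expected last-lived ops of a small program.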
+ +#include "gtest/gtest.h" +#include "paddle/common/flags.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" + +COMMON_DECLARE_double(eager_delete_tensor_gb); + +namespace paddle { +namespace framework { +namespace p = paddle::platform; + +static std::vector CreatePlaces(size_t num, bool use_cuda) { + std::vector result; + result.reserve(num); + for (size_t i = 0; i < num; ++i) { + if (use_cuda) { + result.emplace_back(platform::CUDAPlace(static_cast(i))); + } else { + result.emplace_back(platform::CPUPlace()); + } + } + return result; +} + +static void NewVar(BlockDesc *block, + const std::string &name, + const std::vector &shape) { + auto *var_desc = block->Var(name); + var_desc->SetShape(shape); +} + +static void AppendOp(BlockDesc *block, + const std::string &type, + VariableNameMap inputs, + VariableNameMap outputs, + AttributeMap attrs) { + auto &op_info = OpInfoMap::Instance().Get(type); + if (op_info.Checker()) { + op_info.Checker()->Check(&attrs); + } + + auto *op = block->AppendOp(); + op->SetType(type); + for (auto &pair : inputs) { + op->SetInput(pair.first, pair.second); + } + + for (auto &pair : outputs) { + op->SetOutput(pair.first, pair.second); + for (auto &var_name : pair.second) { + if (!block->FindVarRecursive(var_name)) { + NewVar(block, var_name, {}); + } + } + } + + op->SetAttrMap(attrs); + op->InferVarType(block); + op->InferShape(*block); +} + +class ReferenceCountPassTestHelper { + public: + ReferenceCountPassTestHelper(const ProgramDesc &program, bool use_cuda) + : graph_(program) { + details::BuildStrategy build_strategy; + build_strategy.enable_inplace_ = false; + build_strategy.memory_optimize_ = false; + FLAGS_eager_delete_tensor_gb = -1; + + details::ExecutionStrategy exec_strategy; + exec_strategy.use_device_ = use_cuda ? 
p::kCUDA : p::kCPU; + + executor_ = std::make_unique(CreatePlaces(1, use_cuda), + std::vector(), + "", + &scope_, + std::vector(), + exec_strategy, + build_strategy, + &graph_); + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); + ref_cnt_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars_); + ref_cnt_pass->Apply(&const_cast(executor_->Graph())); + } + + bool IsLastLivedOps(const std::string &name, + std::vector ops) const { + std::sort(ops.begin(), ops.end()); + return LastLivedOpTypes(name) == ops; + } + + std::vector LastLivedOps(const std::string &name) const { + auto &ops = last_live_ops_of_vars_[0].at(name).ops(); + std::vector ret; + ret.reserve(ops.size()); + for (auto *op : ops) { + ret.emplace_back(op->GetOp()); + } + return ret; + } + + private: + std::vector LastLivedOpTypes(const std::string &name) const { + auto iter = last_live_ops_of_vars_[0].find(name); + std::vector ret; + if (iter != last_live_ops_of_vars_[0].end()) { + for (auto *op : iter->second.ops()) { + ret.emplace_back(op->GetOp()->Type()); + } + } + std::sort(ret.begin(), ret.end()); + return ret; + } + + private: + ir::Graph graph_; + Scope scope_; + std::unique_ptr executor_; + + ir::MemOptVarInfoMapList mem_opt_var_infos_; + std::vector last_live_ops_of_vars_; +}; + +TEST(test_reference_count_pass, test_no_need_buffer_var_shrink) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{{3, 4, 5}}; + + /** + * The network is: + * + * x0 = fluid.layer.data(...) + * x1 = scale(x0, scale=1) + * x2 = scale(x1, scale=2) + * x3 = elementwise_mul(x1, x2) + * scale(x3, out=x1, scale=3) # produce a new version of x1 + * x4, x5 = elementwise_add_grad(dout=x3, x=x2, y=x1) + * x6 = elementwise_mul(x4, x5) + * x7 = elementwise_add(x5, x5) + */ + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + std::string x3 = "x3"; + std::string x4 = "x4"; + std::string x5 = "x5"; + std::string x6 = "x6"; + std::string x7 = "x7"; + + NewVar(block, x0, shape); + AppendOp(block, "scale", {{"X", {x0}}}, {{"Out", {x1}}}, {{"scale", 1.0f}}); + AppendOp(block, "scale", {{"X", {x1}}}, {{"Out", {x2}}}, {{"scale", 2.0f}}); + AppendOp(block, + "elementwise_mul", + {{"X", {x1}}, {"Y", {x2}}}, + {{"Out", {x3}}}, + {}); + AppendOp(block, "scale", {{"X", {x3}}}, {{"Out", {x1}}}, {{"scale", 3.0f}}); + AppendOp(block, + "elementwise_add_grad", + {{GradVarName("Out"), {x3}}, {"X", {x2}}, {"Y", {x1}}}, + {{GradVarName("X"), {x4}}, {GradVarName("Y"), {x5}}}, + {}); + AppendOp(block, + "elementwise_mul", + {{"X", {x4}}, {"Y", {x5}}}, + {{"Out", {x6}}}, + {}); + AppendOp(block, + "elementwise_add", + {{"X", {x5}}, {"Y", {x5}}}, + {{"Out", {x7}}}, + {}); + + std::vector use_cuda_list{false}; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + use_cuda_list.push_back(true); +#endif + for (auto use_cuda : use_cuda_list) { + ReferenceCountPassTestHelper helper(program, use_cuda); + ASSERT_TRUE(helper.IsLastLivedOps(x0, {"scale"})); + ASSERT_EQ(PADDLE_GET_CONST(float, + helper.LastLivedOps(x0)[0]->Attrs().at("scale")), + 1.0f); + + ASSERT_TRUE(helper.IsLastLivedOps(x1, {"scale"})); + ASSERT_EQ(PADDLE_GET_CONST(float, + helper.LastLivedOps(x1)[0]->Attrs().at("scale")), + 3.0f); + + ASSERT_TRUE(helper.IsLastLivedOps(x2, {"elementwise_mul"})); + ASSERT_TRUE(helper.IsLastLivedOps(x3, {"elementwise_add_grad"})); + + ASSERT_TRUE(helper.IsLastLivedOps(x4, {"elementwise_mul"})); + 
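+ // x5 feeds both elementwise_mul (producing x6) and elementwise_add + // (producing x7), so both ops count as its last-lived users.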
ASSERT_TRUE( + helper.IsLastLivedOps(x5, {"elementwise_mul", "elementwise_add"})); + + ASSERT_TRUE(helper.IsLastLivedOps(x6, {"elementwise_mul"})); + ASSERT_TRUE(helper.IsLastLivedOps(x7, {"elementwise_add"})); + } +} + +} // namespace framework +} // namespace paddle From a6e926856b2ac84b8635ccbde66ae6c14d5a9cc7 Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 26 Feb 2024 10:29:13 +0800 Subject: [PATCH 53/82] Fix some typos (pendding_ops, altertively, etc.) (#61765) --- python/paddle/base/backward.py | 10 ++++----- python/paddle/base/executor.py | 6 ++--- .../incubate/checkpoint/auto_checkpoint.py | 2 +- python/paddle/nn/layer/layers.py | 22 +++++++++---------- test/dygraph_to_static/test_legacy_error.py | 2 +- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/paddle/base/backward.py b/python/paddle/base/backward.py index 75e5899afdece..9f39d9c3ea03f 100755 --- a/python/paddle/base/backward.py +++ b/python/paddle/base/backward.py @@ -830,7 +830,7 @@ class Var: def __init__(self, var_name): self.var_name = var_name self.gen_op = None - self.pendding_ops = [] + self.pending_ops = [] def set_gen_op(self, gen_op): assert isinstance(gen_op, Op) @@ -839,7 +839,7 @@ def set_gen_op(self, gen_op): def add_pending_op(self, op): assert isinstance(op, Op) - self.pendding_ops.append(op) + self.pending_ops.append(op) class Op: def __init__(self, op_desc): @@ -916,8 +916,8 @@ def _create_op_node(op_desc): op_node = candidate_ops.pop(0) if _all_in_set_(op_node.inputs, ready_vars): for out_var in op_node.outputs: - candidate_ops.extend(out_var.pendding_ops) - op_list.extend(out_var.pendding_ops) + candidate_ops.extend(out_var.pending_ops) + op_list.extend(out_var.pending_ops) ready_vars.update(op_node.outputs) else: remove_ops = False @@ -1571,7 +1571,7 @@ def find_op_index(block_desc, cur_op_desc): # NOTE: In primitive mode, the intermediate variable generated by # decompositing raw grad op are not satisfied the rule of 'XX@GRAD', # which will cause it be pruned according to current pruning logic. - # For simplicity, we treate all prmitive operators as one raw + # For simplicity, we treat all primitive operators as one raw # operator, and keep the pruning logic consistent with currently # logic. The drawback of this solution is may lead to some primitive # operators are not pruned, which is needed to fixed. diff --git a/python/paddle/base/executor.py b/python/paddle/base/executor.py index 6fe3f71d481ca..3162d27e05059 100755 --- a/python/paddle/base/executor.py +++ b/python/paddle/base/executor.py @@ -739,7 +739,7 @@ def _as_lodtensor(data, place, dtype=None): data = np.array(data) if data.dtype == np.object_: raise TypeError( - "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually " + "\n\tFailed to convert input data to a regular ndarray :\n\t* Usually " "this means the input data contains nested lists with different lengths. " "Please consider using 'base.create_lod_tensor' to convert it to a LoD-Tensor." ) @@ -1675,7 +1675,7 @@ def run( needed to generate :code:`fetch_list` will be pruned. The default is False, which means the program will not pruned and all the operators and variables will be executed during running. Note that if the tuple returned from :code:`Optimizer.minimize()` is passed to :code:`fetch_list`, - :code:`use_prune` will be overrided to True, and the program will be pruned. + :code:`use_prune` will be overridden to True, and the program will be pruned. 
Returns: @@ -1880,7 +1880,7 @@ def _run_impl( if scope is None: scope = global_scope() - # use_prune can be overrided by putting optimize_ops in fetch_list + # use_prune can be overridden by putting optimize_ops in fetch_list _origin_fetch_list = fetch_list _origin_program = program fetch_list, optimize_ops = self._split_optimize_ops_in_fetch_list( diff --git a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py index 9bf737fb055dc..742289acd27f1 100644 --- a/python/paddle/base/incubate/checkpoint/auto_checkpoint.py +++ b/python/paddle/base/incubate/checkpoint/auto_checkpoint.py @@ -293,7 +293,7 @@ def __init__( self._save_checkpoint_inter = self._checker.save_checkpoint_inter assert ( self._save_checkpoint_inter >= 0 - ), f"checkpointer:{self._save_checkpoint_inter} must >=0" + ), f"checkpoint inter:{self._save_checkpoint_inter} must >=0" self._last_checkpoint_time = time.time() self._load_cp_nos = None diff --git a/python/paddle/nn/layer/layers.py b/python/paddle/nn/layer/layers.py index 36810ba974d24..a4f20abb97c7f 100644 --- a/python/paddle/nn/layer/layers.py +++ b/python/paddle/nn/layer/layers.py @@ -302,7 +302,7 @@ def is_instance(self, param, cls): ) -class LayerOpsRecoder: +class LayerOpsRecorder: """ Record generated operators information in nn.Layer. """ @@ -405,7 +405,7 @@ def __init__(self, name_scope=None, dtype="float32"): self._loaddict_holder = collections.OrderedDict() # Record generated op_descs in this layer - self._op_recorder = LayerOpsRecoder(ops=[], hooks=[]) + self._op_recorder = LayerOpsRecorder(ops=[], hooks=[]) self._customized_attrs = {} self._forward_pre_hooks = collections.OrderedDict() @@ -636,7 +636,7 @@ def register_forward_post_hook(self, hook): >>> # the forward_post_hook change the output of the layer: output = output * 2 >>> def forward_post_hook(layer, input, output): - ... # user can use layer, input and output for information statistis tasks + ... # user can use layer, input and output for information statistics tasks ... ... # change the output ... return output * 2 @@ -690,7 +690,7 @@ def register_forward_pre_hook(self, hook): >>> # the forward_pre_hook change the input of the layer: input = input * 2 >>> def forward_pre_hook(layer, input): - ... # user can use layer and input for information statistis tasks + ... # user can use layer and input for information statistics tasks ... ... # change the input ... input_return = (input[0] * 2) @@ -998,7 +998,7 @@ def astype(self, dtype=None): return self else: raise ValueError( - "dtype value error, must be 'bfloat16', 'float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8', 'complex64', 'complex128', 'bool', or paddle.dtype, numpy.dtype, but recieve " + "dtype value error, must be 'bfloat16', 'float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8', 'complex64', 'complex128', 'bool', or paddle.dtype, numpy.dtype, but receive " + str(dtype) ) @@ -1951,7 +1951,7 @@ def to_static_state_dict( include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True. use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True. - Retruns: + Returns: dict, a dict contains all the parameters and persistable buffers. Examples: @@ -1988,7 +1988,7 @@ def state_dict( include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. 
Default: True. use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True. - Retruns: + Returns: dict: a dict contains all the parameters and persistable buffers. Examples: @@ -2049,7 +2049,7 @@ def _check_match(key, param): if len(state) != len(param): missing_keys.append(key) raise ValueError( - f"{key} receieves the length of {len(state)}, " + f"{key} receives the length of {len(state)}, " f"but the expected shape is {len(param)}" ) else: @@ -2126,7 +2126,7 @@ def _set_var(var, ndarray): _set_var(param, state) except ValueError as e: raise ValueError( - "This error might happens in dy2static, while calling 'set_state_dict' dynamicly in 'forward', which is not supported. If you only need call 'set_state_dict' once, move it to '__init__'." + "This error might happens in dy2static, while calling 'set_state_dict' dynamically in 'forward', which is not supported. If you only need call 'set_state_dict' once, move it to '__init__'." ) return missing_keys, unexpected_keys @@ -2230,7 +2230,7 @@ def _transform(self, t, device, dtype, blocking): if t.place.is_gpu_place(): # for gpu, minimum memory allocation unit is 256 bytes. size_dtype = core.size_of_dtype(dtype) - # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space. + # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will compute ‘t’ occupied memory space. # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough. waiting_alloc_memory = ( ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 @@ -2345,7 +2345,7 @@ def transform(t, device, dtype, blocking): def _startup_program(self): """ - Return starup program containing initialization operations of all parameters. + Return startup program containing initialization operations of all parameters. NOTE(dev): This is a very low level API and only for inner developer. """ diff --git a/test/dygraph_to_static/test_legacy_error.py b/test/dygraph_to_static/test_legacy_error.py index c1225d3b83f03..faa1d34adaddd 100644 --- a/test/dygraph_to_static/test_legacy_error.py +++ b/test/dygraph_to_static/test_legacy_error.py @@ -453,7 +453,7 @@ def test_set_state_dict_err(self): error_message = str(new_exception) self.assertIn( - "This error might happens in dy2static, while calling 'set_state_dict' dynamicly in 'forward', which is not supported. If you only need call 'set_state_dict' once, move it to '__init__'.", + "This error might happens in dy2static, while calling 'set_state_dict' dynamically in 'forward', which is not supported. 
If you only need call 'set_state_dict' once, move it to '__init__'.", error_message, ) From 735fd08419c64b6afb7a82ce1c58a30a66fcd1c3 Mon Sep 17 00:00:00 2001 From: zhengzhonghui Date: Mon, 26 Feb 2024 10:29:57 +0800 Subject: [PATCH 54/82] [AutoParallel] Add Global mesh and sub mesh reshard function (#61796) * add global and sub mesh reshard function * add unittest * use broadcast kernel * rm unused header * revert broadcast * polish code --- .../auto_parallel/reshard/CMakeLists.txt | 1 + .../global_and_sub_mesh_reshard_function.cc | 137 ++++++++++++++++++ .../global_and_sub_mesh_reshard_function.h | 49 +++++++ .../reshard/nd_mesh_reshard_function.cc | 7 +- .../reshard/reshard_function_registry.cc | 3 + .../auto_parallel/reshard/reshard_utils.cc | 37 +++++ .../auto_parallel/reshard/reshard_utils.h | 3 + .../reshard/s_to_s_reshard_function.cc | 6 + paddle/phi/kernels/cpu/broadcast_kernel.cc | 1 + paddle/phi/kernels/gpu/broadcast_kernel.cu | 2 + .../hybrid_strategy/CMakeLists.txt | 7 + ...mi_auto_parallel_2d_global_mesh_reshard.py | 65 +++++++++ ...mi_auto_parallel_3d_global_mesh_reshard.py | 72 +++++++++ .../test_global_mesh_reshard.py | 73 ++++++++++ .../hybrid_strategy/testslist.csv | 1 + 15 files changed, 462 insertions(+), 2 deletions(-) create mode 100644 paddle/phi/core/distributed/auto_parallel/reshard/global_and_sub_mesh_reshard_function.cc create mode 100644 paddle/phi/core/distributed/auto_parallel/reshard/global_and_sub_mesh_reshard_function.h create mode 100644 test/auto_parallel/hybrid_strategy/semi_auto_parallel_2d_global_mesh_reshard.py create mode 100644 test/auto_parallel/hybrid_strategy/semi_auto_parallel_3d_global_mesh_reshard.py create mode 100644 test/auto_parallel/hybrid_strategy/test_global_mesh_reshard.py diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/CMakeLists.txt b/paddle/phi/core/distributed/auto_parallel/reshard/CMakeLists.txt index 133a8c01de9f4..9699c7b25eadf 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/CMakeLists.txt +++ b/paddle/phi/core/distributed/auto_parallel/reshard/CMakeLists.txt @@ -14,4 +14,5 @@ collect_srcs( r_to_x_reshard_function.cc nd_mesh_reshard_function.cc same_status_reshard_function.cc + global_and_sub_mesh_reshard_function.cc reshard_function_registry.cc) diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/global_and_sub_mesh_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/global_and_sub_mesh_reshard_function.cc new file mode 100644 index 0000000000000..8cb78b9c7719b --- /dev/null +++ b/paddle/phi/core/distributed/auto_parallel/reshard/global_and_sub_mesh_reshard_function.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
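+ +// Reshards a DistTensor between a global mesh and one of its sub meshes, +// e.g. global mesh [[1, 2], [3, 4]] with sub meshes [1, 2] and [3, 4]. +// Global -> sub: the first (pp) dim must be replicated, so ranks inside the +// target sub mesh reuse their local value while all other ranks keep an +// empty placeholder. Sub -> global: the owning sub mesh sends its local +// value to the matching ranks of every other sub mesh via point-to-point +// send/recv.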
+ +#include "paddle/phi/core/distributed/auto_parallel/reshard/global_and_sub_mesh_reshard_function.h" + +#include "glog/logging.h" +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" +#include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h" +#include "paddle/phi/core/distributed/store/store_utils.h" +#include "paddle/phi/kernels/p_recv_kernel.h" +#include "paddle/phi/kernels/p_send_kernel.h" + +namespace phi { +namespace distributed { + +bool GlobalToSubMeshReshardFunction::IsSuitable( + const DistTensor& in, const TensorDistAttr& out_dist_attr) { + const TensorDistAttr& in_dist_attr = in.dist_attr(); + // 1. first dimension(pp) must be replicated + RESHARD_SHORTCUT_IF_FALSE(in_dist_attr.is_replicated(0)); + // 2. out mesh is the value of a certain dimension of global mesh + // e.g. global_mesh = [[1, 2], [3, 4]], out_mesh = [1, 2] or [3, 4] + // global_mesh = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] + // out_mesh = [[1, 2], [3, 4]] or [[5, 6], [7, 8]] + + const ProcessMesh& in_process_mesh = in_dist_attr.process_mesh(); + const ProcessMesh& out_process_mesh = out_dist_attr.process_mesh(); + + RESHARD_SHORTCUT_IF_FALSE(in_process_mesh.ndim() == + out_process_mesh.ndim() + 1); + + return IsSubMesh(in_process_mesh, out_process_mesh); +} + +void GlobalToSubMeshReshardFunction::Eval(phi::DeviceContext* dev_ctx, + const DistTensor& in, + const TensorDistAttr& out_dist_attr, + DistTensor* out) { + VLOG(3) << "Call GlobalToSubMeshReshardFunction Eval"; + const DenseTensor& in_dense_value = in.value(); + const ProcessMesh& out_process_mesh = out_dist_attr.process_mesh(); + if (IsCurRankInMesh(out_process_mesh)) { + SetValue(out, in_dense_value); + } else { + *(out->unsafe_mutable_value()) = + phi::DenseTensor(std::make_shared( + nullptr, 0, phi::distributed::GetDefaultPlace()), + in.value().meta()); + } + SetDistProps(out, in.dims(), out_dist_attr); +} + +bool SubMeshToGlobalReshardFunction::IsSuitable( + const DistTensor& in, const TensorDistAttr& out_dist_attr) { + RESHARD_SHORTCUT_IF_FALSE(out_dist_attr.is_replicated(0)); + + const TensorDistAttr& in_dist_attr = in.dist_attr(); + const ProcessMesh& in_process_mesh = in_dist_attr.process_mesh(); + const ProcessMesh& out_process_mesh = out_dist_attr.process_mesh(); + + RESHARD_SHORTCUT_IF_FALSE(in_process_mesh.ndim() == + out_process_mesh.ndim() - 1); + + return IsSubMesh(out_process_mesh, in_process_mesh); +} + +void SubMeshToGlobalReshardFunction::Eval(phi::DeviceContext* dev_ctx, + const DistTensor& in, + const TensorDistAttr& out_dist_attr, + DistTensor* out) { + VLOG(3) << "Call SubMeshToGlobalReshardFunction Eval"; + const TensorDistAttr& in_dist_attr = in.dist_attr(); + const ProcessMesh& in_process_mesh = in_dist_attr.process_mesh(); + const ProcessMesh& out_process_mesh = out_dist_attr.process_mesh(); + + std::vector sub_process_meshes = GetSubMeshes(out_process_mesh); + const std::vector& in_process_ids = in_process_mesh.process_ids(); + const std::vector& out_process_ids = out_process_mesh.process_ids(); + std::unordered_map> send2recv_map; + std::unordered_map recv2send_map; + + for (const ProcessMesh& sub_mesh : sub_process_meshes) { + if (sub_mesh == in_process_mesh) { + continue; + } + const std::vector& sub_process_ids = sub_mesh.process_ids(); + for (size_t i = 0; i < sub_process_ids.size(); ++i) { + int64_t send_id = in_process_ids[i]; + send2recv_map[send_id].push_back(sub_process_ids[i]); + recv2send_map[sub_process_ids[i]] = 
send_id; + } + } + + std::vector all_process_ids = + GetUnionProcessIds(in_process_ids, out_process_ids); + int64_t cur_global_rank = GetCurGlobalRank(); + DataType dtype = in.dtype(); + if (IsCurRankInMesh(in_process_mesh)) { + const DenseTensor& in_dense_value = in.value(); + std::vector& recv_vec = send2recv_map[cur_global_rank]; + for (int64_t recv_id : recv_vec) { + RESHARD_FUNCTOR_WITH_COMM(dev_ctx, + PSendKernel, + dtype, + all_process_ids, + in_dense_value, + recv_id, + true /*dynamic_shape*/); + } + SetValue(out, in_dense_value); + } else { + int64_t send_id = recv2send_map[cur_global_rank]; + RESHARD_FUNCTOR_WITH_COMM(dev_ctx, + PRecv, + dtype, + all_process_ids, + send_id, + true /*dynamic_shape*/, + GetMutableTensor(out)); + } + SetDistProps(out, in.dims(), out_dist_attr); +} + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/global_and_sub_mesh_reshard_function.h b/paddle/phi/core/distributed/auto_parallel/reshard/global_and_sub_mesh_reshard_function.h new file mode 100644 index 0000000000000..e93a454520ff3 --- /dev/null +++ b/paddle/phi/core/distributed/auto_parallel/reshard/global_and_sub_mesh_reshard_function.h @@ -0,0 +1,49 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
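+ +// Declarations for the global-mesh <-> sub-mesh reshard functions; the +// implementations live in global_and_sub_mesh_reshard_function.cc.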
+ +#pragma once + +#include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_function.h" + +namespace phi { +namespace distributed { + +class GlobalToSubMeshReshardFunction final : public ReshardFunction { + public: + bool IsSuitable(const DistTensor& in, + const TensorDistAttr& out_dist_attr) override; + + void Eval(DeviceContext* dev_ctx, + const DistTensor& in, + const TensorDistAttr& out_dist_attr, + DistTensor* out) override; + + std::string Name() override { return "GlobalToSubMeshReshardFunction"; } +}; + +class SubMeshToGlobalReshardFunction final : public ReshardFunction { + public: + bool IsSuitable(const DistTensor& in, + const TensorDistAttr& out_dist_attr) override; + + void Eval(DeviceContext* dev_ctx, + const DistTensor& in, + const TensorDistAttr& out_dist_attr, + DistTensor* out) override; + + std::string Name() override { return "SubMeshToGlobalReshardFunction"; } +}; + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc index 82ddfc6354934..b7a6679590e63 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.cc @@ -273,8 +273,11 @@ void SameNdMeshReshardFunction::Eval(phi::DeviceContext* dev_ctx, bool CrossNdMeshReshardFunction::IsSuitable( const DistTensor& in, const TensorDistAttr& out_dist_attr) { - RESHARD_SHORTCUT_IF_FALSE(in.dist_attr().process_mesh() != - out_dist_attr.process_mesh()); + const ProcessMesh& in_process_mesh = in.dist_attr().process_mesh(); + const ProcessMesh& out_process_mesh = out_dist_attr.process_mesh(); + RESHARD_SHORTCUT_IF_FALSE(in_process_mesh != out_process_mesh); + RESHARD_SHORTCUT_IF_FALSE(in_process_mesh.shape() == + out_process_mesh.shape()); RESHARD_SHORTCUT_IF_FALSE(out_dist_attr.process_mesh().ndim() > 1); // check the input and output dims_mapping is not equal diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function_registry.cc b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function_registry.cc index 3e7ad115999a2..eb2e8527e87c6 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function_registry.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_function_registry.cc @@ -17,6 +17,7 @@ #include "glog/logging.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" +#include "paddle/phi/core/distributed/auto_parallel/reshard/global_and_sub_mesh_reshard_function.h" #include "paddle/phi/core/distributed/auto_parallel/reshard/nd_mesh_reshard_function.h" #include "paddle/phi/core/distributed/auto_parallel/reshard/p_to_r_reshard_function.h" #include "paddle/phi/core/distributed/auto_parallel/reshard/p_to_s_reshard_function.h" @@ -75,6 +76,8 @@ REGISTER_RESHARD_FUNC(RToXExpandReshardFunction); REGISTER_RESHARD_FUNC(SameStatusReshardFunction); REGISTER_RESHARD_FUNC(SameNdMeshReshardFunction); REGISTER_RESHARD_FUNC(CrossNdMeshReshardFunction); +REGISTER_RESHARD_FUNC(GlobalToSubMeshReshardFunction); +REGISTER_RESHARD_FUNC(SubMeshToGlobalReshardFunction); } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc index c7dd423d62e52..a2a769ef3a2d4 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc +++ 
b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc @@ -221,5 +221,42 @@ phi::DDim InferShapeForReshardFromReplicate( return out_dim; } +// 1. Get all the sub meshes of global_mesh +// e.g. global_mesh = [[1, 2], [3, 4]], out_mesh = [1, 2] and [3, 4] +// global_mesh = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] +// out_mesh = [[1, 2], [3, 4]] and [[5, 6], [7, 8]] +std::vector GetSubMeshes(const ProcessMesh& process_mesh) { + const std::vector& shape = process_mesh.shape(); + const std::vector& process_ids = process_mesh.process_ids(); + const std::vector& dim_names = process_mesh.dim_names(); + int64_t total_process_num = process_ids.size(); + int64_t sub_process_num = total_process_num / shape[0]; + std::vector sub_process_mesh_shape(shape.begin() + 1, shape.end()); + std::vector sub_process_mesh_dim_names(dim_names.begin() + 1, + dim_names.end()); + + std::vector sub_process_meshes; + for (int i = 0; i < shape[0]; ++i) { + int64_t start_position = i * sub_process_num; + int64_t end_position = start_position + sub_process_num; + std::vector sub_process_ids(process_ids.begin() + start_position, + process_ids.begin() + end_position); + + sub_process_meshes.emplace_back(ProcessMesh( + sub_process_mesh_shape, sub_process_ids, sub_process_mesh_dim_names)); + } + return sub_process_meshes; +} + +bool IsSubMesh(const ProcessMesh& global_mesh, const ProcessMesh& sub_mesh) { + std::vector sub_process_meshes = GetSubMeshes(global_mesh); + for (const ProcessMesh& mesh : sub_process_meshes) { + if (mesh == sub_mesh) { + return true; + } + } + return false; +} + } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h index 5a63bef285825..8828222c4ceda 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h @@ -168,5 +168,8 @@ phi::DDim InferShapeForReshardFromReplicate( } \ } while (0) +std::vector GetSubMeshes(const ProcessMesh& process_mesh); +bool IsSubMesh(const ProcessMesh& global_mesh, const ProcessMesh& sub_mesh); + } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/s_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/reshard/s_to_s_reshard_function.cc index 4ff30a13cd033..57b5e8209fce6 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/s_to_s_reshard_function.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/s_to_s_reshard_function.cc @@ -32,6 +32,9 @@ bool SToSReshardFunction::IsSuitable(const DistTensor& in, const TensorDistAttr& out_dist_attr) { const auto& in_dist_attr = in.dist_attr(); + RESHARD_SHORTCUT_IF_FALSE(in_dist_attr.dims_mapping() != + out_dist_attr.dims_mapping()); + RESHARD_SHORTCUT_IF_FALSE(in_dist_attr.is_shard()); RESHARD_SHORTCUT_IF_FALSE(out_dist_attr.is_shard()); @@ -141,6 +144,9 @@ bool SToSReshardFunctionCrossMesh::IsSuitable( const DistTensor& in, const TensorDistAttr& out_dist_attr) { const auto& in_dist_attr = in.dist_attr(); + RESHARD_SHORTCUT_IF_FALSE(in_dist_attr.dims_mapping() != + out_dist_attr.dims_mapping()); + RESHARD_SHORTCUT_IF_FALSE(in_dist_attr.is_shard()); RESHARD_SHORTCUT_IF_FALSE(out_dist_attr.is_shard()); diff --git a/paddle/phi/kernels/cpu/broadcast_kernel.cc b/paddle/phi/kernels/cpu/broadcast_kernel.cc index 0deb8d8bbc562..02b984112d83c 100644 --- a/paddle/phi/kernels/cpu/broadcast_kernel.cc +++ 
b/paddle/phi/kernels/cpu/broadcast_kernel.cc @@ -61,6 +61,7 @@ PD_REGISTER_KERNEL(broadcast, bool, int8_t, uint8_t, + int16_t, int64_t, phi::dtype::float16, phi::dtype::complex, diff --git a/paddle/phi/kernels/gpu/broadcast_kernel.cu b/paddle/phi/kernels/gpu/broadcast_kernel.cu index e4986f752b1ae..9af8bd4d6d510 100644 --- a/paddle/phi/kernels/gpu/broadcast_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_kernel.cu @@ -65,6 +65,7 @@ PD_REGISTER_KERNEL(broadcast, bool, int8_t, uint8_t, + int16_t, int64_t, phi::dtype::float16, phi::dtype::complex, @@ -80,6 +81,7 @@ PD_REGISTER_KERNEL(broadcast, bool, int8_t, uint8_t, + int16_t, int64_t, phi::dtype::float16, phi::dtype::complex, diff --git a/test/auto_parallel/hybrid_strategy/CMakeLists.txt b/test/auto_parallel/hybrid_strategy/CMakeLists.txt index 2d205031a433e..9d19c4e08b64d 100644 --- a/test/auto_parallel/hybrid_strategy/CMakeLists.txt +++ b/test/auto_parallel/hybrid_strategy/CMakeLists.txt @@ -58,3 +58,10 @@ if((WITH_GPU) AND (LINUX)) set_tests_properties(test_semi_auto_parallel_hybrid_sharding_strategy PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=HYBRID") endif() +if((WITH_GPU) AND (LINUX)) + py_test_modules( + test_global_mesh_reshard MODULES test_global_mesh_reshard ENVS + "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_global_mesh_reshard PROPERTIES TIMEOUT "120" LABELS + "RUN_TYPE=HYBRID") +endif() diff --git a/test/auto_parallel/hybrid_strategy/semi_auto_parallel_2d_global_mesh_reshard.py b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_2d_global_mesh_reshard.py new file mode 100644 index 0000000000000..93c26a767ccd5 --- /dev/null +++ b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_2d_global_mesh_reshard.py @@ -0,0 +1,65 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
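+ +# Hand-checked walkthrough: two "pipeline stages" are carved out of a 2x2 +# global mesh ([pp, dp]). input is 1.0, global_input is 2.0, stage pp0 yields +# 3.0, and stage pp1 adds the resharded global_input to reach 5.0, so the +# loss over the 2x3 output is 5.0 * 6 = 30.0 and every element of input gets +# a gradient of 2.0 (one path through pp0 plus one through the reshard to +# pp1).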
+ +import os + +import numpy as np + +import paddle +import paddle.distributed as dist + + +class TestSemiAutoParallel2DGlobalMeshReshard: + def __init__(self): + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._global_mesh = dist.ProcessMesh( + [[0, 1], [2, 3]], dim_names=["pp", "dp"] + ) + self._mesh0 = dist.ProcessMesh([0, 1], dim_names=["dp"]) + self._mesh1 = dist.ProcessMesh([2, 3], dim_names=["dp"]) + paddle.set_device(self._backend) + + def test_basic(self): + input = paddle.ones(shape=[2, 3], dtype='float32') + input = dist.shard_tensor( + input, self._global_mesh, [dist.Replicate(), dist.Shard(0)] + ) + input.stop_gradient = False + global_input = input + 1.0 # global_input: 2.0 + + # forward on pp0 + input_pp0 = dist.reshard(global_input, self._mesh0, [dist.Shard(0)]) + output = input_pp0 + 1.0 # output_pp0: 3.0 + + # forward on pp1 + output = dist.reshard(output, self._mesh1, [dist.Shard(0)]) + input_pp1 = dist.reshard(global_input, self._mesh1, [dist.Shard(0)]) + output = input_pp1 + output # output_pp1: 5.0 + loss = paddle.sum(output) # 30.0 + np.testing.assert_allclose(loss.numpy(), 30.0, rtol=1e-06, verbose=True) + loss.backward() + np.testing.assert_allclose( + input.grad.numpy(), + np.full(shape=(2, 3), fill_value=2.0, dtype=np.float32), + rtol=1e-06, + verbose=True, + ) + + def run_test_case(self): + self.test_basic() + + +if __name__ == '__main__': + TestSemiAutoParallel2DGlobalMeshReshard().run_test_case() diff --git a/test/auto_parallel/hybrid_strategy/semi_auto_parallel_3d_global_mesh_reshard.py b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_3d_global_mesh_reshard.py new file mode 100644 index 0000000000000..bdc256a8a6493 --- /dev/null +++ b/test/auto_parallel/hybrid_strategy/semi_auto_parallel_3d_global_mesh_reshard.py @@ -0,0 +1,72 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
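+ +# Same stage pattern on a 2x2x2 global mesh ([pp, dp, mp]): the replicated +# 6x8 input of 1.0 becomes 2.0 on stage pp0 and 3.0 after stage pp1 adds the +# resharded input again, so loss = 3.0 * 48 = 144.0 and the gradient w.r.t. +# global_input is 2.0 everywhere.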
+ +import os + +import numpy as np + +import paddle +import paddle.distributed as dist + + +class TestSemiAutoParallel3DGlobalMeshReshard: + def __init__(self): + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._global_mesh = dist.ProcessMesh( + [[[0, 1], [2, 3]], [[4, 5], [6, 7]]], dim_names=['pp', 'dp', 'mp'] + ) + self._mesh0 = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=['dp', 'mp']) + self._mesh1 = dist.ProcessMesh([[4, 5], [6, 7]], dim_names=['dp', 'mp']) + paddle.set_device(self._backend) + + def test_basic(self): + global_input = dist.shard_tensor( + paddle.ones(shape=[6, 8], dtype='float32'), + self._global_mesh, + [dist.Replicate(), dist.Replicate(), dist.Replicate()], + ) # 1.0 + global_input.stop_gradient = False + # forward on mesh0 + input_mesh0 = dist.reshard( + global_input, self._mesh0, [dist.Replicate(), dist.Replicate()] + ) + output = input_mesh0 + 1.0 # 2.0 + + # forward on mesh1 + output = dist.reshard( + output, self._mesh1, [dist.Replicate(), dist.Replicate()] + ) + input_mesh1 = dist.reshard( + global_input, self._mesh1, [dist.Replicate(), dist.Replicate()] + ) + output = output + input_mesh1 # 3.0 + loss = paddle.sum(output) # 144.0 + np.testing.assert_allclose( + loss.numpy(), 144.0, rtol=1e-06, verbose=True + ) + loss.backward() + np.testing.assert_allclose( + global_input.grad.numpy(), + np.full(shape=(6, 8), fill_value=2.0, dtype=np.float32), + rtol=1e-06, + verbose=True, + ) + + def run_test_case(self): + self.test_basic() + + +if __name__ == '__main__': + TestSemiAutoParallel3DGlobalMeshReshard().run_test_case() diff --git a/test/auto_parallel/hybrid_strategy/test_global_mesh_reshard.py b/test/auto_parallel/hybrid_strategy/test_global_mesh_reshard.py new file mode 100644 index 0000000000000..48bea0b88efcd --- /dev/null +++ b/test/auto_parallel/hybrid_strategy/test_global_mesh_reshard.py @@ -0,0 +1,73 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
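+ +# Launcher for the two cases above: the 2D reshard test runs on 4 GPUs and +# the 3D test on 8 GPUs, both driven through CommunicationTestDistBase.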
+ +import unittest + +import collective.test_communication_api_base as test_base + + +class TestSemiAutoParallel2DGlobalMeshReshard( + test_base.CommunicationTestDistBase +): + def setUp(self): + super().setUp( + num_of_devices=4, + timeout=120, + nnode=1, + ) + self._default_envs = { + "dtype": "float32", + "seed": "2023", + } + self._changeable_envs = {"backend": ["gpu"]} + + def test_2d_global_mesh_reshard(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_2d_global_mesh_reshard.py", + user_defined_envs=envs, + ) + + +class TestSemiAutoParallel3DGlobalMeshReshard( + test_base.CommunicationTestDistBase +): + def setUp(self): + super().setUp( + num_of_devices=8, + timeout=120, + nnode=1, + ) + self._default_envs = { + "dtype": "float32", + "seed": "2023", + } + self._changeable_envs = {"backend": ["gpu"]} + + def test_3d_global_mesh_reshard(self): + envs_list = test_base.gen_product_envs_list( + self._default_envs, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_3d_global_mesh_reshard.py", + user_defined_envs=envs, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/auto_parallel/hybrid_strategy/testslist.csv b/test/auto_parallel/hybrid_strategy/testslist.csv index 7b64e2d93ea6b..9c1105615890b 100644 --- a/test/auto_parallel/hybrid_strategy/testslist.csv +++ b/test/auto_parallel/hybrid_strategy/testslist.csv @@ -6,3 +6,4 @@ test_semi_auto_parallel_c_cross_entropy,LINUX,GPU,120,HYBRID,test_runner.py,,,ht test_cross_mesh_reshard,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_semi_auto_parallel_llama_model_amp,LINUX,GPU,180,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_semi_auto_parallel_hybrid_sharding_strategy,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_global_mesh_reshard,LINUX,GPU,120,HYBRID,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., From 6e8cd366529491305b153df21d235faadd81aec0 Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 26 Feb 2024 10:33:15 +0800 Subject: [PATCH 55/82] Fix some typos(infomation, unnecesary, etc) (#61956) * Fix * Fix --- test/cinn/op_mappers/op_mapper_test.py | 8 ++++---- test/cinn/op_mappers/test_norm_op.py | 2 +- test/cinn/test_paddle_model_convertor.py | 4 ++-- .../framework/no_need_buffer_vars_inference_test.cc | 2 +- .../paddle2cinn/cinn_graph_symbolization_test.cc | 4 ++-- test/cpp/fluid/framework/program_utils_test.cc | 4 ++-- .../pattern_rewrite/drr_same_type_binding_test.cc | 2 +- test/ir/inference/test_trt_convert_conv2d.py | 12 +++++------- .../inference/test_trt_convert_conv2d_transpose.py | 3 +-- .../inference/test_trt_convert_conv3d_transpose.py | 2 +- .../inference/test_trt_convert_depthwise_conv2d.py | 2 +- .../test_trt_convert_depthwise_conv2d_transpose.py | 3 +-- .../test_trt_convert_elementwiseadd_transpose.py | 2 +- test/ir/inference/test_trt_convert_expand_as_v2.py | 2 +- test/ir/inference/test_trt_convert_expand_v2.py | 4 ++-- test/ir/inference/test_trt_ops_fp32_mix_precision.py | 2 +- .../ir/inference/test_xpu_convert_mixed_precision.py | 2 +- .../cinn/symbolic/test_cinn_broadcast_symbolic.py | 2 +- .../cinn/symbolic/test_cinn_reduce_symbolic_demo.py | 2 +- test/ir/pir/test_ir_pybind.py | 2 +- 20 files changed, 31 insertions(+), 35 deletions(-) diff --git a/test/cinn/op_mappers/op_mapper_test.py b/test/cinn/op_mappers/op_mapper_test.py 
index d77a1b4dc7bf0..f3a5ef5d1847b 100644
--- a/test/cinn/op_mappers/op_mapper_test.py
+++ b/test/cinn/op_mappers/op_mapper_test.py
@@ -127,7 +127,7 @@ def __set_paddle_op(self):
         self.skip_outputs = self.skip_check_outputs()
         # dict of inplace var
         self.inplace_outputs = self.set_inplace_outputs()
-        # collect some important infomation
+        # collect some important information
         self.input_arg_map = self.__get_arguments_map(self.inputs)
         self.fetch_targets = []
         self.skip_check_list = []
@@ -169,7 +169,7 @@ def __check_valid(self):
                 msg=f"The shape of input {var.name} in feed_data is error",
             )
             self.assertEqual(
-                self.paddleddtype2nptype(var.dtype),
+                self.paddledtype2nptype(var.dtype),
                 str(self.feed_data[name].dtype),
                 msg=f"The dtype of input {var.name} in feed_data is error",
             )
@@ -312,7 +312,7 @@ def build_cinn_program(self, target):
 
         for var_name, var in self.input_arg_map.items():
             convertor.create_input(
-                dtype=self.paddleddtype2nptype(var.dtype),
+                dtype=self.paddledtype2nptype(var.dtype),
                 shape=var.shape,
                 name=var_name,
             )
@@ -411,7 +411,7 @@ def get_program_vars(program) -> dict:
         return vars
 
     @staticmethod
-    def paddleddtype2nptype(dtype):
+    def paddledtype2nptype(dtype):
         switch_map = {
             paddle.float16: "float16",
             paddle.float32: "float32",
diff --git a/test/cinn/op_mappers/test_norm_op.py b/test/cinn/op_mappers/test_norm_op.py
index a163925f2bc63..aaed4637efe6a 100644
--- a/test/cinn/op_mappers/test_norm_op.py
+++ b/test/cinn/op_mappers/test_norm_op.py
@@ -59,7 +59,7 @@ def set_op_attrs(self):
         return {"axis": -1, "epsilon": 1e-10, "is_test": True}
 
     def skip_check_outputs(self):
-        # in test mode, 'Norm' is unnecesary
+        # in test mode, 'Norm' is unnecessary
         return {"Norm"}
 
 
diff --git a/test/cinn/test_paddle_model_convertor.py b/test/cinn/test_paddle_model_convertor.py
index dc878127013f5..0b2f3b15b36b6 100644
--- a/test/cinn/test_paddle_model_convertor.py
+++ b/test/cinn/test_paddle_model_convertor.py
@@ -132,7 +132,7 @@ def init_case(self):
                 msg="Repeat feed name: " + self.feed_names[i],
             )
 
-            dtype = self.paddleddtype2nptype(self.feed_dtypes[i])
+            dtype = self.paddledtype2nptype(self.feed_dtypes[i])
             # random int type data should not limited to [0, 1]
             high = 1 if ("int" not in dtype) else self.feed_shapes[i][0]
 
@@ -204,7 +204,7 @@ def build_cinn_program(self, target):
         convertor = PaddleModelConvertor(target)
         for i in range(len(self.feed_names)):
             convertor.create_input(
-                dtype=self.paddleddtype2nptype(self.feed_dtypes[i]),
+                dtype=self.paddledtype2nptype(self.feed_dtypes[i]),
                 shape=self.feed_data[self.feed_names[i]].shape,
                 name=self.feed_names[i],
             )
diff --git a/test/cpp/fluid/framework/no_need_buffer_vars_inference_test.cc b/test/cpp/fluid/framework/no_need_buffer_vars_inference_test.cc
index d31a9680c16ea..5d200324e435e 100644
--- a/test/cpp/fluid/framework/no_need_buffer_vars_inference_test.cc
+++ b/test/cpp/fluid/framework/no_need_buffer_vars_inference_test.cc
@@ -51,7 +51,7 @@ TEST(test_no_need_buffer_vars_inference, test_dygraph) {
 
 DECLARE_NO_NEED_BUFFER_VARS_INFERER(TestNoNeedBufferVarsInferer, "X1", "X2");
 
-TEST(test_no_need_buffer_vars_inference, test_nullptr_comparation) {
+TEST(test_no_need_buffer_vars_inference, test_nullptr_comparison) {
   InferNoNeedBufferVarsFN infer_fn;
   ASSERT_FALSE(static_cast<bool>(infer_fn));
   ASSERT_TRUE(!infer_fn);
diff --git a/test/cpp/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/test/cpp/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc
index b936c0dfd5975..335f8a53da895 100644
--- a/test/cpp/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc
+++ b/test/cpp/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc
@@ -265,7 +265,7 @@ TEST_F(CinnGraphSymbolizationTest, scope) {
   ASSERT_EQ(cinn_tensor->type(), ::cinn::common::F32());
 }
 
-TEST_F(CinnGraphSymbolizationTest, sortgraph) {
+TEST_F(CinnGraphSymbolizationTest, sort_graph) {
   auto cinn_op_descs = test_->TransformAllGraphOpToCinn();
   ASSERT_FALSE(cinn_op_descs.empty());
   std::vector<std::string> sort_names;
@@ -276,7 +276,7 @@ TEST_F(CinnGraphSymbolizationTest, sortgraph) {
             std::vector<std::string>({"feed", "feed", "mul", "add", "relu"}));
 }
 
-TEST_F(CinnGraphSymbolizationTest, runop) {
+TEST_F(CinnGraphSymbolizationTest, run_op) {
   auto cinn_op_descs = test_->TransformAllGraphOpToCinn();
 
   auto feed_map = test_->GetFeedInfoMapFromInput();
diff --git a/test/cpp/fluid/framework/program_utils_test.cc b/test/cpp/fluid/framework/program_utils_test.cc
index 051aa89e4b5f3..624c5697e537b 100644
--- a/test/cpp/fluid/framework/program_utils_test.cc
+++ b/test/cpp/fluid/framework/program_utils_test.cc
@@ -203,8 +203,8 @@ TEST(ProgramDesc, GetInputsOutputsInBlock) {
   ASSERT_EQ(5UL, inner_inputs.size());
   ASSERT_EQ(2UL, inner_outputs.size());
 
-  // varible "Less_than_2_Out" is the input of cond_op, it also is the output of
-  // less_than_op.
+  // variable "Less_than_2_Out" is the input of cond_op, it also is the output
+  // of less_than_op.
   std::set<std::string> inner_inputs_{"Less_than_2_Out",
                                       "Less_than_2_X",
                                       "Less_than_2_Y",
diff --git a/test/cpp/pir/pattern_rewrite/drr_same_type_binding_test.cc b/test/cpp/pir/pattern_rewrite/drr_same_type_binding_test.cc
index d672cc4ae9c13..bf8f847b2a877 100644
--- a/test/cpp/pir/pattern_rewrite/drr_same_type_binding_test.cc
+++ b/test/cpp/pir/pattern_rewrite/drr_same_type_binding_test.cc
@@ -30,7 +30,7 @@
        /      |      \       \        \
       /       |       \       \        \
   full        /        |       |        \       \    full_tmp
-    /    |  transpose1  |   trans2   trans3   \    /    |
+    /    |  transpose1  |   trans2   trans3   \   /     |
    /     |  /       |   |      |        |      \ /      |
 softmax1 |  /       |   |      |        |       \       /  |  \
    \     | /    softmax2 |     |        |       add1       |
diff --git a/test/ir/inference/test_trt_convert_conv2d.py b/test/ir/inference/test_trt_convert_conv2d.py
index 84b44adc574ef..3fa99a078ddd7 100644
--- a/test/ir/inference/test_trt_convert_conv2d.py
+++ b/test/ir/inference/test_trt_convert_conv2d.py
@@ -65,7 +65,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
         strides_options = [[2, 2], [1, 2]]
         paddings_options = [[0, 3], [1, 2, 3, 4]]
         groups_options = [1, 3]
-        padding_altorithm_options = ['EXPLICIT', 'SAME', 'VALID']
+        padding_algorithm_options = ['EXPLICIT', 'SAME', 'VALID']
         dilations_options = [[1, 2]]
         data_format_options = ['NCHW']
 
@@ -74,7 +74,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
             strides_options,
             paddings_options,
             groups_options,
-            padding_altorithm_options,
+            padding_algorithm_options,
             dilations_options,
             data_format_options,
         ]
@@ -90,7 +90,6 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
         ) in itertools.product(*configurations):
             attrs = [
                 {
-                    "data_fromat": data_format,
                     "dilations": dilations,
                     "padding_algorithm": padding_algorithm,
                     "groups": groups,
@@ -252,7 +251,7 @@ def generate_data(attrs: List[Dict[str, Any]]):
         strides_options = [[2, 2]]
         paddings_options = [[1, 1]]
         groups_options = [1]
-        padding_altorithm_options = ['EXPLICIT']
+        padding_algorithm_options = ['EXPLICIT']
         dilations_options = [[1, 1]]
         data_format_options = ['NCHW']
 
@@ -263,7 +262,7 @@ def generate_data(attrs: List[Dict[str, Any]]):
             strides_options,
             paddings_options,
             groups_options,
-            padding_altorithm_options,
+            padding_algorithm_options,
             dilations_options,
             data_format_options,
         ]
@@ -282,14 +281,13 @@ def generate_data(attrs: List[Dict[str, Any]]):
             ic = input_shape[1]
             attrs = [
                 {
-                    "data_fromat": data_format,
                     "dilations": dilations,
                     "padding_algorithm": padding_algorithm,
                     "groups": groups,
                     "paddings": paddings,
                     "strides": strides,
                     "data_format": data_format,
-                    # below attrs are used for my convience.
+                    # below attrs are used for my convenience.
                     "input_shape": input_shape,
                     "weight_shape": [
                         oc,
diff --git a/test/ir/inference/test_trt_convert_conv2d_transpose.py b/test/ir/inference/test_trt_convert_conv2d_transpose.py
index 57c973b466732..c493377ebf976 100644
--- a/test/ir/inference/test_trt_convert_conv2d_transpose.py
+++ b/test/ir/inference/test_trt_convert_conv2d_transpose.py
@@ -80,7 +80,6 @@ def generate_weight1(num_channels, attrs: List[Dict[str, Any]]):
                     self.num_channels = num_channels
                     dics = [
                         {
-                            "data_fromat": data_format,
                             "dilations": dilations,
                             "padding_algorithm": padding_algorithm,
                             "groups": groups,
@@ -272,7 +271,7 @@ def generate_weight1(num_channels, attrs: List[Dict[str, Any]]):
         self.num_channels = num_channels
         dics = [
             {
-                "data_fromat": 'NCHW',
+                "data_format": 'NCHW',
                 "dilations": [1, 1],
                 "padding_algorithm": 'EXPLICIT',
                 "groups": 1,
diff --git a/test/ir/inference/test_trt_convert_conv3d_transpose.py b/test/ir/inference/test_trt_convert_conv3d_transpose.py
index ba545275d805b..b2d15d3643f57 100644
--- a/test/ir/inference/test_trt_convert_conv3d_transpose.py
+++ b/test/ir/inference/test_trt_convert_conv3d_transpose.py
@@ -48,7 +48,7 @@ def generate_weight1(num_channels, attrs: List[Dict[str, Any]]):
         self.num_channels = num_channels
         dics = [
             {
-                "data_fromat": 'NCHW',
+                "data_format": 'NCHW',
                 "dilations": [1, 1, 1],
                 "padding_algorithm": 'EXPLICIT',
                 "groups": 1,
diff --git a/test/ir/inference/test_trt_convert_depthwise_conv2d.py b/test/ir/inference/test_trt_convert_depthwise_conv2d.py
index 5a38347d8e646..5fd25562f33a9 100644
--- a/test/ir/inference/test_trt_convert_depthwise_conv2d.py
+++ b/test/ir/inference/test_trt_convert_depthwise_conv2d.py
@@ -84,7 +84,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]):
                         "groups": groups,
                         "padding_algorithm": padding_algorithm,
                         "dilations": dilations,
-                        "data_fromat": data_format,
+                        "data_format": data_format,
                     }
                 ]
 
diff --git a/test/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/test/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
index e570fe77f1c2e..4a0b0faf4df65 100644
--- a/test/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
+++ b/test/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py
@@ -87,13 +87,12 @@ def generate_weight1(attrs: list[dict[str, Any]]):
         ):
             dics = [
                 {
-                    "data_fromat": data_format,
+                    "data_format": data_format,
                     "dilations": dilations,
                     "padding_algorithm": padding_algorithm,
                     "groups": groups,
                     "paddings": paddings,
                     "strides": strides,
-                    "data_format": data_format,
                     "output_size": [],
                     "output_padding": [],
                 }
diff --git a/test/ir/inference/test_trt_convert_elementwiseadd_transpose.py b/test/ir/inference/test_trt_convert_elementwiseadd_transpose.py
index dd9c495d49772..012aaa00241b4 100644
--- a/test/ir/inference/test_trt_convert_elementwiseadd_transpose.py
+++ b/test/ir/inference/test_trt_convert_elementwiseadd_transpose.py
@@ -22,7 +22,7 @@
 import paddle.inference as paddle_infer
 
 
-class TrtConvertElementwiseaddTransposeTest(TrtLayerAutoScanTest):
+class TrtConvertElementwiseAddTransposeTest(TrtLayerAutoScanTest):
     def is_program_valid(self, program_config: ProgramConfig) -> bool:
         return True
 
diff --git a/test/ir/inference/test_trt_convert_expand_as_v2.py b/test/ir/inference/test_trt_convert_expand_as_v2.py
index 50bb2ac34ca27..16c689573eeaa 100644
--- a/test/ir/inference/test_trt_convert_expand_as_v2.py
+++ b/test/ir/inference/test_trt_convert_expand_as_v2.py
@@ -247,7 +247,7 @@ def clear_dynamic_shape():
         generate_dynamic_shape()
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
         program_config.set_input_type(np.float32)
-        # fill_constant will be folded by constnt folding pass!
+        # fill_constant will be folded by constant folding pass!
         yield self.create_inference_config(), (1, 2), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         program_config.set_input_type(np.float16)
diff --git a/test/ir/inference/test_trt_convert_expand_v2.py b/test/ir/inference/test_trt_convert_expand_v2.py
index 0961dac427699..03d8dd32df6b5 100644
--- a/test/ir/inference/test_trt_convert_expand_v2.py
+++ b/test/ir/inference/test_trt_convert_expand_v2.py
@@ -253,7 +253,7 @@ def clear_dynamic_shape():
         generate_dynamic_shape()
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
         program_config.set_input_type(np.float32)
-        # fill_constant will be folded by constnt folding pass!
+        # fill_constant will be folded by constant folding pass!
         yield self.create_inference_config(), (1, 2), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         program_config.set_input_type(np.float16)
@@ -399,7 +399,7 @@ def clear_dynamic_shape():
         generate_dynamic_shape()
         self.trt_param.precision = paddle_infer.PrecisionType.Float32
         program_config.set_input_type(np.float32)
-        # fill_constant will be folded by constnt folding pass!
+        # fill_constant will be folded by constant folding pass!
         yield self.create_inference_config(), (1, 2), 1e-5
         self.trt_param.precision = paddle_infer.PrecisionType.Half
         program_config.set_input_type(np.float16)
diff --git a/test/ir/inference/test_trt_ops_fp32_mix_precision.py b/test/ir/inference/test_trt_ops_fp32_mix_precision.py
index 3347cfb6d06db..6a7a6051dea61 100644
--- a/test/ir/inference/test_trt_ops_fp32_mix_precision.py
+++ b/test/ir/inference/test_trt_ops_fp32_mix_precision.py
@@ -56,7 +56,7 @@ def generate_elementwise_weight(op_type):
 
         attrs = [
             {
-                "data_fromat": 'NCHW',
+                "data_format": 'NCHW',
                 "dilations": [1, 2],
                 "padding_algorithm": 'EXPLICIT',
                 "groups": 1,
diff --git a/test/ir/inference/test_xpu_convert_mixed_precision.py b/test/ir/inference/test_xpu_convert_mixed_precision.py
index f09d00440ac64..cce33ca3bc9dc 100644
--- a/test/ir/inference/test_xpu_convert_mixed_precision.py
+++ b/test/ir/inference/test_xpu_convert_mixed_precision.py
@@ -27,7 +27,7 @@
 from paddle.vision.models import resnet50
 
 
-class ConvertMixedPrecison(unittest.TestCase):
+class ConvertMixedPrecision(unittest.TestCase):
     def test(self):
         self.temp_dir = tempfile.TemporaryDirectory()
         model = resnet50(True)
diff --git a/test/ir/pir/cinn/symbolic/test_cinn_broadcast_symbolic.py b/test/ir/pir/cinn/symbolic/test_cinn_broadcast_symbolic.py
index 63009a9704d7c..96f8fbfebd24b 100644
--- a/test/ir/pir/cinn/symbolic/test_cinn_broadcast_symbolic.py
+++ b/test/ir/pir/cinn/symbolic/test_cinn_broadcast_symbolic.py
@@ -74,7 +74,7 @@ def eval_symbolic(self, use_cinn):
         self.check_jit_kernel_info(net.forward)
         return out
 
-    def test_eval_symolic(self):
+    def test_eval_symbolic(self):
        cinn_out = self.eval_symbolic(use_cinn=True)
        dy_out = self.eval_symbolic(use_cinn=False)
        np.testing.assert_allclose(cinn_out.numpy(), dy_out.numpy(), atol=1e-8)
diff --git a/test/ir/pir/cinn/symbolic/test_cinn_reduce_symbolic_demo.py b/test/ir/pir/cinn/symbolic/test_cinn_reduce_symbolic_demo.py
index bb2e1c789e22f..dede8a2083efc 100644
--- a/test/ir/pir/cinn/symbolic/test_cinn_reduce_symbolic_demo.py
+++ b/test/ir/pir/cinn/symbolic/test_cinn_reduce_symbolic_demo.py
@@ -70,7 +70,7 @@ def eval_symbolic(self, use_cinn):
 
         return out
 
-    def test_eval_symolic(self):
+    def test_eval_symbolic(self):
         cinn_out = self.eval_symbolic(use_cinn=True)
         # dy_out = self.eval_symbolic(use_cinn=False)
         # np.testing.assert_allclose(cinn_out.numpy(), dy_out.numpy(), atol=1e-8)
diff --git a/test/ir/pir/test_ir_pybind.py b/test/ir/pir/test_ir_pybind.py
index 22e03fdc1bf95..460e5e489eb35 100644
--- a/test/ir/pir/test_ir_pybind.py
+++ b/test/ir/pir/test_ir_pybind.py
@@ -71,7 +71,7 @@ def test_operation(self):
         self.assertEqual(len(matmul_op.get_input_names()), 2)
         self.assertEqual(len(matmul_op.get_attr_names()), 2)
         self.assertEqual(len(matmul_op.get_output_names()), 1)
-        # test oprand.index
+        # test operand.index
         self.assertEqual(matmul_op.operand(0).index(), 0)
         self.assertEqual(matmul_op.operand(1).index(), 1)
         self.assertEqual(add_op.operand(0).index(), 0)

From e4518a8b00d3a7f1505bd7dd7b10c337e4d1383b Mon Sep 17 00:00:00 2001
From: co63oc
Date: Mon, 26 Feb 2024 10:35:37 +0800
Subject: [PATCH 56/82] Update skip_files.py (#62004)

---
 python/paddle/jit/sot/opcode_translator/skip_files.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/paddle/jit/sot/opcode_translator/skip_files.py b/python/paddle/jit/sot/opcode_translator/skip_files.py
index ca7f3552ad6ac..f3517d1419c38 100644
--- a/python/paddle/jit/sot/opcode_translator/skip_files.py
+++ b/python/paddle/jit/sot/opcode_translator/skip_files.py
@@ -54,7 +54,7 @@
 
 import paddle
 
-NEED_SKIP_THIRD_PARTIY_MODULES = {
+NEED_SKIP_THIRD_PARTY_MODULES = {
     abc,
     collections,
     contextlib,
@@ -98,13 +98,13 @@
     import sre_compile
     import sre_parse
 
-    NEED_SKIP_THIRD_PARTIY_MODULES.add(sre_compile)
-    NEED_SKIP_THIRD_PARTIY_MODULES.add(sre_parse)
+    NEED_SKIP_THIRD_PARTY_MODULES.add(sre_compile)
+    NEED_SKIP_THIRD_PARTY_MODULES.add(sre_parse)
 
 if sys.version_info < (3, 12):
     import distutils
 
-    NEED_SKIP_THIRD_PARTIY_MODULES.add(distutils)
+    NEED_SKIP_THIRD_PARTY_MODULES.add(distutils)
 
 
 def _strip_init_py(s):
@@ -115,7 +115,7 @@ def _module_dir(m: types.ModuleType):
     return _strip_init_py(m.__file__)
 
 
-skip_file_names = {_module_dir(m) for m in NEED_SKIP_THIRD_PARTIY_MODULES}
+skip_file_names = {_module_dir(m) for m in NEED_SKIP_THIRD_PARTY_MODULES}
 
 sot_path = os.path.dirname(__file__).rpartition(os.sep)[0] + os.sep
 

From 5508f55d6e0861d2ee4ff87f847c5e6e3b0f8e39 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Mon, 26 Feb 2024 10:36:04 +0800
Subject: [PATCH 57/82] Fix typos(attetion, etc) (#62044)

---
 .../ir/mkldnn/cpu_quantize_squash_pass_tester.cc           | 2 +-
 .../fluid/framework/ir/mkldnn/self_attention_fuse_pass.h  | 2 +-
 .../ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc        | 2 +-
 .../ir/multi_devices_graph_pass/all_reduce_deps_pass.cc   | 4 ++--
 .../backward_optimizer_op_deps_pass.cc                     | 2 +-
 .../multi_devices_graph_check_pass.cc                      | 6 +++---
 .../multi_devices_graph_print_pass.cc                      | 4 ++--
 .../fluid/framework/ir/xpu/add_layernorm_xpu_fuse_pass.cc | 2 +-
 .../ir/xpu/fused_multi_transformer_int8_xpu_quant_pass.cc  | 2 +-
 .../ir/xpu/multi_encoder_xpu_adaptive_seqlen_fuse_pass.h   | 8 ++++----
 10 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
index 5b1cd5fe87aed..90ed3009749ad 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
@@ -1018,7 +1018,7 @@ TEST(CpuQuantizeSquashPass, fc_dequant_more_than_one_op_after_dequant) {
 
 // a->Concat1->b
 // b->Concat2
-// b->Quatize1(Scale)->c
+// b->Quantize1(Scale)->c
 // c->Fc1
 // c->Fc2
 TEST(CpuQuantizeSquashPass, quatize_with_same_scale) {
diff --git a/paddle/fluid/framework/ir/mkldnn/self_attention_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/self_attention_fuse_pass.h
index ade48f398e3b6..a264795bd78fb 100644
--- a/paddle/fluid/framework/ir/mkldnn/self_attention_fuse_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/self_attention_fuse_pass.h
@@ -24,7 +24,7 @@
 namespace paddle {
 namespace framework {
 namespace ir {
 
-// Fusing of self-attetion structure
+// Fusing of self-attention structure
 
 class Graph;
diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc
index 5cf9a973061f0..764712a2fcd8a 100644
--- a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc
@@ -174,7 +174,7 @@ void ShuffleChannelMKLDNNDetectPass::ApplyImpl(ir::Graph* graph) const {
     }
   }
 
-  // shuffle_channel dosen't change shape
+  // shuffle_channel doesn't change shape
   if ((reshape2_shape[0] != -1) && (x_shape1[0] != reshape2_shape[0])) {
     return;
   }
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
index 0dcf316c33c69..6327929663ab4 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
@@ -150,7 +150,7 @@ class AllReduceDepsPass : public ir::Pass {
       const std::vector& all_reduce_op_handles) const {
     // get vars order
     std::map> vars =
-        GetSoredGradientsFromStaleProgram(graph);
+        GetSortedGradientsFromStaleProgram(graph);
     std::stringstream out;
     size_t grads_of_stale_program = 0;
     out << "Get Order From details::kStaleProgramOpDescs: ";
@@ -188,7 +188,7 @@ class AllReduceDepsPass : public ir::Pass {
     }
   }
 
-  std::map> GetSoredGradientsFromStaleProgram(
+  std::map> GetSortedGradientsFromStaleProgram(
      const ir::Graph& graph) const {
    std::map> vars;
    auto ops =
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
index 82e8dada11556..a4feed4693a62 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
@@ -68,7 +68,7 @@ class BackWardOpDepsPass : public ir::Pass {
       AddDep(graph, opt_handles[i - 1], opt_handles[i]);
     }
 
-    VLOG(10) << "add deps between backward and optimze:";
+    VLOG(10) << "add deps between backward and optimize:";
     AddDep(graph,
            backward_op_handles[backward_op_handles.size() - 1],
            opt_handles[0]);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc
index d08bbc1720de6..95f64e896f77b 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc
@@ -19,14 +19,14 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-class SSAGraghBuilderWithChecker : public ir::Pass {
+class SSAGraphBuilderWithChecker : public ir::Pass {
  protected:
  void ApplyImpl(ir::Graph *graph) const override {
    PADDLE_ENFORCE_EQ(
        IsValidGraph(graph),
        true,
        platform::errors::InvalidArgument(
-            "In SSAGraghBuilderWithChecker, invalid Graph input."));
+            "In SSAGraphBuilderWithChecker, invalid Graph input."));
   }
 
   bool IsValidGraph(const ir::Graph *graph) const {
@@ -99,6 +99,6 @@ class SSAGraghBuilderWithChecker : public ir::Pass {
 }  // namespace paddle
 
 REGISTER_PASS(multi_devices_check_pass,
-              paddle::framework::ir::SSAGraghBuilderWithChecker)
+              paddle::framework::ir::SSAGraphBuilderWithChecker)
     .RequireGraphAttr(paddle::framework::details::kGraphVars)
     .RequireGraphAttr(paddle::framework::details::kGraphDepVars);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc
index 97454b7583de2..6005c7de5c551 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc
@@ -19,7 +19,7 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-class SSAGraghBuilderWithPrinterPass : public ir::Pass {
+class SSAGraphBuilderWithPrinterPass : public ir::Pass {
  protected:
  void ApplyImpl(ir::Graph *graph) const override {
    std::unique_ptr<std::ostream> fout(
@@ -102,5 +102,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph,
 }  // namespace paddle
 
 REGISTER_PASS(multi_devices_print_pass,
-              paddle::framework::ir::SSAGraghBuilderWithPrinterPass)
+              paddle::framework::ir::SSAGraphBuilderWithPrinterPass)
     .RequirePassAttr(paddle::framework::ir::kGraphvizPath);
diff --git a/paddle/fluid/framework/ir/xpu/add_layernorm_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/add_layernorm_xpu_fuse_pass.cc
index 5db915c4e17f5..55e8786f73c4d 100644
--- a/paddle/fluid/framework/ir/xpu/add_layernorm_xpu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/xpu/add_layernorm_xpu_fuse_pass.cc
@@ -59,7 +59,7 @@ After the pass is applied:
                scale---- add_layernorm_fusion ---- bias
                        /          |       \    \
                       /           |        \    \
-                variance          |       meam   z_add
+                variance          |       mean   z_add
                                 Output
 */
 
diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_int8_xpu_quant_pass.cc b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_int8_xpu_quant_pass.cc
index 4f9af98495c37..7c5b7c9e5e4e7 100755
--- a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_int8_xpu_quant_pass.cc
+++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_int8_xpu_quant_pass.cc
@@ -525,7 +525,7 @@ int FusedMultiTransformerInt8XPUQuantPass::FusedMultiTransformerInt8(
      id++;
    }
  };
-  // genereate input node
+  // generate input node
  attr2weight(
      "qkv_in_scale", &(input_max_nodes_vec[0]), &(input_max_names_vec[0]));
  attr2weight("out_linear_in_scale",
diff --git a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_adaptive_seqlen_fuse_pass.h b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_adaptive_seqlen_fuse_pass.h
index a21d6498dea8e..22910c2120530 100644
--- a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_adaptive_seqlen_fuse_pass.h
+++ b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_adaptive_seqlen_fuse_pass.h
@@ -40,7 +40,7 @@ class MultiEncoderXPUAdaptiveSeqlenFusePass : public FusePassBase {
   /*
   adaptive seqlen V1, before:
 
-      inpu_var*    mask_var*
+      input_var*   mask_var*
         |             |
         |             |
   embedding_xpu     matmul
@@ -59,7 +59,7 @@ class MultiEncoderXPUAdaptiveSeqlenFusePass : public FusePassBase {
   after:
 
-      inpu_var*    mask_var*
+      input_var*   mask_var*
          \           /
          \          /
        embedding_xpu
 
@@ -81,7 +81,7 @@ class MultiEncoderXPUAdaptiveSeqlenFusePass : public FusePassBase {
   /*
   adaptive seqlen V2, before:
 
-      inpu_var*    mask_var*
+      input_var*   mask_var*
        |              |
        |              |
   embedding_xpu     not_equal
@@ -115,7 +115,7 @@ class MultiEncoderXPUAdaptiveSeqlenFusePass : public FusePassBase {
   after:
 
-      inpu_var*    mask_var*
+      input_var*   mask_var*
          \           /
          \          /
        embedding_xpu

From 67bd48f9724735eaadddc71d8dd934a7972de470 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Mon, 26 Feb 2024 10:36:31 +0800
Subject: [PATCH 58/82] Fix some typos(ingest_externel_file, etc) (#62040)

---
 .../auto_parallel/spmd_rules/replicated_spmd_rule.cc        | 2 +-
 paddle/fluid/distributed/ps/service/server.h                | 2 +-
 paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h | 2 +-
 paddle/fluid/distributed/ps/table/ssd_sparse_table.cc       | 4 ++--
 paddle/fluid/distributed/ps/wrapper/fleet.cc                | 4 ++--
 paddle/fluid/distributed/ps/wrapper/fleet.h                 | 2 +-
 paddle/fluid/distributed/test/graph_node_test.cc            | 2 +-
 7 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.cc
index b0ffb47c99234..5227a82a4b8b5 100644
--- a/paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.cc
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.cc
@@ -30,7 +30,7 @@ ReplicatedSPMDRule::InferForward(const std::vector& input_specs,
   }
 
   // TODO(ljz): we need to know num of output and size of each output before
-  // generate the excat replicated dist tensor attr for the current op.
+  // generate the exact replicated dist tensor attr for the current op.
   // here we just assume that only one output tensor and has the same size as
   // the first input tensor.
   return {intput_dist_attrs, {ReplicatedOnMesh(input_specs[0].dist_attr())}};
diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h
index 5a0764b11e8a1..bae9ab652ff74 100644
--- a/paddle/fluid/distributed/ps/service/server.h
+++ b/paddle/fluid/distributed/ps/service/server.h
@@ -194,7 +194,7 @@ class PsBaseService : public PsService {
                         const char *err_msg) {
     response.set_err_msg(err_msg);
     response.set_err_code(err_code);
-    LOG(WARNING) << "Resonse err_code:" << err_code << " msg:" << err_msg;
+    LOG(WARNING) << "Response err_code:" << err_code << " msg:" << err_msg;
   }
 
   virtual int32_t Initialize() = 0;
diff --git a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
index 873644f8ca416..b090ef778d2ac 100644
--- a/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
+++ b/paddle/fluid/distributed/ps/table/depends/rocksdb_warpper.h
@@ -237,7 +237,7 @@ class RocksDBHandler {
   Uint64Comparator* get_comparator() { return &_comparator; }
 
-  int ingest_externel_file(int id,
+  int ingest_external_file(int id,
                            const std::vector<std::string>& sst_filelist) {
     rocksdb::IngestExternalFileOptions ifo;
     ifo.move_files = true;
 
diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
index 13577bde3e145..d72b4ee1c3d3f 100644
--- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
+++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
@@ -2896,7 +2896,7 @@ int32_t SSDSparseTable::LoadWithBinary(const std::string& path, int param) {
      auto sst_filelist = _afs_client.list(::paddle::string::format_string(
          "%s_%d/part-*", FLAGS_rocksdb_path.c_str(), shard_idx));
      if (!sst_filelist.empty()) {
-        int ret = _db->ingest_externel_file(shard_idx, sst_filelist);
+        int ret = _db->ingest_external_file(shard_idx, sst_filelist);
        if (ret) {
          VLOG(0) << "ingest file failed";
          abort();
@@ -3038,7 +3038,7 @@ int32_t SSDSparseTable::CacheTable(uint16_t pass_id) {
    }
    VLOG(0) << "write sst_file shard " << shard_id << ": "
            << butil::gettimeofday_ms() - show_begin << " ms";
-    int ret = _db->ingest_externel_file(shard_id, {filename});
+    int ret = _db->ingest_external_file(shard_id, {filename});
    if (ret) {
      VLOG(0) << "ingest file failed"
              << ", " << status.getState();
diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc
index ac404f5d9e70c..44043bc65501c 100644
--- a/paddle/fluid/distributed/ps/wrapper/fleet.cc
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc
@@ -918,8 +918,8 @@ std::default_random_engine& FleetWrapper::LocalRandomEngine() {
      clock_gettime(CLOCK_REALTIME, &tp);
      double cur_time = tp.tv_sec + tp.tv_nsec * 1e-9;
      static std::atomic<uint64_t> x(0);
-      std::seed_seq sseq = {x++, x++, x++, (uint64_t)(cur_time * 1000)};
-      engine.seed(sseq);
+      std::seed_seq s_seq = {x++, x++, x++, (uint64_t)(cur_time * 1000)};
+      engine.seed(s_seq);
    }
  };
  thread_local engine_wrapper_t r;
diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.h b/paddle/fluid/distributed/ps/wrapper/fleet.h
index 208e94ec12102..95504ede00fad 100644
--- a/paddle/fluid/distributed/ps/wrapper/fleet.h
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.h
@@ -288,7 +288,7 @@ class FleetWrapper {
 
  std::string GetDistDesc() const {
    CHECK(is_initialized_ == true)
-        << "fleetwrapper should be initialized first!!!";
+        << "FleetWrapper should be initialized first!!!";
    return dist_desc_;
  }
 
diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc
index 26207a9ad8c9e..8c29c2bf1df3f 100644
--- a/paddle/fluid/distributed/test/graph_node_test.cc
+++ b/paddle/fluid/distributed/test/graph_node_test.cc
@@ -416,7 +416,7 @@ void RunBrpcPushSparse() {
  // auto ph_host2 = paddle::distributed::PSHost(ip2, port2, 1);
  // host_sign_list_.push_back(ph_host2.SerializeToString());
  // // test-end
-  // // Srart Server
+  // // Start Server
  // std::thread* server_thread = new std::thread(RunServer);
  // std::thread* server_thread2 = new std::thread(RunServer2);
  // sleep(1);

From 59980808241e7432de2ba6dd43444908bce37208 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Mon, 26 Feb 2024 10:39:21 +0800
Subject: [PATCH 59/82] Fix typos(oprations, etc) (#62043)

---
 paddle/fluid/framework/ir/fusion_group/operation.cc   |  2 +-
 paddle/fluid/framework/ir/fusion_group/operation.h    |  2 +-
 paddle/fluid/framework/ir/fusion_group/subgraph.h     |  2 +-
 .../fluid/framework/ir/ipu/optimizer_extract_pass.cc  | 10 +++++-----
 .../buffer_shared_cross_op_memory_reuse_pass.cc       |  4 ++--
 .../memory_optimization_var_info.h                    |  2 +-
 .../share_varinfo_into_cinn_pass.cc                   |  2 +-
 .../while_op_eager_deletion_pass.cc                   |  2 +-
 .../ir/mkldnn/cpu_quantize_squash_pass_tester.cc      |  4 ++--
 9 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc
index 908aa6d62b6f7..75b0d8d631f8a 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.cc
+++ b/paddle/fluid/framework/ir/fusion_group/operation.cc
@@ -152,7 +152,7 @@ void OperationMap::InsertUnaryElementwiseOperations() {
 }
 
 void OperationMap::InsertBinaryElementwiseOperations() {
-  // For binary elementwise oprations:
+  // For binary elementwise operations:
   //  ${0} - x
   //  ${1} - y
   //  ${2} - out
diff --git a/paddle/fluid/framework/ir/fusion_group/operation.h b/paddle/fluid/framework/ir/fusion_group/operation.h
index 3edf2f598525a..dd939cd3cbbf1 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.h
+++ b/paddle/fluid/framework/ir/fusion_group/operation.h
@@ -55,7 +55,7 @@ struct Operation {
       return false;
     }
     if (IsGradOp() && exprs.size() != static_cast<size_t>(num_operands)) {
-      // When it is a backward opertion, it should hold a expression for each
+      // When it is a backward operation, it should hold a expression for each
       // operand.
       return false;
     }
diff --git a/paddle/fluid/framework/ir/fusion_group/subgraph.h b/paddle/fluid/framework/ir/fusion_group/subgraph.h
index 057fc7efffb30..97caa43249002 100644
--- a/paddle/fluid/framework/ir/fusion_group/subgraph.h
+++ b/paddle/fluid/framework/ir/fusion_group/subgraph.h
@@ -150,7 +150,7 @@ class SubGraph {
           !IsInputOfExternalOp(n)) {
         // When the outputs size is 0, it is also considered a intermidiate
         // output. It maybe an unused output or the fetching vars, so that we
-        // cannot eleiminate it directly here.
+        // cannot eliminate it directly here.
         intermediate_out_vars.push_back(n);
       }
     }
diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
index 284d144bf7534..d09519dfa5b04 100644
--- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
+++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
@@ -100,8 +100,8 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
      // bool is_optimizer = is_optimizer_op(op_namescope);
      bool is_regularization = is_regularization_op(op_namescope);
 
-      VLOG(10) << "found optimizer releated op: " << op_type;
-      // initial larning_rate will be set in ipu_backend
+      VLOG(10) << "found optimizer related op: " << op_type;
+      // initial learning_rate will be set in ipu_backend
      set_ops.insert(op_type);
      if (op_type == "sgd") {
        auto type = std::string{"sgd"};
@@ -267,10 +267,10 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
        auto value = PADDLE_GET_CONST(float, op->GetAttr("value"));
        new_op.SetAttr("clip_norm", value);
      } else if (ignored_ops.count(op_type)) {
-        VLOG(10) << "Ignore optimizer releated op: " << op_type;
+        VLOG(10) << "Ignore optimizer related op: " << op_type;
      } else {
        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Unknown optimizer releated op_type: %s", op_type));
+            "Unknown optimizer related op_type: %s", op_type));
      }
    } else if (op_role == OpRole::kLoss) {
      VLOG(10) << "found loss op type: " << op->Type();
@@ -312,7 +312,7 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
        new_op.SetAttr("weight_decay_mode", std::string{"l2_regularization"});
      }
    } else {
-      VLOG(10) << "No weight deacy setting found";
+      VLOG(10) << "No weight decay setting found";
    }
 
    // setup grad clip
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
index 2656580228049..b41b76c99aff6 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
@@ -245,7 +245,7 @@ void BufferSharedCrossOpMemoryReusePass::RunOnScopeIdx(size_t idx) const {
      }
    }
 
-    // After all output args have been transversed, we should check whether
+    // After all output args have been traversed, we should check whether
    // there is new unlived var after `op` runs.
    for (auto op_iter = var_to_ops.begin(); op_iter != var_to_ops.end();) {
      // erase op from `var_to_ops` first
@@ -355,7 +355,7 @@ void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const {
  // BFS to fill `preceding_ops`
  graph_view.BreadthFirstVisit([&](OpHandleBase *cur_op) {
    // All preceding ops of cur_op should be:
-    //  - preceding ops of cur_op, that is connected to cur_op directely
+    //  - preceding ops of cur_op, that is connected to cur_op directly
    //  - all preceding ops of `direct preceding ops of cur_op`
    auto &all_preceding_ops_of_cur_op = preceding_ops[cur_op];
    for (auto &preceding_op : graph_view.PrecedingOps(cur_op)) {
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
index 2980fa4e34a81..38238d8c7c307 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
@@ -89,7 +89,7 @@ class MemOptVarInfo {
   * scheduled in many threads inside ParallelExecutor, runtime_ref_cnt_
   * must be an atomic integer to guarantee the thread safety and visibility.
   *
-   * Speciallly, if ref_cnt_ is 1, we do not need to reset runtime_ref_cnt_
+   * Specially, if ref_cnt_ is 1, we do not need to reset runtime_ref_cnt_
   * after iteration ends.
   */
  size_t ref_cnt_;
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc
index 2bc3d839af549..d9ea00e3935cc 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc
@@ -111,7 +111,7 @@ static void TakeVarInfoFromMainGraph(
 }
 
 // This pass will be applied on both the main graph and all cinn subgraphs,
-// and it distinguishs them according to whether the graph has the
+// and it distinguishes them according to whether the graph has the
 // kMemOptVarInfoFromMainGraph attribute or not.
 // On the main graph, it finds all cinn_launch ops and shares MemOptVarInfos
 // to their subgraphs.
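[The MemOptVarInfo hunk above compresses a subtle design point: a static reference count plus an atomic runtime counter that consumer ops decrement concurrently and that is reset between iterations, except when the count is 1. A minimal standalone C++ sketch of that idea follows; the names are hypothetical illustrations, not Paddle's actual MemOptVarInfo API.]

#include <atomic>
#include <cstddef>

// Hypothetical sketch of the two-counter scheme; not the real class.
class VarRefCount {
 public:
  explicit VarRefCount(size_t ref_cnt)
      : ref_cnt_(ref_cnt), runtime_ref_cnt_(ref_cnt) {}

  // Called concurrently by ops that consume the variable; returns true on
  // the last use, at which point the buffer may be reused.
  bool DecreaseRefCnt() {
    if (ref_cnt_ == 1) return true;  // single consumer: trivially the last use
    return runtime_ref_cnt_.fetch_sub(1, std::memory_order_acq_rel) == 1;
  }

  // Called after each iteration. Mirrors the source comment: when ref_cnt_
  // is 1 the runtime counter is never decremented, so no reset is needed.
  void ResetRuntimeRefCnt() {
    if (ref_cnt_ != 1) {
      runtime_ref_cnt_.store(ref_cnt_, std::memory_order_relaxed);
    }
  }

 private:
  const size_t ref_cnt_;                 // static count of consumer ops
  std::atomic<size_t> runtime_ref_cnt_;  // concurrent runtime countdown
};

[The atomic covers the cross-thread decrements the comment calls out; short-circuiting the ref_cnt_ == 1 case is what makes skipping the reset safe.]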
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc
index 42f395da7c8a8..2d26587fdb24f 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc
@@ -55,7 +55,7 @@ class WhileOpEagerDeletionPass : public ir::Pass {
      }
    }
    if (graph->IsConstructedByPartialProgram()) {
-      VLOG(4) << "Is Paritial Program";
+      VLOG(4) << "Is Partial Program";
      PADDLE_ENFORCE_LE(
          target_ops.size(),
          1,
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
index 90ed3009749ad..d2c6d981c3a2e 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
@@ -1021,7 +1021,7 @@ TEST(CpuQuantizeSquashPass, fc_dequant_more_than_one_op_after_dequant) {
 // b->Quantize1(Scale)->c
 // c->Fc1
 // c->Fc2
-TEST(CpuQuantizeSquashPass, quatize_with_same_scale) {
+TEST(CpuQuantizeSquashPass, quantize_with_same_scale) {
   auto first_scale = 1.2345f;
   auto second_scale = 1.2345f;
   auto use_mkldnn = true;
@@ -1033,7 +1033,7 @@ TEST(CpuQuantizeSquashPass, quatize_with_same_scale) {
 }
 
 // if scales are not the same, do not fuse
-TEST(CpuQuantizeSquashPass, quatize_with_different_scale) {
+TEST(CpuQuantizeSquashPass, quantize_with_different_scale) {
   auto first_scale = 1.2345f;
   auto second_scale = 1.5432f;
   auto use_mkldnn = true;

From 1e8a77cfa5452a92491e68052032c87104a07135 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Mon, 26 Feb 2024 10:40:12 +0800
Subject: [PATCH 60/82] Fix some typos(faied, etc) (#62042)

---
 paddle/fluid/framework/fleet/gloo_wrapper.cc        |  2 +-
 paddle/fluid/framework/fleet/gloo_wrapper.h         |  2 +-
 .../fluid/framework/fleet/heter_ps/feature_value.h  |  6 +++---
 .../framework/fleet/heter_ps/gpu_graph_utils.h      |  2 +-
 paddle/fluid/framework/fleet/heter_ps/heter_comm.h  |  2 +-
 paddle/fluid/framework/io/crypto/aes_cipher.cc      | 13 -------------
 paddle/fluid/framework/io/crypto/aes_cipher.h       | 13 -------------
 .../framework/ir/fusion_group/code_generator.cc     |  2 +-
 .../framework/ir/fusion_group/fusion_group_pass.cc  |  2 +-
 9 files changed, 9 insertions(+), 35 deletions(-)

diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc
index 140de78bf01dc..277004b6dc164 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -141,7 +141,7 @@ std::vector<char> HdfsStore::get(const std::string& key) {
  PADDLE_ENFORCE_EQ(read_status,
                    0,
                    paddle::platform::errors::Fatal(
-                        "HdfsStore::get, path read faied: " + path));
+                        "HdfsStore::get, path read failed: " + path));
 #endif
  return result;
 }
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h
index b1f236fb60548..fa352fb8eb99f 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.h
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.h
@@ -95,7 +95,7 @@ class ParallelConnectContext : public gloo::rendezvous::Context {
      : gloo::rendezvous::Context(rank, size, base) {}
  virtual ~ParallelConnectContext() {}
  // in gloo::rendezvous::Context wait&get one by one,
-  // slowly in case big size, especialy in HdfsStore
+  // slowly in case big size, especially in HdfsStore
  void connectFullMesh(Store& store,                // NOLINT
                       std::shared_ptr& dev);  // NOLINT
  struct Impl {
diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h
index 7cdb20bb978bc..0ab35f4a2beca 100644
--- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h
+++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h
@@ -75,7 +75,7 @@ class CommonFeatureValueAccessor {
    __host__ __device__ int EmbedDim() const { return embed_sgd_dim; }
    __host__ __device__ int EmbedXDim() const { return embedx_sgd_dim; }
    __host__ __device__ int EmbedWDim() const { return embedx_dim; }
-    __host__ __device__ int CpuPtrIndex() const { return 0; }  // cpuprt uint64
+    __host__ __device__ int CpuPtrIndex() const { return 0; }  // cpu_ptr uint64
    __host__ __device__ int DeltaScoreIndex() const {
      return CpuPtrIndex() + 2;
    }
@@ -686,7 +686,7 @@ class CommonFeatureValueAccessor {
        std::vector<float> embedx_w;
    */
    std::stringstream os;
-    os << "cpuptr: " << common_feature_value.CpuPtr(const_cast<float*>(v))
+    os << "cpu_ptr: " << common_feature_value.CpuPtr(const_cast<float*>(v))
       << " delta_score: " << v[2] << " show: " << v[3] << " click: " << v[4]
       << " embed_w:" << v[5] << " embed_g2sum:";
    for (int i = common_feature_value.EmbedG2SumIndex();
@@ -732,7 +732,7 @@ struct FeatureValue {
  friend std::ostream& operator<<(std::ostream& out, FeatureValue& val) {
    out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot
        << " lr: " << val.lr << " mf_dim: " << val.mf_dim
-        << "cpuptr: " << val.cpu_ptr << " mf_size: " << val.mf_size << " mf:";
+        << "cpu_ptr: " << val.cpu_ptr << " mf_size: " << val.mf_size << " mf:";
    for (int i = 0; i < val.mf_dim + 1; ++i) {
      out << " " << val.mf[i];
    }
diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h
index 7523b349f1190..ed0c23a0fa8dc 100644
--- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h
+++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h
@@ -96,7 +96,7 @@ inline void debug_gpu_memory_info(int gpu_id, const char* desc) {
      err,
      cudaSuccess,
      platform::errors::InvalidArgument("cudaMemGetInfo failed!"));
-  VLOG(0) << "updatex gpu memory on device " << gpu_id << ", "
+  VLOG(0) << "update gpu memory on device " << gpu_id << ", "
          << "avail=" << avail / 1024.0 / 1024.0 / 1024.0 << "g, "
          << "total=" << total / 1024.0 / 1024.0 / 1024.0 << "g, "
          << "use_rate=" << (total - avail) / static_cast<double>(total)
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
index 18e3966b220c0..1ccca57cd2979 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h
@@ -278,7 +278,7 @@ class HeterComm {
        h_push_fea_sizes.resize(node_size * node_size);
      }
    };
-  // pull parition shard key by devices
+  // pull partition shard key by devices
  struct PullResource {
    size_t h_recv_fea_num = 0;
    uint32_t* d_restore_keys_idx = nullptr;
diff --git a/paddle/fluid/framework/io/crypto/aes_cipher.cc b/paddle/fluid/framework/io/crypto/aes_cipher.cc
index 5a1258fa432f7..8802dc1b12158 100644
--- a/paddle/fluid/framework/io/crypto/aes_cipher.cc
+++ b/paddle/fluid/framework/io/crypto/aes_cipher.cc
@@ -12,19 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
 #include "paddle/fluid/framework/io/crypto/aes_cipher.h"
 
 #include
diff --git a/paddle/fluid/framework/io/crypto/aes_cipher.h b/paddle/fluid/framework/io/crypto/aes_cipher.h
index a60fe1de45d7c..5dfdbd49f4f96 100644
--- a/paddle/fluid/framework/io/crypto/aes_cipher.h
+++ b/paddle/fluid/framework/io/crypto/aes_cipher.h
@@ -12,19 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
 #pragma once
 
 #include
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
index f18b5fd668659..e59c495f2dd8d 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
@@ -366,7 +366,7 @@ std::unordered_map CodeGenerator::EncodeVarNodes(
 
  // Encoding output vars.
  for (auto* out : output_var_nodes) {
-    VLOG(3) << "Ecoding output names:" << out->Name() << "(" << out
+    VLOG(3) << "Encoding output names:" << out->Name() << "(" << out
            << "), id:" << id;
    if (var_ids.find(out) == var_ids.end()) {
      var_ids[out] = id++;
diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
index 1f8f3b63598ce..4eba0a73ae787 100644
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
@@ -37,7 +37,7 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
  //   if (!phi::GPUDeviceCode::IsAvailable()) {
  //     LOG(WARNING)
  //         << "Disable fusion_group because CUDA Driver or NVRTC is not
-  //         avaiable.";
+  //         available.";
  //     return 0;
  //   }
 

From 96fa4dcfc13726397b83b63902241bee9f76a22d Mon Sep 17 00:00:00 2001
From: co63oc
Date: Mon, 26 Feb 2024 10:41:36 +0800
Subject: [PATCH 61/82] Fix some typos(supoort, etc) (#62041)

---
 .../generator/codegen_utils.py      |  4 +--
 .../generator/eager_gen.py          | 10 +++---
 .../generator/python_c_gen.py       | 36 ++++++++++---------
 paddle/fluid/eager/grad_node_info.h |  6 ++--
 4 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py
index fd19005cec39a..c13fb1cb4848c 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py
@@ -118,7 +118,7 @@ def ReadFwdFile(filepath):
         # empty file loaded by yaml is None
         contents = yaml.load(f, Loader=yaml.FullLoader)
         f.close()
-        # not all fused ops supoort dygraph
+        # not all fused ops support dygraph
         if filepath.endswith("fused_ops.yaml") is True:
             new_apis = [
                 api
@@ -134,7 +134,7 @@ def ReadBwdFile(filepath, bw_ops=None):
     f = open(filepath, 'r')
     if bw_ops is None:
         contents = yaml.load(f, Loader=yaml.FullLoader)
-        # not all fused ops supoort dygraph
+        # not all fused ops support dygraph
         if filepath.endswith("fused_backward.yaml") is True:
             new_apis = [
                 api
diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
index dad46949d70ea..62327c5aa8785 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -382,7 +382,7 @@ class {} : public egr::GradNodeBase {{
 }}
 """
 
-HIHGER_ORDER_DERIVATIVE_VALUE_TEMPLATE = """  if(trace_backward) {{
+HIGHER_ORDER_DERIVATIVE_VALUE_TEMPLATE = """  if(trace_backward) {{
 {}
   // Node Construction
 {}
@@ -1254,7 +1254,7 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False):
             )
         else:
             self.node_creation_str = (
-                HIHGER_ORDER_DERIVATIVE_VALUE_TEMPLATE.format(
+                HIGHER_ORDER_DERIVATIVE_VALUE_TEMPLATE.format(
                     node_creation_event_str,
                     node_construction_str,
                     set_attributes_str,
@@ -2266,7 +2266,7 @@ def GenerateNodeDefinition(
         backward_attrs_list = self.backward_attrs_list
         backward_inplace_map = self.backward_inplace_map
         indent = GetIndent(1)
-        need_gen_trace_backard_for_inplace = False
+        need_gen_trace_backward_for_inplace = False
 
         # Construct grad_api function args
         # Order: TensorWrappers, GradTensors, Attributes
@@ -2519,7 +2519,7 @@ def GenerateNodeDefinition(
   }} else {{
     {inplace_str}
   }}"""
-                    need_gen_trace_backard_for_inplace = True
+                    need_gen_trace_backward_for_inplace = True
                 else:
                     inplace_for_grad_outs_str += inplace_str
 
@@ -2623,7 +2623,7 @@ def GenerateNodeDefinition(
        if (
            len(next_grad_node_creation_str) > 0
            or is_invoke_forward_api
-            or need_gen_trace_backard_for_inplace
+            or need_gen_trace_backward_for_inplace
        ):
            compute_require_next_grad_str = f"{indent}bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;\n"
 
diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py
index daf16f446ab12..777eea1221429 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py
@@ -325,7 +325,7 @@ def __init__(self, forward_api_contents, namespace):
         # Generated Results
         self.python_c_function_str = ""
         self.python_c_function_reg_str = ""
-        self.python_c_funcion_declare_str = ""
+        self.python_c_function_declare_str = ""
 
     def CollectIsForwardOnly(self):
         forward_api_contents = self.forward_api_contents
@@ -515,7 +515,7 @@ def GeneratePythonCFunction(self):
             dygraph_function_call_str,
         )
 
-        # Generate Python-C Function Definetion
+        # Generate Python-C Function Definition
         self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
             forward_api_name,
             pythonc_record_event_str,
@@ -526,7 +526,7 @@ def GeneratePythonCFunction(self):
             noamp_dygraph_function_str,
             return_str,
         )
-        self.python_c_funcion_declare_str = (
+        self.python_c_function_declare_str = (
             PYTHON_C_FUNCTION_DECLARE_TEMPLATE.format(name=forward_api_name)
         )
 
@@ -572,7 +572,7 @@ def GeneratePythonCFunction(self):
                 "    return ToPyObject(out, args, inplace_var_idx_map);"
             )
 
-            # Generate Python-C Function Definetion
+            # Generate Python-C Function Definition
             python_c_inplace_func_str = PYTHON_C_FUNCTION_TEMPLATE.format(
                 inplaced_forward_api_name,
                 pythonc_record_event_str,
@@ -584,7 +584,7 @@ def GeneratePythonCFunction(self):
                 return_str,
             )
 
-            python_c_funcion_declare_str = (
+            python_c_function_declare_str = (
                 PYTHON_C_FUNCTION_DECLARE_TEMPLATE.format(
                     name=inplaced_forward_api_name
                 )
@@ -603,13 +603,15 @@ def GeneratePythonCFunction(self):
             # self.forward_api_name ending with '_' means it only has inplace api
             if self.forward_api_name[-1] == '_':
                 self.python_c_function_str = python_c_inplace_func_str
-                self.python_c_funcion_declare_str = python_c_funcion_declare_str
+                self.python_c_function_declare_str = (
+                    python_c_function_declare_str
+                )
                 # Generate Python-C Function Registration
                 self.python_c_function_reg_str = python_c_inplace_func_reg_str
             else:
                 self.python_c_function_str += python_c_inplace_func_str
-                self.python_c_funcion_declare_str += (
-                    python_c_funcion_declare_str
+                self.python_c_function_declare_str += (
+                    python_c_function_declare_str
                 )
                 # Generate Python-C Function Registration
                 self.python_c_function_reg_str += python_c_inplace_func_reg_str
@@ -652,7 +654,7 @@ def __init__(self, path):
         # Generated Result
         self.python_c_functions_str = ""
         self.python_c_functions_reg_str = ""
-        self.python_c_funcion_declare_str = ""
+        self.python_c_function_declare_str = ""
 
     def GeneratePythonCFunctions(self):
         namespace = self.namespace
@@ -671,8 +673,8 @@ def GeneratePythonCFunctions(self):
                 self.python_c_functions_reg_str += (
                     f_generator.python_c_function_reg_str
                 )
-                self.python_c_funcion_declare_str += (
-                    f_generator.python_c_funcion_declare_str
+                self.python_c_function_declare_str += (
+                    f_generator.python_c_function_declare_str
                 )
 
    def AttachNamespace(self):
@@ -685,9 +687,9 @@ def AttachNamespace(self):
            self.python_c_functions_str = NAMESPACE_WRAPPER_TEMPLATE.format(
                namespace, python_c_functions_str
            )
-            self.python_c_funcion_declare_str = (
+            self.python_c_function_declare_str = (
                NAMESPACE_WRAPPER_TEMPLATE.format(
-                    namespace, self.python_c_funcion_declare_str
+                    namespace, self.python_c_function_declare_str
                )
            )
 
@@ -766,20 +768,20 @@ def GeneratePythonCFile(filepath, python_c_str):
            py_c_generator.python_c_functions_reg_str
        )
        generated_python_c_functions_header += (
-            py_c_generator.python_c_funcion_declare_str
+            py_c_generator.python_c_function_declare_str
        )
 
    python_c_str = GeneratePythonCWrappers(
        generated_python_c_functions, generated_python_c_registration
    )
 
-    soucre_path = args.source_path
+    source_path = args.source_path
    header_path = args.header_path
-    for path in [soucre_path, header_path]:
+    for path in [source_path, header_path]:
        if os.path.exists(path):
            os.remove(path)
 
-    GeneratePythonCFile(soucre_path, python_c_str)
+    GeneratePythonCFile(source_path, python_c_str)
    GeneratePythonCFile(
        header_path,
        PYTHON_C_H_TEMPLATE.format(body=generated_python_c_functions_header),
diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h
index 248b4b88af4c0..7b5e36f4d5cdc 100644
--- a/paddle/fluid/eager/grad_node_info.h
+++ b/paddle/fluid/eager/grad_node_info.h
@@ -36,8 +36,8 @@ namespace egr {
 * TODO(yangzhanlue): GradNodeBase will also in charge of get the correct input
 * from GradOpDescMaker to GradNodeBase.
 *
- * NOTE: GradNodeBase has a method named run, this method should be overrided by
- * the specific derived class, it will prepare backward inputs and double
+ * NOTE: GradNodeBase has a method named run, this method should be overridden
+ *by the specific derived class, it will prepare backward inputs and double
 * backward's depends. Then, it will call C++ API of backward kernel functions
 * to finish backward computation.
 *
@@ -203,7 +203,7 @@ class GradNodeBase {
 
  /**
   * operator() designed to contain the real backward execution logic, it should
-   * be overrided by derived class defined for each operator. It accepts a
+   * be overridden by derived class defined for each operator. It accepts a
   * vector of Tensor which contains grads input of current operator
   *
   * Note: why we need backward inputs and outputs construct as vector of vector
 
From 63753f6d9255810252f9205d74c17068c9d052c1 Mon Sep 17 00:00:00 2001
From: co63oc
Date: Mon, 26 Feb 2024 10:42:37 +0800
Subject: [PATCH 62/82] Fix some typos(retring, etc) (#62039)

---
 paddle/fluid/distributed/fleet_executor/message_bus.cc  |  2 +-
 paddle/fluid/distributed/ps/service/heter_client.cc     |  2 +-
 paddle/fluid/distributed/ps/service/heter_server.cc     | 10 +++++-----
 paddle/fluid/distributed/ps/service/heter_server.h      |  8 ++++----
 .../fluid/operators/pscore/heter_listen_and_serv_op.cc  |  2 +-
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc
index b5786e2393393..1860a8f3bf110 100644
--- a/paddle/fluid/distributed/fleet_executor/message_bus.cc
+++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc
@@ -193,7 +193,7 @@ void MessageBus::ListenPort() {
  int interval = 100;
  while (server_.Start(ip_for_brpc, &options) != 0) {
    ++retry_times;
-    LOG(INFO) << "Message bus is retring for starting brpc for " << retry_times
+    LOG(INFO) << "Message bus is retrying for starting brpc for " << retry_times
              << " times. And will retry after " << interval / 1000
              << " seconds.";
    std::this_thread::sleep_for(std::chrono::milliseconds(interval));
diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc
index e0744f96f91fa..8bdb749b0ecea 100644
--- a/paddle/fluid/distributed/ps/service/heter_client.cc
+++ b/paddle/fluid/distributed/ps/service/heter_client.cc
@@ -32,7 +32,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
  PADDLE_ENFORCE_EQ(var->IsType<phi::DenseTensor>(),
                    true,
                    platform::errors::InvalidArgument(
-                        "the type of micro id shoulde be LoDTensor."));
+                        "the type of micro id should be LoDTensor."));
  auto micro_id = -1;
  auto* tensor = var->GetMutable<phi::DenseTensor>();
  if (platform::is_cpu_place(tensor->place())) {
diff --git a/paddle/fluid/distributed/ps/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc
index eb4d9b8304513..26dd4e6052c9b 100644
--- a/paddle/fluid/distributed/ps/service/heter_server.cc
+++ b/paddle/fluid/distributed/ps/service/heter_server.cc
@@ -28,10 +28,10 @@ void HeterServer::RegisterServiceHandler(std::string message_name,
  service_.RegisterServiceHandler(message_name, func);
 }
 
-void HeterServer::StartHeterService(bool neeed_encrypt) {
+void HeterServer::StartHeterService(bool need_encrypt) {
  server_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE);
  brpc::ServerOptions options;
-  if (neeed_encrypt) {
+  if (need_encrypt) {
    options.mutable_ssl_options()->default_cert.certificate = "/cert.pem";
    options.mutable_ssl_options()->default_cert.private_key = "/key.pem";
  }
@@ -63,10 +63,10 @@ void HeterServer::StartHeterService(bool neeed_encrypt) {
  VLOG(4) << "start service done";
 }
 
-void HeterServer::StartHeterInterService(bool neeed_encrypt) {
+void HeterServer::StartHeterInterService(bool need_encrypt) {
  server_inter_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE);
  brpc::ServerOptions options;
-  if (neeed_encrypt) {
+  if (need_encrypt) {
    options.mutable_ssl_options()->default_cert.certificate = "/cert.pem";
    options.mutable_ssl_options()->default_cert.private_key = "/key.pem";
  }
@@ -99,7 +99,7 @@ void HeterServer::StartHeterInterService(bool neeed_encrypt) {
  VLOG(4) << "start service done";
 }
 
-void HeterServer::SetFanin(const int& fan_in) { service_.SetFanin(fan_in); }
+void HeterServer::SetFanIn(const int& fan_in) { service_.SetFanIn(fan_in); }
 
 void HeterServer::WaitServerReady() {
  std::unique_lock<std::mutex> lock(this->mutex_ready_);
diff --git a/paddle/fluid/distributed/ps/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h
index f556322eb7479..61531749e95a0 100644
--- a/paddle/fluid/distributed/ps/service/heter_server.h
+++ b/paddle/fluid/distributed/ps/service/heter_server.h
@@ -162,7 +162,7 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase {
      /*
      timeline_.Pause();
      if (timeline_.ElapsedSec() > FLAGS_switch_send_recv_timeout_s) {
-        VLOG(0) << "vars not consumed exceed 10 miniutes";
+        VLOG(0) << "vars not consumed exceed 10 minutes";
        break;
      }
      */
@@ -182,7 +182,7 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase {
      /*
      timeline_.Pause();
      if (timeline_.ElapsedSec() > FLAGS_switch_send_recv_timeout_s) {
-        VLOG(0) << "vars not produced exceed 10 miniutes";
+        VLOG(0) << "vars not produced exceed 10 minutes";
        break;
      }
      */
@@ -524,7 +524,7 @@ class HeterService : public PsService {
    peer_endpoints_ = peer_endpoints;
  }
 
-  void SetFanin(const int& fan_in) { fan_in_ = fan_in; }
+  void SetFanIn(const int& fan_in) { fan_in_ = fan_in; }
 
  void ForceExit() {
    VLOG(3) << "heter service force exit";
@@ -626,7 +626,7 @@ class HeterServer {
    service_.SetPeerEndPoints(peer_endpoints);
  }
 
-  void SetFanin(const int& fan_in);
+  void SetFanIn(const int& fan_in);
 
  void SetServiceHandler(
      std::shared_ptr<SendAndRecvVariableHandler> request_handler) {
diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
index 5dafe26464d3d..978981a6fcdf3 100644
--- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc
@@ -144,7 +144,7 @@ void HeterListenAndServOp::RunImpl(const framework::Scope &scope,
 
  heter_server_ = distributed::HeterServer::GetInstance();
  heter_server_->SetEndPoint(endpoint);
-  heter_server_->SetFanin(fan_in);
+  heter_server_->SetFanIn(fan_in);
 
  auto optimize_blocks =
      Attr<std::vector<framework::BlockDesc *>>("optimize_blocks");

From 41fb7f0f2db75878c2a1365714a271c1dace328e Mon Sep 17 00:00:00 2001
From: co63oc
Date: Mon, 26 Feb 2024 10:43:40 +0800
Subject: [PATCH 63/82] Fix some typos(is_taged, etc) (#62019)

---
 python/setup.py.in               | 10 +++++-----
 python/setup_cinn.py.in          |  6 +++---
 setup.py                         | 34 ++++++++++++++++----------------
 test/legacy_test/test_version.py |  4 ++--
 4 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/python/setup.py.in b/python/setup.py.in
index 329f092d44801..f140b66bd1c44 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -94,7 +94,7 @@ def get_xpu_xhpc_version():
    else:
        return 'False'
 
-def is_taged():
+def is_tagged():
    try:
        cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null']
        git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE, cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
@@ -125,7 +125,7 @@ cudnn_version = '%(cudnn)s'
 xpu_version = '%(xpu)s'
 xpu_xccl_version = '%(xpu_xccl)s'
 xpu_xhpc_version = '%(xpu_xhpc)s'
-istaged = %(istaged)s
+is_tagged = %(is_tagged)s
 commit = '%(commit)s'
 with_mkl = '%(with_mkl)s'
 cinn_version = '%(cinn)s'
@@ -195,7 +195,7 @@ def show():
            >>> # doctest: -SKIP
    """
-    if istaged:
+    if is_tagged:
        print('full_version:', full_version)
        print('major:', major)
        print('minor:', minor)
@@ -344,7 +344,7 @@ def cinn():
        'xpu_xccl': get_xpu_xccl_version(),
        'xpu_xhpc': get_xpu_xhpc_version(),
        'commit': commit,
-        'istaged': is_taged(),
+        'is_tagged': is_tagged(),
        'with_mkl': '@WITH_MKL@',
        'cinn': get_cinn_version()})
 
@@ -828,7 +828,7 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
        if('${WITH_SHARED_IR}' == 'ON'):
            # change rpath of pir.ext for loading 3rd party lib
            commands.append("patchelf --set-rpath '$ORIGIN:$ORIGIN/../libs' ${PADDLE_BINARY_DIR}/python/paddle/libs/${IR_NAME}")
-    # The sw_64 not suppot patchelf, so we just disable that.
+    # The sw_64 not support patchelf, so we just disable that.
if platform.machine() != 'sw_64' and platform.machine() != 'mips64': for command in commands: if os.system(command) != 0: diff --git a/python/setup_cinn.py.in b/python/setup_cinn.py.in index cbdef191c4cd3..18d94a1629d27 100644 --- a/python/setup_cinn.py.in +++ b/python/setup_cinn.py.in @@ -63,7 +63,7 @@ def get_cudnn_version(): else: return 'False' -def is_taged(): +def is_tagged(): try: cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null'] git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE, cwd="${PROJECT_SOURCE_DIR}").communicate()[0].strip() @@ -85,7 +85,7 @@ minor = '%(minor)d' patch = '%(patch)s' cuda_version = '%(cuda)s' cudnn_version = '%(cudnn)s' -istaged = %(istaged)s +is_tagged = %(is_tagged)s commit = '%(commit)s' with_mkl = '%(with_mkl)s' ''' @@ -108,7 +108,7 @@ with_mkl = '%(with_mkl)s' 'cuda': get_cuda_version(), 'cudnn': get_cudnn_version(), 'commit': commit, - 'istaged': is_taged(), + 'is_tagged': is_tagged(), 'with_mkl': '${WITH_MKL}'}) write_version_py(filename='${CMAKE_BINARY_DIR}/python/cinn/version/info.py') diff --git a/setup.py b/setup.py index f19c22f909d07..215f767b73d53 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,7 @@ def filter_setup_args(input_args): filter_args_list = [] for arg in input_args: if arg == 'rerun-cmake': - rerun_cmake = True # delete Cmakecache.txt and rerun cmake + rerun_cmake = True # delete CMakeCache.txt and rerun cmake continue if arg == 'only-cmake': only_cmake = True # only cmake and do not make, leave a chance for users to adjust build options @@ -314,7 +314,7 @@ def git_commit(): def _get_version_detail(idx): assert ( idx < 3 - ), "vesion info consists of %(major)d.%(minor)d.%(patch)d, \ + ), "version info consists of %(major)d.%(minor)d.%(patch)d, \ so detail index must less than 3" tag_version_regex = env_dict.get("TAG_VERSION_REGEX") paddle_version = env_dict.get("PADDLE_VERSION") @@ -400,7 +400,7 @@ def get_xpu_xhpc_version(): return 'False' -def is_taged(): +def is_tagged(): try: cmd = [ 'git', @@ -447,7 +447,7 @@ def write_version_py(filename='paddle/version/__init__.py'): xpu_version = '%(xpu)s' xpu_xccl_version = '%(xpu_xccl)s' xpu_xhpc_version = '%(xpu_xhpc)s' -istaged = %(istaged)s +is_tagged = %(is_tagged)s commit = '%(commit)s' with_mkl = '%(with_mkl)s' cinn_version = '%(cinn)s' @@ -516,7 +516,7 @@ def show(): cinn: False >>> # doctest: -SKIP """ - if istaged: + if is_tagged: print('full_version:', full_version) print('major:', major) print('minor:', minor) @@ -667,7 +667,7 @@ def cinn(): 'xpu_xccl': get_xpu_xccl_version(), 'xpu_xhpc': get_xpu_xhpc_version(), 'commit': commit, - 'istaged': is_taged(), + 'is_tagged': is_tagged(), 'with_mkl': env_dict.get("WITH_MKL"), 'cinn': get_cinn_version(), } @@ -824,13 +824,13 @@ def cmake_run(build_path): subprocess.check_call(cmake_args) -def build_run(args, build_path, envrion_var): +def build_run(args, build_path, environ_var): with cd(build_path): build_args = [] build_args.append(CMAKE) build_args += args try: - subprocess.check_call(build_args, cwd=build_path, env=envrion_var) + subprocess.check_call(build_args, cwd=build_path, env=environ_var) except (CalledProcessError, KeyboardInterrupt) as e: sys.exit(1) @@ -870,7 +870,7 @@ def build_steps(): print("build_dir:", build_dir) # run cmake to generate native build files cmake_cache_file_path = os.path.join(build_path, "CMakeCache.txt") - # if rerun_cmake is True,remove CMakeCache.txt and rerun camke + # if rerun_cmake is True,remove CMakeCache.txt and rerun cmake if 
os.path.isfile(cmake_cache_file_path) and rerun_cmake is True: os.remove(cmake_cache_file_path) @@ -880,13 +880,13 @@ def build_steps(): if os.path.exists(cmake_cache_file_path) and not ( bool_ninja and not os.path.exists(build_ninja_file_path) ): - print("Do not need rerun camke, everything is ready, run build now") + print("Do not need rerun cmake, everything is ready, run build now") else: cmake_run(build_path) # make if only_cmake: print( - "You have finished running cmake, the program exited,run 'ccmake build' to adjust build options and 'python setup.py install to build'" + "You have finished running cmake, the program exited,run 'cmake build' to adjust build options and 'python setup.py install to build'" ) sys.exit() run_cmake_build(build_path) @@ -1269,7 +1269,7 @@ def get_package_data_and_package_dir(): + '/python/paddle/libs/' + env_dict.get("IR_NAME") ) - # The sw_64 not suppot patchelf, so we just disable that. + # The sw_64 not support patchelf, so we just disable that. if platform.machine() != 'sw_64' and platform.machine() != 'mips64': for command in commands: if os.system(command) != 0: @@ -1610,11 +1610,11 @@ def check_build_dependency(): f.read().splitlines() ) # Specify the dependencies to install - python_dependcies_module = [] + python_dependencies_module = [] installed_packages = [] for dependency in build_dependencies: - python_dependcies_module.append( + python_dependencies_module.append( re.sub("_|-", '', re.sub(r"==.*|>=.*|<=.*", '', dependency)) ) reqs = subprocess.check_output([sys.executable, '-m', 'pip', 'freeze']) @@ -1624,7 +1624,7 @@ def check_build_dependency(): re.sub("_|-", '', r.decode().split('==')[0]).lower() ) - for dependency in python_dependcies_module: + for dependency in python_dependencies_module: if dependency.lower() not in installed_packages: raise RuntimeError(missing_modules.format(dependency=dependency)) @@ -1633,7 +1633,7 @@ def install_cpp_dist_and_build_test(install_dir, lib_test_dir, headers, libs): """install cpp distribution and build test target TODO(huangjiyi): - 1. This function will be moved when seperating C++ distribution + 1. This function will be moved when separating C++ distribution installation from python package installation. 2. Reduce the header and library files to be installed. 
""" @@ -1705,7 +1705,7 @@ def submodules_not_exists_or_empty(folder): end = time.time() print(f' --- Submodule initialization took {end - start:.2f} sec') except Exception: - print(' --- Submodule initalization failed') + print(' --- Submodule initialization failed') print('Please run:\n\tgit submodule update --init --recursive') sys.exit(1) diff --git a/test/legacy_test/test_version.py b/test/legacy_test/test_version.py index 2b6d8f599c582..830a0cc0f290c 100644 --- a/test/legacy_test/test_version.py +++ b/test/legacy_test/test_version.py @@ -30,10 +30,10 @@ def setUp(self): def test_check_output(self): # check commit format self.assertTrue(re.match(self._commit_regex, base_version.commit)) - self.assertTrue(isinstance(base_version.istaged, bool)) + self.assertTrue(isinstance(base_version.is_tagged, bool)) # check version format - if base_version.istaged: + if base_version.is_tagged: self.assertTrue(re.match(self._major_regex, base_version.major)) self.assertTrue(re.match(self._minor_regex, base_version.minor)) self.assertTrue(re.match(self._patch_regex, base_version.patch)) From 508c6ffa81d64215e1338426915012214b539ad6 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Mon, 26 Feb 2024 10:53:50 +0800 Subject: [PATCH 64/82] [PIR] Fix a typo in `instruction_base` (#62052) --- .../framework/new_executor/instruction/instruction_base.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index bb58d01f91d8e..cfdd5d3f9d7a9 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -288,7 +288,7 @@ void InstructionBase::InitInputsOutputsIds( ::pir::Operation* op, const ValueExecutionInfo& value_exec_info) { auto op_attributes = op->attributes(); std::string op_name; - if (op_attributes.count("op_name ")) { + if (op_attributes.count("op_name")) { op_name = op_attributes.at("op_name").dyn_cast().AsString(); } From e97328da23351c02ad71b68eb8b14da13fa74042 Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 26 Feb 2024 11:11:46 +0800 Subject: [PATCH 65/82] Rename paddle/cinn/common/arithmatic.h (#61910) * Fix * ci * ci * Fix --- paddle/cinn/common/CMakeLists.txt | 4 ++-- .../cinn/common/{arithmatic.cc => arithmetic.cc} | 2 +- .../cinn/common/{arithmatic.h => arithmetic.h} | 16 ++++++++-------- .../{arithmatic_test.cc => arithmetic_test.cc} | 2 +- paddle/cinn/common/cas.cc | 2 +- paddle/cinn/common/integer_set.cc | 2 +- paddle/cinn/ir/tensor.cc | 2 +- paddle/cinn/optim/ir_simplify.cc | 2 +- paddle/cinn/optim/transform_polyfor_to_for.cc | 2 +- 9 files changed, 17 insertions(+), 17 deletions(-) rename paddle/cinn/common/{arithmatic.cc => arithmetic.cc} (99%) rename paddle/cinn/common/{arithmatic.h => arithmetic.h} (87%) rename paddle/cinn/common/{arithmatic_test.cc => arithmetic_test.cc} (98%) diff --git a/paddle/cinn/common/CMakeLists.txt b/paddle/cinn/common/CMakeLists.txt index 123c848ac8358..e9c4523edd323 100644 --- a/paddle/cinn/common/CMakeLists.txt +++ b/paddle/cinn/common/CMakeLists.txt @@ -16,7 +16,7 @@ gather_srcs( ir_util.cc test_helper.cc # cuda_test_helper.cc - arithmatic.cc + arithmetic.cc cas.cc union_find.cc python_interpreter_guard.cc @@ -36,7 +36,7 @@ cinn_cc_test(test_topo_walker SRCS topo_walker_test.cc DEPS gtest glog) cinn_cc_test(test_cinn_value SRCS cinn_value_test.cc DEPS cinncore) cinn_cc_test(test_shared SRCS shared_test.cc DEPS 
cinncore) cinn_cc_test(test_graph_utils SRCS graph_utils_test.cc DEPS cinncore) -cinn_cc_test(test_arithmatic SRCS arithmatic_test.cc DEPS cinncore) +cinn_cc_test(test_arithmetic SRCS arithmetic_test.cc DEPS cinncore) cinn_cc_test(test_cas SRCS cas_test.cc DEPS cinncore) cinn_cc_test(test_type SRCS type_test.cc DEPS cinncore) cinn_cc_test(test_axis SRCS axis_test.cc DEPS cinncore) diff --git a/paddle/cinn/common/arithmatic.cc b/paddle/cinn/common/arithmetic.cc similarity index 99% rename from paddle/cinn/common/arithmatic.cc rename to paddle/cinn/common/arithmetic.cc index 5cabe56dff2db..e2c4ed1b8a6a7 100644 --- a/paddle/cinn/common/arithmatic.cc +++ b/paddle/cinn/common/arithmetic.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/cinn/common/arithmatic.h" +#include "paddle/cinn/common/arithmetic.h" #include #include diff --git a/paddle/cinn/common/arithmatic.h b/paddle/cinn/common/arithmetic.h similarity index 87% rename from paddle/cinn/common/arithmatic.h rename to paddle/cinn/common/arithmetic.h index e73a9bac3ede2..f90b795c8c8ff 100644 --- a/paddle/cinn/common/arithmatic.h +++ b/paddle/cinn/common/arithmetic.h @@ -13,20 +13,20 @@ // limitations under the License. /** - * This file includes some arithmatic utilities, such as simplifying/solving a + * This file includes some arithmetic utilities, such as simplifying/solving a * math equation/CINN expression. */ #pragma once -#include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/ir.h" // NOLINT, Should be in front of other header files -#include +#include // NOLINT -#include -#include -#include -#include -#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT #ifdef As #undef As diff --git a/paddle/cinn/common/arithmatic_test.cc b/paddle/cinn/common/arithmetic_test.cc similarity index 98% rename from paddle/cinn/common/arithmatic_test.cc rename to paddle/cinn/common/arithmetic_test.cc index 32eb30f9f6965..4382f279bc43a 100644 --- a/paddle/cinn/common/arithmatic_test.cc +++ b/paddle/cinn/common/arithmetic_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/cinn/common/arithmatic.h" +#include "paddle/cinn/common/arithmetic.h" #include #include diff --git a/paddle/cinn/common/cas.cc b/paddle/cinn/common/cas.cc index ddfbfce983fcb..f2e93286a04a7 100644 --- a/paddle/cinn/common/cas.cc +++ b/paddle/cinn/common/cas.cc @@ -19,7 +19,7 @@ #include #include -#include "paddle/cinn/common/arithmatic.h" +#include "paddle/cinn/common/arithmetic.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/ir_printer.h" diff --git a/paddle/cinn/common/integer_set.cc b/paddle/cinn/common/integer_set.cc index 2a344eb00d5a5..f6d6446b9bb24 100644 --- a/paddle/cinn/common/integer_set.cc +++ b/paddle/cinn/common/integer_set.cc @@ -14,7 +14,7 @@ #include "paddle/cinn/common/integer_set.h" -#include "paddle/cinn/common/arithmatic.h" +#include "paddle/cinn/common/arithmetic.h" #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/utils/ir_copy.h" diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc index 7b3f15c6ed0be..5224a2172ac5c 100644 --- a/paddle/cinn/ir/tensor.cc +++ b/paddle/cinn/ir/tensor.cc @@ -18,7 +18,7 @@ #include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/cinn.h" -#include "paddle/cinn/common/arithmatic.h" +#include "paddle/cinn/common/arithmetic.h" #include "paddle/cinn/common/axis.h" #include "paddle/cinn/common/cas.h" #include "paddle/cinn/common/common.h" diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc index fa69dd19ff0c6..c92ac15e5deb6 100644 --- a/paddle/cinn/optim/ir_simplify.cc +++ b/paddle/cinn/optim/ir_simplify.cc @@ -21,7 +21,7 @@ #include #include -#include "paddle/cinn/common/arithmatic.h" +#include "paddle/cinn/common/arithmetic.h" #include "paddle/cinn/common/cas.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/ir_mutator.h" diff --git a/paddle/cinn/optim/transform_polyfor_to_for.cc b/paddle/cinn/optim/transform_polyfor_to_for.cc index 8a7392ed5d54b..ff29bb0058801 100644 --- a/paddle/cinn/optim/transform_polyfor_to_for.cc +++ b/paddle/cinn/optim/transform_polyfor_to_for.cc @@ -17,7 +17,7 @@ #include #include -#include "paddle/cinn/common/arithmatic.h" +#include "paddle/cinn/common/arithmetic.h" #include "paddle/cinn/common/cas.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/common/type.h" From 72135041fdff8489cc62ca887a1158c665a2de39 Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 26 Feb 2024 11:13:20 +0800 Subject: [PATCH 66/82] Fix some typos (protocal, staic, etc.), test=document_fix (#61882) * Fix, test=document_fix * ci --- cmake/cinn/core.cmake | 2 +- cmake/cinn/external/absl.cmake | 2 +- cmake/cuda.cmake | 2 +- cmake/external/cub.cmake | 2 +- cmake/external/lapack.cmake | 2 +- cmake/external/mklml.cmake | 2 +- cmake/external/protobuf.cmake | 2 +- cmake/external/pybind11.cmake | 2 +- cmake/flags.cmake | 2 +- cmake/generic.cmake | 18 +++++++++--------- cmake/init.cmake | 4 ++-- cmake/phi.cmake | 4 ++-- cmake/third_party.cmake | 12 ++++++------ cmake/unity_build.cmake | 4 ++-- 14 files changed, 30 insertions(+), 30 deletions(-) diff --git a/cmake/cinn/core.cmake b/cmake/cinn/core.cmake index dedefc57a698b..66741e7f8182b 100644 --- a/cmake/cinn/core.cmake +++ b/cmake/cinn/core.cmake @@ -242,7 +242,7 @@ function(cinn_merge_static_libs TARGET_NAME) COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} DEPENDS ${libs}) - # Generate dummy staic lib + # Generate dummy static lib file(WRITE ${target_SRCS} "const char 
*dummy_${TARGET_NAME} = \"${target_SRCS}\";") add_library(${TARGET_NAME} STATIC ${target_SRCS}) diff --git a/cmake/cinn/external/absl.cmake b/cmake/cinn/external/absl.cmake index 0b3f3d685ed80..8d9e0e45b45ba 100644 --- a/cmake/cinn/external/absl.cmake +++ b/cmake/cinn/external/absl.cmake @@ -50,7 +50,7 @@ ExternalProject_Add( BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_bad_variant_access.a BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_raw_hash_set.a) -# It may be more convinent if we just include all absl libs +# It may be more convenient if we just include all absl libs set(ABSL_LIB_NAMES hash wyhash diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 8cd7b835629d3..81a7228629d25 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -300,7 +300,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) # So, don't set these flags here. set(CMAKE_CUDA_STANDARD 17) -# (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w +# (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflict with -w # So replace /W[1-4] with /W0 if(WIN32) string(REGEX REPLACE "/W[1-4]" " /W0 " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index 74f0d3188b534..681cfb2cfa6cf 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -29,7 +29,7 @@ if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6) set(CUB_TAG 1.16.0) execute_process(COMMAND git --git-dir=${CUB_SOURCE_DIR}/.git --work-tree=${CUB_SOURCE_DIR} checkout ${CUB_TAG}) - # cub 1.16.0 is not compitable with current thrust version + # cub 1.16.0 is not compatible with current thrust version add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK) else() set(CUB_TAG 1.8.0) diff --git a/cmake/external/lapack.cmake b/cmake/external/lapack.cmake index 1b5032ab6ff1b..62da0987085d1 100644 --- a/cmake/external/lapack.cmake +++ b/cmake/external/lapack.cmake @@ -20,7 +20,7 @@ set(LAPACK_DOWNLOAD_DIR set(LAPACK_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lapack) set(LAPACK_LIB_DIR ${LAPACK_INSTALL_DIR}/lib) -# Note(zhouwei): lapack need fortan compiler which many machines don't have, so use precompiled library. +# Note(zhouwei): lapack need fortran compiler which many machines don't have, so use precompiled library. # use lapack tag v3.10.0 on 06/28/2021 https://github.com/Reference-LAPACK/lapack if(LINUX) set(LAPACK_FILE diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index d5e3fa16bf0e2..f7c2035cd0a1f 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -35,7 +35,7 @@ if(WIN32) else() #TODO(intel-huying): # Now enable csrmm function in mklml library temporarily, - # it will be updated as offical version later. + # it will be updated as official version later. 
set(MKLML_FILE "csrmm_mklml_lnx_2019.0.5.tgz" CACHE STRING "" FORCE) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index ab5ab5e47604e..09aa9a2b0726e 100755 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -322,7 +322,7 @@ if(WITH_IPU) elseif(WITH_ARM_BRPC) set(PROTOBUF_VERSION 21.12-baidu-ee-common) elseif(WIN32) - #Lower version prootbuf is used for widows + #Lower version protobuf is used for widows set(PROTOBUF_VERSION 21.12) else() set(PROTOBUF_VERSION 21.12) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 6ce8290d72f42..dcb890b294cfb 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -28,7 +28,7 @@ if(NOT WIN32) file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/pybind/cast.h.patch native_dst) # Note: [Why calling some `git` commands before `patch`?] - # Paddle's CI uses cache to accelarate the make process. However, error might raise when patch codes in two scenarios: + # Paddle's CI uses cache to accelerate the make process. However, error might raise when patch codes in two scenarios: # 1. Patch to the wrong version: the tag version of CI's cache falls behind PYBIND_TAG, use `git checkout ${PYBIND_TAG}` to solve this. # 2. Patch twice: the tag version of cache == PYBIND_TAG, but patch has already applied to cache. set(PYBIND_PATCH_COMMAND diff --git a/cmake/flags.cmake b/cmake/flags.cmake index bb94e5627bd62..ee60dd1485818 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -17,7 +17,7 @@ function(CheckCompilerCXX14Flag) elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" - # Apple Clang is a different compiler than upstream Clang which havs different version numbers. + # Apple Clang is a different compiler than upstream Clang which has different version numbers. 
# https://gist.github.com/yamaya/2924292 if(APPLE) # cmake < 3.0 compiler id "Clang" on Mac OS X if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 49eec5ba879e0..c18e25fa84a64 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -44,7 +44,7 @@ # # nv_library(example SRCS example.cu) # -# To specify that a library new_example.a depends on other libraies: +# To specify that a library new_example.a depends on other libraries: # # cc_library(new_example SRCS new_example.cc DEPS example) # @@ -72,7 +72,7 @@ # nv_test(example_test SRCS example_test.cu DEPS example) # # It is pretty often that executable and test binaries depend on -# pre-defined external libaries like glog and gflags defined in +# pre-defined external libraries like glog and gflags defined in # /cmake/external/*.cmake: # # cc_test(example_test SRCS example_test.cc DEPS example glog gflags) @@ -257,7 +257,7 @@ function(merge_static_libs TARGET_NAME) COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} DEPENDS ${libs}) - # Generate dummy staic lib + # Generate dummy static lib generate_dummy_static_lib(LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} GENERATOR "generic.cmake:merge_static_libs") target_link_libraries(${TARGET_NAME} ${libs_deps}) @@ -310,7 +310,7 @@ function(merge_static_libs TARGET_NAME) foreach(lib ${libs}) set(libfiles ${libfiles} $) endforeach() - # msvc compiler will put libarary in directory of "/Release/xxxlib" by default + # msvc compiler will put library in directory of "/Release/xxxlib" by default add_custom_command( TARGET ${TARGET_NAME} POST_BUILD @@ -530,7 +530,7 @@ function(cc_test TARGET_NAME) "${multiValueArgs}" ${ARGN}) if(WIN32) # NOTE(zhiqiu): on windows platform, the symbols should be exported - # explicitly by __declspec(dllexport), however, there are serveral + # explicitly by __declspec(dllexport), however, there are several # symbols not exported, and link error occurs. # so, the tests are not built against dynamic libraries now. 
cc_test_old( @@ -577,7 +577,7 @@ function(cc_test_old TARGET_NAME) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cc_test_build(${TARGET_NAME} SRCS ${cc_test_SRCS} DEPS ${cc_test_DEPS}) - # we dont test hcom op, because it need complex configuration + # we donot test hcom op, because it need complex configuration # with more than one machine cc_test_run(${TARGET_NAME} COMMAND ${TARGET_NAME} ARGS ${cc_test_ARGS}) elseif(WITH_TESTING AND NOT TEST ${TARGET_NAME}) @@ -809,7 +809,7 @@ function(hip_binary TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - # FindHIP.cmake defined hip_add_executable, HIP_SOURCE_PROPERTY_FORMAT is requried for .cc files + # FindHIP.cmake defined hip_add_executable, HIP_SOURCE_PROPERTY_FORMAT is required for .cc files hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS}) if(hip_binary_DEPS) target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS}) @@ -826,7 +826,7 @@ function(hip_test TARGET_NAME) cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) # FindHIP.cmake defined hip_add_executable, - # HIP_SOURCE_PROPERTY_FORMAT is requried for .cc files + # HIP_SOURCE_PROPERTY_FORMAT is required for .cc files hip_add_executable(${TARGET_NAME} ${hip_test_SRCS}) # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) @@ -1231,7 +1231,7 @@ function(grpc_library TARGET_NAME) get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) - # FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but + # FIXME(putcn): the following line is supposed to generate *.pb.h and cc, but # somehow it didn't. line 602 to 604 is to patching this. Leaving this here # for now to enable dist CI. paddle_protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") diff --git a/cmake/init.cmake b/cmake/init.cmake index 86c43cb233bfc..201f66be82f72 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -43,8 +43,8 @@ else() set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Xcompiler=\"-MD -O1 -Ob1\" -DNDEBUG") endif() - # It can specify CUDA compile flag manualy, - # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous + # It can specify CUDA compile flag manually, + # its use is to remove /Zi to reduce GPU static library size. But it's dangerous # because CUDA will update by nvidia, then error will occur. 
# Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index bfb6a88eb62a7..4aabcbb0f7607 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -119,7 +119,7 @@ function(kernel_declare TARGET_LIST) is_all_backend "${first_registry}") if(NOT is_all_backend STREQUAL "") - # parse the registerd kernel message + # parse the registered kernel message string( REPLACE "PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE_EXCEPT_CUSTOM(" "" kernel_msg "${first_registry}") @@ -131,7 +131,7 @@ function(kernel_declare TARGET_LIST) is_all_backend "${first_registry}") - # parse the registerd kernel message + # parse the registered kernel message string(REPLACE "PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(" "" kernel_msg "${first_registry}") endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 17c428660b223..2d8020adcf7d0 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -13,7 +13,7 @@ # limitations under the License. include(ExternalProject) -# Creat a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac) +# Create a target named "third_party", which can compile external dependencies on all platform(windows/linux/mac) set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" @@ -55,8 +55,8 @@ if(NOT WITH_SETUP_INSTALL) endif() endif() -# cache funciton to avoid repeat download code of third_party. -# This function has 4 parameters, URL / REPOSITOR / TAG / DIR: +# cache function to avoid repeat download code of third_party. +# This function has 4 parameters, URL / REPOSITORY / TAG / DIR: # 1. URL: specify download url of 3rd party # 2. REPOSITORY: specify git REPOSITORY of 3rd party # 3. TAG: specify git tag/branch/commitID of 3rd party @@ -64,7 +64,7 @@ endif() # # The function Return 1 PARENT_SCOPE variables: # - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, -# and you no longer need to set any donwnload steps in ExternalProject_Add. +# and you no longer need to set any download steps in ExternalProject_Add. # For example: # Cache_third_party(${TARGET} # REPOSITORY ${TARGET_REPOSITORY} @@ -145,10 +145,10 @@ macro(UNSET_VAR VAR_NAME) unset(${VAR_NAME}) endmacro() -# Funciton to Download the dependencies during compilation +# Function to Download the dependencies during compilation # This function has 2 parameters, URL / DIRNAME: # 1. URL: The download url of 3rd dependencies -# 2. NAME: The name of file, that determin the dirname +# 2. NAME: The name of file, that determine the dirname # function(file_download_and_uncompress URL NAME) set(options "") diff --git a/cmake/unity_build.cmake b/cmake/unity_build.cmake index d1b97cf08f60c..f7c88a6bb4a4e 100644 --- a/cmake/unity_build.cmake +++ b/cmake/unity_build.cmake @@ -74,7 +74,7 @@ endfunction() # If the source file does not hit any registed groups, use itself. # This function put the actual combination relationship in variables instead of # writing the unity source file. The reason is that writing unity source file -# will change the timestampe and affect the effect of retaining the build +# will change the timestamp and affect the effect of retaining the build # directory on Windows. # Here you need to specify the source type which belongs to cc or cu. 
function(compose_unity_target_sources TARGET TYPE) @@ -84,7 +84,7 @@ function(compose_unity_target_sources TARGET TYPE) PROPERTY ${TARGET}_${TYPE}_group_index) foreach(src ${ARGN}) set(unity_file "") - # Note(zhouwei25): UB use the path releative to CMAKE_SOURCE_DIR. + # Note(zhouwei25): UB use the path relative to CMAKE_SOURCE_DIR. # If use absolute path, sccache/ccache hit rate will be reduced. if(IS_ABSOLUTE ${src}) set(src_absolute_path ${src}) From 665f97bb6c020c53ec7951547875a41299b038cd Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 26 Feb 2024 11:17:40 +0800 Subject: [PATCH 67/82] Fix some typos (optioanl, etc.) (#61836) --- .../event_garbage_collector.cc | 8 +++---- .../pir_adaptor/pir_adaptor_util.cc | 23 ++++++++++--------- .../pir_adaptor/pir_adaptor_util.h | 4 ++-- .../pir/transforms/constant_folding_pass.cc | 12 +++++----- paddle/phi/api/yaml/generator/dist_api_gen.py | 6 ++--- 5 files changed, 27 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc index e8bee7705fe30..1b4f5128589d6 100644 --- a/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc +++ b/paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.cc @@ -31,8 +31,8 @@ InterpreterCoreEventGarbageCollector::InterpreterCoreEventGarbageCollector( /*allow_spinning*/ true, /*track_task*/ false); queue_ = CreateSingleThreadedWorkQueue(options); - for (auto& instruc : vec_instruction) { - gc_event_.emplace_back(instruc.DeviceContext().GetPlace(), + for (auto& instruct : vec_instruction) { + gc_event_.emplace_back(instruct.DeviceContext().GetPlace(), platform::GenerateDeviceEventFlag()); } } @@ -44,8 +44,8 @@ InterpreterCoreEventGarbageCollector::InterpreterCoreEventGarbageCollector( /*allow_spinning*/ true, /*track_task*/ false); queue_ = CreateSingleThreadedWorkQueue(options); - for (auto& instruc : vec_instruction) { - gc_event_.emplace_back(instruc->DeviceContext().GetPlace(), + for (auto& instruct : vec_instruction) { + gc_event_.emplace_back(instruct->DeviceContext().GetPlace(), platform::GenerateDeviceEventFlag()); } } diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc index aa9003cb164f9..4894e64a8f4d1 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.cc @@ -497,7 +497,7 @@ void HandleForSpecialOp(pir::Operation* op, } PADDLE_ENFORCE(var, paddle::platform::errors::InvalidArgument( - "The variable %s shoud exist", name)); + "The variable %s should exist", name)); value_exe_info->Add(value, name); } else if (op->isa()) { @@ -531,7 +531,7 @@ void HandleForSpecialOp(pir::Operation* op, .AsString(); auto value = op->operand_source(0); - // change opreand name to param_name + // change operand name to param_name auto orig_name = value_exe_info->GetValue2VarName().at(value); if (param_name == orig_name) { @@ -547,22 +547,23 @@ void HandleForSpecialOp(pir::Operation* op, value_exe_info->Rename(param_name, orig_name); } else if (op->isa()) { - VLOG(6) << "Handle for builtin.shadow_ouptut"; + VLOG(6) << "Handle for builtin.shadow_output"; auto var_name = op->attributes() .at("output_name") .dyn_cast() .AsString(); auto value = op->operand_source(0); + Scope* scope = const_cast(value_exe_info->GetScope()); if 
(value.defining_op()->HasAttribute(kAttrIsPersistable) && value.attribute(kAttrIsPersistable).data()) { - VLOG(6) << "Handle for builtin.shadow_ouptut persistable value:" + VLOG(6) << "Handle for builtin.shadow_output persistable value:" << var_name; scope = const_cast(value_exe_info->GetScope()->root()); } - // change opreand name to param_name + // change operand name to param_name auto orig_name = value_exe_info->GetValue2VarName().at(value); if (var_name == orig_name) { @@ -603,7 +604,7 @@ void HandleForSpecialOp(pir::Operation* op, PADDLE_ENFORCE_EQ(value_exe_info->GetValue2VarName().count(in_value), true, phi::errors::PreconditionNotMet( - "input of buildin slice not in name map")); + "input of builtin slice not in name map")); int index = op->attributes().at("index").dyn_cast().data(); @@ -626,7 +627,7 @@ void HandleForSpecialOp(pir::Operation* op, PADDLE_ENFORCE_EQ(value_exe_info->GetValue2VarName().count(in_value), true, phi::errors::PreconditionNotMet( - "input of buildin split not in name map")); + "input of builtin split not in name map")); auto in_var = value_exe_info->GetVarByValue(in_value); auto variable_array = in_var->Get(); @@ -817,7 +818,7 @@ void BuildRuntimeContext(pir::Operation* op, pir::Value ptr = op->operand_source(index); if (!IsInvalid(ptr)) { - VLOG(8) << "ctx->EmplaceBackInput : an optioanl input " << name; + VLOG(8) << "ctx->EmplaceBackInput : an optional input " << name; continue; } @@ -845,7 +846,7 @@ void BuildRuntimeContext(pir::Operation* op, auto legacy_arg_name = op_normalizer.GetLegacyArgName(fluid_op_name, name); if (!IsInvalid(ptr)) { - VLOG(8) << "ctx->EmplaceBackOutput : an optioanl output " << name; + VLOG(8) << "ctx->EmplaceBackOutput : an optional output " << name; continue; } @@ -906,7 +907,7 @@ std::shared_ptr BuildOperatorBase( auto legacy_attr_name = op_normalizer.GetLegacyArgName(fluid_op_name, name); if (!IsInvalid(ptr)) { - VLOG(8) << "Push back inputs to VariableNameMap : an optioanl input " + VLOG(8) << "Push back inputs to VariableNameMap : an optional input " << name; continue; } @@ -1004,7 +1005,7 @@ std::shared_ptr BuildOperatorBase( op_normalizer.GetLegacyArgName(fluid_op_name, output_name_list[i]); if (!IsInvalid(ptr)) { - VLOG(8) << "Push back outputs to VariableNameMap : an optioanl output " + VLOG(8) << "Push back outputs to VariableNameMap : an optional output " << legacy_arg_name; continue; } diff --git a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h index 92072e6c9807f..0aa3fa0f80db2 100644 --- a/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h +++ b/paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h @@ -190,7 +190,7 @@ void BuildPhiContext(pir::Operation* op, InType optional_input(temp); ctx->EmplaceBackInput(optional_input); } - VLOG(8) << "ctx->EmplaceBackInput : an optioanl input " << t; + VLOG(8) << "ctx->EmplaceBackInput : an optional input " << t; continue; } @@ -441,7 +441,7 @@ void BuildPhiContext(pir::Operation* op, OutType optional_input(temp); ctx->EmplaceBackOutput(optional_input); } - VLOG(8) << "ctx->EmplaceBackOutput : an optioanl output"; + VLOG(8) << "ctx->EmplaceBackOutput : an optional output"; continue; } diff --git a/paddle/fluid/pir/transforms/constant_folding_pass.cc b/paddle/fluid/pir/transforms/constant_folding_pass.cc index 71c836cdcf96d..d7834f9195bfd 100644 --- a/paddle/fluid/pir/transforms/constant_folding_pass.cc +++ b/paddle/fluid/pir/transforms/constant_folding_pass.cc 
@@ -74,7 +74,7 @@ class ConstantFoldingPattern : public pir::RewritePattern { } bool Match(pir::Operation* op) const override { - VLOG(4) << "constant_folding_pass applys match on [" << op->name() + VLOG(4) << "constant_folding_pass applies match on [" << op->name() << "] op"; // 1. Some ops do not need to be processed if (op->HasTrait() || @@ -143,7 +143,7 @@ class ConstantFoldingPattern : public pir::RewritePattern { } } - // 7. maybe affect performence + // 7. maybe affect performance if (op->isa()) { auto next_ops = pir::GetUseOpsForOutput(op, 0); for (auto [next_op, _] : next_ops) { @@ -161,7 +161,7 @@ class ConstantFoldingPattern : public pir::RewritePattern { void Rewrite(pir::Operation* op, pir::PatternRewriter& rewriter) const override { // NOLINT - VLOG(4) << "constant_folding_pass applys rewrite on [" << op->name() + VLOG(4) << "constant_folding_pass applies rewrite on [" << op->name() << "] op"; auto output_var_names = RunOp(op, rewriter); @@ -410,7 +410,7 @@ class ConstantFoldingPatternForTrain : public ConstantFoldingPattern { context, suffix, place, scope, exe_config, deleted_vars) {} bool Match(pir::Operation* op) const override { - VLOG(4) << "constant_folding_pass applys match on [" << op->name() + VLOG(4) << "constant_folding_pass applies match on [" << op->name() << "] op"; if (!ConstantFoldingPattern::Match(op)) { return false; @@ -427,7 +427,7 @@ class ConstantFoldingPatternForTrain : public ConstantFoldingPattern { void Rewrite(pir::Operation* op, pir::PatternRewriter& rewriter) const override { // NOLINT - VLOG(4) << "constant_folding_pass for train applys rewrite on [" + VLOG(4) << "constant_folding_pass for train applies rewrite on [" << op->name() << "] op"; auto output_var_names = RunOp(op, rewriter); @@ -454,7 +454,7 @@ class ConstantFoldingPatternForTrain : public ConstantFoldingPattern { rewriter.ReplaceAllUsesWith(op->result(i), constant_op->result(0)); } rewriter.EraseOp(op); - VLOG(4) << "constant_folding_pass for traun applied rewrite on [" + VLOG(4) << "constant_folding_pass for train applied rewrite on [" << op->name() << "] op"; } }; diff --git a/paddle/phi/api/yaml/generator/dist_api_gen.py b/paddle/phi/api/yaml/generator/dist_api_gen.py index 958551d96e34f..03d65a920b9d2 100644 --- a/paddle/phi/api/yaml/generator/dist_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_api_gen.py @@ -36,7 +36,7 @@ // Kernel Dispatch Body{} }} """ -DIPATCH_END_GUARD_TEMPLATE = """ +DISPATCH_END_GUARD_TEMPLATE = """ PADDLE_THROW(phi::errors::Unimplemented( "The kernel of ({}) for input tensors is unimplemented, please check the type of input tensors.")); """ @@ -1899,7 +1899,7 @@ def gene_base_api_code(self, inplace_flag=False): self.get_define_args(inplace_flag), self.gene_kernel_select(), kernel_dispatch_code - + DIPATCH_END_GUARD_TEMPLATE.format(self.api), + + DISPATCH_END_GUARD_TEMPLATE.format(self.api), ) else: dist_branch_code = "" @@ -1947,7 +1947,7 @@ def generate_api( if is_fused_ops_yaml is True else "paddle/phi/api/include/api.h" ) - # not all fused ops supoort dygraph + # not all fused ops support dygraph if is_fused_ops_yaml is True: new_apis = [ api From 2ea42ce2847781cc2d68a5c8a07afa33bc645119 Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 26 Feb 2024 11:18:03 +0800 Subject: [PATCH 68/82] Fix some typos(kernel_dialtion, etc) (#62013) --- .../group_with_group_merge_pass.cc | 2 +- .../group_merge/group_with_group_merge_util.h | 2 +- paddle/cinn/hlir/framework/graph_compiler.cc | 4 ++-- paddle/cinn/hlir/framework/graph_compiler.h | 2 +- 
paddle/cinn/hlir/framework/instruction.cc | 18 ++++++++--------- paddle/cinn/hlir/framework/pir_compiler.cc | 6 +++--- .../hlir/framework/print_graph_pass_test.cc | 2 +- paddle/cinn/hlir/op/nn.cc | 18 ++++++++--------- paddle/cinn/hlir/op/reduction.cc | 12 +++++------ paddle/cinn/hlir/pass/fusion_merge_pass.cc | 20 +++++++++---------- .../cinn/hlir/pass/fusion_merge_pass_util.h | 2 +- .../hlir/pass/general_fusion_merge_pass.cc | 16 +++++++-------- .../graph_group_fuse_helper.h | 2 +- paddle/cinn/hlir/pass/op_fusion_pass_test.cc | 12 +++++------ paddle/cinn/hlir/pe/ir_schedule_pe.cc | 4 ++-- paddle/cinn/hlir/pe/transform.cc | 2 +- 16 files changed, 62 insertions(+), 62 deletions(-) diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc index 40e03a6574832..7ee55cc7c9396 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_pass.cc @@ -355,7 +355,7 @@ bool GraphGroupFuseHelper::AllOutputsSameSize( template bool GraphGroupFuseHelper::HorizontalElementwiseFuseReduce( const OpGroupPtr& src, const OpGroupPtr& dst) const { - return honrizontal_elementwise_fuse_reduce(src.GetGroup(), dst.GetGroup()); + return horizontal_elementwise_fuse_reduce(src.GetGroup(), dst.GetGroup()); } template diff --git a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h index f1094dc78e796..f6c17ae28ebfb 100644 --- a/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h +++ b/paddle/cinn/hlir/dialect/operator/transforms/group_merge/group_with_group_merge_util.h @@ -127,7 +127,7 @@ inline bool elementwise_fuse_broadcast( return true; } -inline bool honrizontal_elementwise_fuse_reduce( +inline bool horizontal_elementwise_fuse_reduce( const std::shared_ptr& first, const std::shared_ptr& second) { std::shared_ptr ele_group, reduce_group; diff --git a/paddle/cinn/hlir/framework/graph_compiler.cc b/paddle/cinn/hlir/framework/graph_compiler.cc index ffa599805f13e..4ed9ff14d217b 100644 --- a/paddle/cinn/hlir/framework/graph_compiler.cc +++ b/paddle/cinn/hlir/framework/graph_compiler.cc @@ -210,7 +210,7 @@ static void BufferMallocWithCallback(void* args, int num_args) { for (int i = 0; i < num_args; ++i) { cinn_buffer_t* buffer = static_cast(pod_args[i]); CHECK(buffer->external_malloc) - << "external_malloc is nullptr at " << i << "-th argumemnts"; + << "external_malloc is nullptr at " << i << "-th arguments"; buffer->external_malloc->operator()(nullptr, buffer); } } @@ -282,7 +282,7 @@ void GraphCompiler::InsertBufferHandlers( malloc_var_names, std::vector({}), function_name); - VLOG(4) << "seting malloc function " << function_name << " for var " + VLOG(4) << "setting malloc function " << function_name << " for var " << cinn::utils::Join(malloc_var_names, ", "); malloc_instr->SetLoweredFunc( reinterpret_cast(BufferMallocWithCallback), function_name); diff --git a/paddle/cinn/hlir/framework/graph_compiler.h b/paddle/cinn/hlir/framework/graph_compiler.h index d972fc856c825..01dca3e3f65a4 100644 --- a/paddle/cinn/hlir/framework/graph_compiler.h +++ b/paddle/cinn/hlir/framework/graph_compiler.h @@ -85,7 +85,7 @@ class GraphCompiler final { const std::vector>& instructions); // find the first and last 
instruction where a variable used, and mark the - // variable should allocate buffer before the first instruction runing and + // variable should allocate buffer before the first instruction running and // can release the buffer after the last instruction finished. void AnalyzeVariableLifeTime( const std::vector>& instructions, diff --git a/paddle/cinn/hlir/framework/instruction.cc b/paddle/cinn/hlir/framework/instruction.cc index 7a85318654efc..c7185223843d5 100644 --- a/paddle/cinn/hlir/framework/instruction.cc +++ b/paddle/cinn/hlir/framework/instruction.cc @@ -168,9 +168,9 @@ void Instruction::Run( pod_args[2], static_cast(stream)); } else { - VLOG(3) << "Runing extern function " << function_name_; + VLOG(3) << "Running extern function " << function_name_; for (int idx = 0; idx < fn_ptrs_.size(); ++idx) { - VLOG(3) << "Runing func name: " << fn_names_[idx]; + VLOG(3) << "Running func name: " << fn_names_[idx]; auto& pod_args = args_cached_[idx]; CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first by " "calling SetLoweredFunc method"; @@ -184,7 +184,7 @@ void Instruction::Run( } } } - VLOG(3) << "Done Runing extern function " << function_name_; + VLOG(3) << "Done Running extern function " << function_name_; } #elif defined(CINN_WITH_CUDNN) auto& pod_args = args_cached_[0]; @@ -315,9 +315,9 @@ void Instruction::Run( pod_args[2], static_cast(stream)); } else { - VLOG(3) << "Runing extern function " << function_name_; + VLOG(3) << "Running extern function " << function_name_; for (int idx = 0; idx < fn_ptrs_.size(); ++idx) { - VLOG(3) << "Runing func name: " << fn_names_[idx]; + VLOG(3) << "Running func name: " << fn_names_[idx]; auto& pod_args = args_cached_[idx]; CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first by " "calling SetLoweredFunc method"; @@ -331,12 +331,12 @@ void Instruction::Run( } } } - VLOG(3) << "Done Runing extern function " << function_name_; + VLOG(3) << "Done Running extern function " << function_name_; } #else - VLOG(3) << "Runing extern function " << function_name_; + VLOG(3) << "Running extern function " << function_name_; for (int idx = 0; idx < fn_ptrs_.size(); ++idx) { - VLOG(3) << "Runing func name: " << fn_names_[idx]; + VLOG(3) << "Running func name: " << fn_names_[idx]; auto& pod_args = args_cached_[idx]; CHECK(fn_ptrs_[idx]) << "The LoweredFunc address should be set first by " "calling SetLoweredFunc method"; @@ -350,7 +350,7 @@ void Instruction::Run( } } } - VLOG(3) << "Done Runing extern function " << function_name_; + VLOG(3) << "Done Running extern function " << function_name_; #endif if (!cinn::runtime::CheckStringFlagFalse(FLAGS_cinn_self_check_accuracy)) { diff --git a/paddle/cinn/hlir/framework/pir_compiler.cc b/paddle/cinn/hlir/framework/pir_compiler.cc index f180a480c91dd..1cd7b0220b496 100644 --- a/paddle/cinn/hlir/framework/pir_compiler.cc +++ b/paddle/cinn/hlir/framework/pir_compiler.cc @@ -28,7 +28,7 @@ namespace cinn { namespace hlir { namespace framework { -// TODO(Aurelius84): Clear usless Build Interface. +// TODO(Aurelius84): Clear useless Build Interface. 
std::unique_ptr PirCompiler::Build() { m_builder_.Clear(); // NOTE(Aurelius84): Currently only support each op for one group @@ -213,8 +213,8 @@ std::shared_ptr BuildScope(const Target& target, }; for (auto& op : *program.block()) { - for (auto oprand : op.operands()) { - create_var(oprand.source()); + for (auto operand : op.operands()) { + create_var(operand.source()); } for (auto result : op.results()) { diff --git a/paddle/cinn/hlir/framework/print_graph_pass_test.cc b/paddle/cinn/hlir/framework/print_graph_pass_test.cc index b26c60c716d0c..0bb21aa41cb5b 100644 --- a/paddle/cinn/hlir/framework/print_graph_pass_test.cc +++ b/paddle/cinn/hlir/framework/print_graph_pass_test.cc @@ -46,7 +46,7 @@ void PrintGraphPass(Graph* src) { CINN_REGISTER_PASS(PrintGraph) .describe( - "This pass just save the visulization Graph to " + "This pass just save the visualization Graph to " "g.attrs[\"print_graph\"].") .set_change_structure(false) .provide_graph_attr("print_graph") diff --git a/paddle/cinn/hlir/op/nn.cc b/paddle/cinn/hlir/op/nn.cc index 8aebede272568..60cbc1c89e222 100644 --- a/paddle/cinn/hlir/op/nn.cc +++ b/paddle/cinn/hlir/op/nn.cc @@ -662,7 +662,7 @@ std::shared_ptr StrategyForConv2dNCHWc( std::vector kernel_shape = inputs[1]->shape; // kernel_h == 1 && kernel_w == 1 CHECK_EQ(kernel_shape.size(), 6U) - << "kernel_dialtion shape size should be 6"; + << "kernel_dilation shape size should be 6"; bool is_1x1 = (is_zero(kernel_shape[2] - 1)) && (is_zero(kernel_shape[3] - 1)); ir::Tensor res; @@ -2224,18 +2224,18 @@ std::vector InferShapeForBatchNormTrain( CHECK_EQ(inputs_shape[0][1], inputs_shape[2][0]) << "x and bias dimension size is not equal!"; CHECK_EQ(inputs_shape[0][1], inputs_shape[3][0]) - << "x and moveing_mean dimension size is not equal!"; + << "x and moving_mean dimension size is not equal!"; CHECK_EQ(inputs_shape[0][1], inputs_shape[4][0]) - << "x and moveing_variance dimension size is not equal!"; + << "x and moving_variance dimension size is not equal!"; } else if (data_layout == "NHWC") { CHECK_EQ(inputs_shape[0][3], inputs_shape[1][0]) << "x and scale dimension is not equal!"; CHECK_EQ(inputs_shape[0][3], inputs_shape[2][0]) << "x and bias dimension size is not equal!"; CHECK_EQ(inputs_shape[0][3], inputs_shape[3][0]) - << "x and moveing_mean dimension size is not equal!"; + << "x and moving_mean dimension size is not equal!"; CHECK_EQ(inputs_shape[0][3], inputs_shape[4][0]) - << "x and moveing_variance dimension size is not equal!"; + << "x and moving_variance dimension size is not equal!"; } else { LOG(FATAL) << "data_layout " << data_layout << " is not support!"; } @@ -2302,16 +2302,16 @@ std::vector InferShapeForBatchNormGrad( CHECK_EQ(inputs_shape[0][1], inputs_shape[2][0]) << "dy and bias dimension size is not equal!"; CHECK_EQ(inputs_shape[0][1], inputs_shape[3][0]) - << "dy and moveing_mean dimension size is not equal!"; + << "dy and moving_mean dimension size is not equal!"; CHECK_EQ(inputs_shape[0][1], inputs_shape[4][0]) - << "dy and moveing_variance dimension size is not equal!"; + << "dy and moving_variance dimension size is not equal!"; } else if (data_layout == "NHWC") { CHECK_EQ(inputs_shape[0][3], inputs_shape[2][0]) << "dy and bias dimension size is not equal!"; CHECK_EQ(inputs_shape[0][3], inputs_shape[3][0]) - << "dy and moveing_mean dimension size is not equal!"; + << "dy and moving_mean dimension size is not equal!"; CHECK_EQ(inputs_shape[0][3], inputs_shape[4][0]) - << "dy and moveing_variance dimension size is not equal!"; + << "dy and moving_variance 
dimension size is not equal!"; } else { LOG(FATAL) << "data_layout " << data_layout << " is not support!"; } diff --git a/paddle/cinn/hlir/op/reduction.cc b/paddle/cinn/hlir/op/reduction.cc index b50b6b108f954..a8fda43e0ceb5 100644 --- a/paddle/cinn/hlir/op/reduction.cc +++ b/paddle/cinn/hlir/op/reduction.cc @@ -188,7 +188,7 @@ std::shared_ptr StrategyForReduce( for (int i = 0; i < arg_pack.size(); i++) { if (arg_pack[i].is_expr()) { Expr temp = arg_pack[i]; - // TODO(zhhsplendid): old reducetion schedule assumes all length-1 + // TODO(zhhsplendid): old reduction schedule assumes all length-1 // for loops are simplified, but it is not after we add length-1 // back. Reduction schedule is complex and we haven't changed it to // support the length-1 for loop yet. So we simplify here. The todo @@ -651,16 +651,16 @@ std::vector> InferLayoutForBnOptimize( } // namespace cinn CINN_REGISTER_HELPER(reduce_ops) { -#define CINN_REGISTER_REDUCTION_WITH_DTYPE(op__, op_stragegy__, dtype__) \ +#define CINN_REGISTER_REDUCTION_WITH_DTYPE(op__, op_strategy__, dtype__) \ CINN_REGISTER_OP(op__) \ .describe(#op__ " function") \ .set_num_inputs(1) \ .set_num_outputs(1) \ .set_attr( \ - "CINNStrategy", cinn::hlir::op::StrategyFor##op_stragegy__) \ + "CINNStrategy", cinn::hlir::op::StrategyFor##op_strategy__) \ .set_attr( \ "CINNStrategySymbolic", \ - cinn::hlir::op::StrategyFor##op_stragegy__##Symbolic) \ + cinn::hlir::op::StrategyFor##op_strategy__##Symbolic) \ .set_attr("infershape", \ MakeOpFunction(cinn::hlir::op::InferShapeForReduction)) \ .set_attr( \ @@ -674,8 +674,8 @@ CINN_REGISTER_HELPER(reduce_ops) { "OpPattern", cinn::hlir::framework::OpPatternKind::kReduction) \ .set_support_level(4); -#define CINN_REGISTER_REDUCTION(op__, op_stragegy__) \ - CINN_REGISTER_REDUCTION_WITH_DTYPE(op__, op_stragegy__, ) +#define CINN_REGISTER_REDUCTION(op__, op_strategy__) \ + CINN_REGISTER_REDUCTION_WITH_DTYPE(op__, op_strategy__, ) CINN_REGISTER_REDUCTION(reduce_sum, ReduceSum); CINN_REGISTER_REDUCTION(reduce_prod, ReduceProd); diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass.cc b/paddle/cinn/hlir/pass/fusion_merge_pass.cc index 86c0e5360fc0d..eb251fca8608e 100644 --- a/paddle/cinn/hlir/pass/fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/fusion_merge_pass.cc @@ -233,7 +233,7 @@ class FusionMergePassHelper : public FusionHelperBase { break; } - // if can't fuse to othors Groups, new Groups. + // if can't fuse to other Groups, new Groups. if (!fusionable) { fusionable_consumers.push_back({candidate}); } @@ -488,7 +488,7 @@ class FusionMergePassHelper : public FusionHelperBase { fusionable_consumers) { VLOG(3) << "VerticalFuse...!"; GroupList fused_groups; - GroupPtr master_fuesd_group(nullptr); + GroupPtr master_fused_group(nullptr); for (auto& consumer : fusionable_consumers) { auto fused_group = std::make_shared(); // update depth using consumer depth. 
@@ -623,8 +623,8 @@ class FusionMergePassHelper : public FusionHelperBase { fusion_groups_[postion] = fused_group; fusion_groups_index_[fused_group] = postion; - if (!master_fuesd_group.get()) { - master_fuesd_group = fused_group; + if (!master_fused_group.get()) { + master_fused_group = fused_group; } CHECK(fused_group->output_nodes.size()) << "No output node is found, " << fused_group->group_id; @@ -654,8 +654,8 @@ class FusionMergePassHelper : public FusionHelperBase { if (be_output) { VLOG(4) << "Insert Id " << node->id() << " Into Group " - << master_fuesd_group->group_id; - master_fuesd_group->output_nodes.insert(node); + << master_fused_group->group_id; + master_fused_group->output_nodes.insert(node); } } // insert unfusionable consumer groups @@ -663,10 +663,10 @@ class FusionMergePassHelper : public FusionHelperBase { if (fusionable_consumers.count(consumer)) { continue; } - master_fuesd_group->mut_consumer_groups()->insert(consumer); + master_fused_group->mut_consumer_groups()->insert(consumer); // update consumer's producer consumer->mut_producer_groups()->erase(producer); - consumer->mut_producer_groups()->insert(master_fuesd_group); + consumer->mut_producer_groups()->insert(master_fused_group); } } @@ -979,7 +979,7 @@ class FusionMergePassHelper : public FusionHelperBase { // element-wise and injective op must be horizontal relation. {OpPatternKind::kInjective, is_same_size}, // element-wise and reduce op must be horizontal relation. - {OpPatternKind::kReduction, honrizontal_elementwise_fuse_reduce}}; + {OpPatternKind::kReduction, horizontal_elementwise_fuse_reduce}}; // vertical relation.vertical_relation = { {OpPatternKind::kElementWise, is_same_size}, @@ -1044,7 +1044,7 @@ class FusionMergePassHelper : public FusionHelperBase { // horizontal relation.horizontal_relation = { // reduce and element-wise op must be horizontal relation. - {OpPatternKind::kElementWise, honrizontal_elementwise_fuse_reduce}, + {OpPatternKind::kElementWise, horizontal_elementwise_fuse_reduce}, // reduce and broadcast op must be horizontal relation. {OpPatternKind::kBroadcast, is_same_size}, // reduce and injective op must be horizontal relation. diff --git a/paddle/cinn/hlir/pass/fusion_merge_pass_util.h b/paddle/cinn/hlir/pass/fusion_merge_pass_util.h index bc14748f5f648..219d08d7d08e6 100644 --- a/paddle/cinn/hlir/pass/fusion_merge_pass_util.h +++ b/paddle/cinn/hlir/pass/fusion_merge_pass_util.h @@ -105,7 +105,7 @@ CONDITION_FUNC(elementwise_fuse_broadcast) { return true; } -CONDITION_FUNC(honrizontal_elementwise_fuse_reduce) { +CONDITION_FUNC(horizontal_elementwise_fuse_reduce) { std::shared_ptr ele_group, reduce_group; if (first->op_pattern_kind == framework::kReduction) { ele_group = second; diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc index cf1b91fcc1357..65d0d9eb7c243 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass.cc @@ -69,7 +69,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { } GroupList operator()() { - // run fusion merge untill no update. + // run fusion merge until no update. 
DoFusionMerge(); for (auto& group : fusion_groups_) { VLOG(3) << "Fusion Group -> " << group->group_id; @@ -564,7 +564,7 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { fusionable_consumers) { VLOG(3) << "VerticalFuse...!"; GroupList fused_groups; - GroupPtr master_fuesd_group(nullptr); + GroupPtr master_fused_group(nullptr); for (auto& consumer : fusionable_consumers) { auto fused_group = std::make_shared(graph_); // update depth using consumer depth. @@ -700,8 +700,8 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { fusion_groups_[postion] = fused_group; fusion_groups_index_[fused_group] = postion; - if (!master_fuesd_group.get()) { - master_fuesd_group = fused_group; + if (!master_fused_group.get()) { + master_fused_group = fused_group; } CHECK(fused_group->output_nodes.size()) << "No output node is found, " << fused_group->group_id; @@ -731,8 +731,8 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { if (be_output) { VLOG(4) << "Insert Id " << node->id() << " Into Group " - << master_fuesd_group->group_id; - master_fuesd_group->output_nodes.insert(node); + << master_fused_group->group_id; + master_fused_group->output_nodes.insert(node); } } // insert unfusionable consumer groups @@ -740,10 +740,10 @@ class GeneralFusionMergePassHelper : public FusionHelperBase { if (fusionable_consumers.count(consumer)) { continue; } - master_fuesd_group->mut_consumer_groups()->insert(consumer); + master_fused_group->mut_consumer_groups()->insert(consumer); // update consumer's producer consumer->mut_producer_groups()->erase(producer); - consumer->mut_producer_groups()->insert(master_fuesd_group); + consumer->mut_producer_groups()->insert(master_fused_group); } } diff --git a/paddle/cinn/hlir/pass/general_fusion_merge_pass/graph_group_fuse_helper.h b/paddle/cinn/hlir/pass/general_fusion_merge_pass/graph_group_fuse_helper.h index 3859ad88ff016..f3f2802ac3007 100644 --- a/paddle/cinn/hlir/pass/general_fusion_merge_pass/graph_group_fuse_helper.h +++ b/paddle/cinn/hlir/pass/general_fusion_merge_pass/graph_group_fuse_helper.h @@ -138,7 +138,7 @@ bool GraphGroupFuseHelper::AllOutputsSameSize( template bool GraphGroupFuseHelper::HorizontalElementwiseFuseReduce( const OpGroupPtr& src, const OpGroupPtr& dst) const { - return honrizontal_elementwise_fuse_reduce( + return horizontal_elementwise_fuse_reduce( &ctx_->graph_group_fusion_helper(), src.GetGroup(), dst.GetGroup()); } diff --git a/paddle/cinn/hlir/pass/op_fusion_pass_test.cc b/paddle/cinn/hlir/pass/op_fusion_pass_test.cc index 885afd929ba87..c9d723c91be50 100755 --- a/paddle/cinn/hlir/pass/op_fusion_pass_test.cc +++ b/paddle/cinn/hlir/pass/op_fusion_pass_test.cc @@ -66,9 +66,9 @@ TEST(OpFusionPass, ElementWise_Fusion_1) { CHECK_EQ(graph->fusion_groups.size(), 1); } -TEST(OpFusionPass, Brodcast_Test_0) { +TEST(OpFusionPass, Broadcast_Test_0) { int h = 32, w = 32; - NetBuilder net_builder("Brodcast_Test_0"); + NetBuilder net_builder("Broadcast_Test_0"); // create model { auto A = net_builder.CreateInput(Float(32), {w}, "A"); @@ -89,9 +89,9 @@ TEST(OpFusionPass, Brodcast_Test_0) { CHECK_EQ(graph->fusion_groups.size(), 1); } -TEST(OpFusionPass, Brodcast_Test_1) { +TEST(OpFusionPass, Broadcast_Test_1) { int h = 32, w = 32; - NetBuilder net_builder("Brodcast_Test_1"); + NetBuilder net_builder("Broadcast_Test_1"); // create model { auto A = net_builder.CreateInput(Float(32), {w}, "A"); @@ -114,9 +114,9 @@ TEST(OpFusionPass, Brodcast_Test_1) { CHECK_EQ(graph->fusion_groups.size(), 1); } -TEST(OpFusionPass, 
Brodcast_Test_2) { +TEST(OpFusionPass, Broadcast_Test_2) { int n = 2, c = 16, h = 32, w = 32; - NetBuilder net_builder("Brodcast_Test_2"); + NetBuilder net_builder("Broadcast_Test_2"); // create model { auto A = net_builder.CreateInput(Float(32), {c}, "A"); diff --git a/paddle/cinn/hlir/pe/ir_schedule_pe.cc b/paddle/cinn/hlir/pe/ir_schedule_pe.cc index 2c27c98d5faf9..36052d25f8a44 100644 --- a/paddle/cinn/hlir/pe/ir_schedule_pe.cc +++ b/paddle/cinn/hlir/pe/ir_schedule_pe.cc @@ -611,7 +611,7 @@ void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch, // NOLINT } // bind block and thread for reduce. - // as outer loop range should be eqaul, get loop size. + // as outer loop range should be equal, get loop size. auto b_loop = ir::GetLoopExtent(ir_sch.GetLoops(out->name)[0]); // reduce_tmp_out { @@ -784,7 +784,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch, // NOLINT } return loop_var_count; } - LOG(FATAL) << "Can't find var in tensor indeces!"; + LOG(FATAL) << "Can't find var in tensor indexes!"; }; auto loop_var_count = get_loop_index(ir_sch.GetLoops(reduce_out->name).back(), ir_sch.GetBlock(reduce_out->name)); diff --git a/paddle/cinn/hlir/pe/transform.cc b/paddle/cinn/hlir/pe/transform.cc index 324e886195f60..2e78caca83206 100644 --- a/paddle/cinn/hlir/pe/transform.cc +++ b/paddle/cinn/hlir/pe/transform.cc @@ -1174,7 +1174,7 @@ ir::Tensor SliceAssign(const ir::Tensor& input, new_strides[i] = -new_strides[i]; } else { CHECK_LT(new_starts[i], new_ends[i]) - << "[ends] shoould greater than [starts] when [strides] > 0"; + << "[ends] should be greater than [starts] when [strides] > 0"; } } From 1b68a51dbdc6b4e93a0c8e28df74e8d881272501 Mon Sep 17 00:00:00 2001 From: JYChen Date: Mon, 26 Feb 2024 11:23:20 +0800 Subject: [PATCH 69/82] fix shape error in combine-getitem (#61922) --- paddle/fluid/pybind/eager_method.cc | 10 ++++---- python/paddle/base/variable_index.py | 4 ++-- test/indexing/test_getitem.py | 35 ++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 9dc8897a10a41..09fb067f41dee 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1404,14 +1404,14 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, if (pos_of_new_dim != 0) { std::vector perm(out.shape().size(), 0); - int tmp1 = pos_of_new_dim, tmp2 = 0, + int tmp1 = rank_of_new_dim, tmp2 = 0, tmp3 = pos_of_new_dim + rank_of_new_dim; for (int i = 0; i < static_cast(out.shape().size()); ++i) { - if (i < rank_of_new_dim) { + if (i < pos_of_new_dim) { perm[i] = - tmp1++; // range(pos_of_new_dim, pos_of_new_dim + rank_of_new_dim) + tmp1++; // range(rank_of_new_dim, pos_of_new_dim + rank_of_new_dim) - } else if (i >= rank_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { - perm[i] = tmp2++; // range(0, pos_of_new_dim) + } else if (i >= pos_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { + perm[i] = tmp2++; // range(0, rank_of_new_dim) } else { perm[i] = tmp3++; // range(pos_of_new_dim + rank_of_new_dim, out.ndim) } diff --git a/python/paddle/base/variable_index.py b/python/paddle/base/variable_index.py index 6ccfe1c6164d2..0d7704272df61 100644 --- a/python/paddle/base/variable_index.py +++ b/python/paddle/base/variable_index.py @@ -908,8 +908,8 @@ def _getitem_static(x, indices): if pos_of_new_dim != 0: perm = ( - list(range(pos_of_new_dim, pos_of_new_dim + rank_of_new_dim)) - + list(range(0, pos_of_new_dim)) + list(range(rank_of_new_dim,
pos_of_new_dim + rank_of_new_dim)) + + list(range(0, rank_of_new_dim)) + list(range(pos_of_new_dim + rank_of_new_dim, out.ndim)) ) out = out.transpose(perm) diff --git a/test/indexing/test_getitem.py b/test/indexing/test_getitem.py index f840042c57e09..bf700e4986e57 100644 --- a/test/indexing/test_getitem.py +++ b/test/indexing/test_getitem.py @@ -234,6 +234,26 @@ def test_combined_index_11(self): np.testing.assert_allclose(y.numpy(), np_res) + def test_combined_index_12(self): + np_data = ( + np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)).astype(self.ndtype) + ) + + if self.dtype == 'bfloat16': + np_data = convert_uint16_to_float(convert_float_to_uint16(np_data)) + if self.dtype == 'complex64' or self.dtype == 'complex128': + np_data = np_data + 1j * np_data + + np_res = np_data[:, :, [2, 4], :] + + x = paddle.to_tensor(np_data, dtype=self.dtype) + y = x[:, :, [2, 4], :] + + if self.dtype == 'bfloat16': + y = paddle.cast(y, dtype='float32') + + np.testing.assert_allclose(y.numpy(), np_res) + def test_index_has_range(self): np_data = ( np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)).astype(self.ndtype) @@ -982,6 +1002,21 @@ def test_combined_index_11(self): np.testing.assert_allclose(res[0], np_res) + @test_with_pir_api + def test_combined_index_12(self): + np_data = np.arange(3 * 4 * 5 * 6).reshape((3, 4, 5, 6)) + np_res = np_data[:, :, [2, 4], :] + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.to_tensor(np_data) + y = _getitem_static( + x, (slice(None), slice(None), [2, 4], slice(None)) + ) + res = self.exe.run(fetch_list=[y]) + + np.testing.assert_allclose(res[0], np_res) + @test_with_pir_api def test_index_has_range(self): # only one bool tensor with all False From e19e3c9435ee71ac844d78f98a34265ac7a73589 Mon Sep 17 00:00:00 2001 From: feifei-111 <2364819892@qq.com> Date: Mon, 26 Feb 2024 11:23:52 +0800 Subject: [PATCH 70/82] [SOT] rewrite resume function generation (#62012) --- .../executor/function_graph.py | 71 +- .../executor/opcode_executor.py | 850 ++++++++++-------- .../executor/opcode_inline_executor.py | 10 - .../executor/pycode_generator.py | 217 ++--- .../sot/opcode_translator/executor/tracker.py | 2 +- .../instruction_utils/__init__.py | 5 +- .../instruction_utils/instruction_pass.py | 2 + .../instruction_utils/instruction_utils.py | 8 + .../instruction_utils/opcode_analysis.py | 109 +-- test/sot/test_11_jumps.py | 12 + test/sot/test_analysis_inputs.py | 8 +- 11 files changed, 609 insertions(+), 685 deletions(-) diff --git a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py index 8f87e19cd4d28..dc57b252e00c2 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/function_graph.py +++ b/python/paddle/jit/sot/opcode_translator/executor/function_graph.py @@ -39,6 +39,7 @@ from ...utils import ( ENV_SHOW_TRACKERS, NameGenerator, + SotUndefinedVar, inner_error_default_handler, is_inplace_api, is_paddle_api, @@ -140,6 +141,20 @@ def get_params_and_non_param_symbol(*args, **kwargs): return params, non_params +class VariableLoader: + def __init__(self, store_var_info, pycode_gen): + self._store_var_info = store_var_info + self._pycode_gen: PyCodeGen = pycode_gen + + def load(self, var): + if var is SotUndefinedVar(): + self._pycode_gen.gen_load_const(SotUndefinedVar()) + elif isinstance(var, NullVariable): + var.reconstruct(self._pycode_gen) + else: + self._pycode_gen.gen_load(self._store_var_info[var.id]) + + class FunctionGraph: """ 
A Graph representation corresponding to each FunctionFrame @@ -281,17 +296,6 @@ def guard_fn(self) -> Guard: return make_guard(guards) def _restore_origin_opcode(self, stack_vars, store_var_info, instr_idx): - class VariableLoader: - def __init__(self, store_var_info, pycode_gen): - self._store_var_info = store_var_info - self._pycode_gen: PyCodeGen = pycode_gen - - def load(self, var): - if isinstance(var, NullVariable): - var.reconstruct(self._pycode_gen) - return - self._pycode_gen.gen_load(self._store_var_info[var.id]) - origin_instrs = get_instructions(self.pycode_gen._origin_code) is_precall = origin_instrs[instr_idx].opname == "PRECALL" current_idx = instr_idx @@ -308,7 +312,7 @@ def load(self, var): restore_instr_names = restore_instr_names[:-1] self.pycode_gen.extend_instrs(restore_instrs) - nop = self.pycode_gen._add_instr("NOP") + nop = self.pycode_gen.add_instr("NOP") for instr in origin_instrs: if instr.jump_to == origin_instrs[current_idx]: @@ -324,26 +328,21 @@ def load(self, var): name_gen = NameGenerator("__start_compile_saved_orig_") + # this does not update changed values, it just gives names to stack vars + # and keeps the same interface as _build_compile_fn_with_name_store for var in stack_vars[::-1]: - store_var_info[var.id] = name_gen.next() - self.pycode_gen.gen_store_fast(store_var_info[var.id]) + if store_var_info[var.id] is None: + store_var_info[var.id] = name_gen.next() + self.pycode_gen.gen_store_fast(store_var_info[var.id]) + else: + self.pycode_gen.gen_store( + store_var_info[var.id], self.pycode_gen._origin_code + ) return VariableLoader(store_var_info, self.pycode_gen) - def _build_compile_fn_with_name_store(self, to_store_vars): - class VariableLoader: - def __init__(self, index_for_load, pycode_gen): - self._index_for_load = index_for_load - self._pycode_gen: PyCodeGen = pycode_gen - - def load(self, var, allow_push_null=True): - if isinstance(var, NullVariable): - var.reconstruct(self._pycode_gen) - return - self._pycode_gen.gen_load(self._index_for_load[var.id]) - + def _build_compile_fn_with_name_store(self, to_store_vars, store_var_info): # var_id -> local_name mapping - index_for_load = {} to_store_vars = list( filter(lambda x: not isinstance(x, NullVariable), to_store_vars) ) @@ -351,19 +350,15 @@ def load(self, var, allow_push_null=True): name_gen = NameGenerator("__start_compile_saved_") for var in to_store_vars[::-1]: - index_for_load[var.id] = name_gen.next() - - def _log_fn(): - print( - f"[StartCompile] saved var: {index_for_load[var.id]} = ", - var, + if store_var_info[var.id] is None: + store_var_info[var.id] = name_gen.next() + self.pycode_gen.gen_store_fast(store_var_info[var.id]) + else: + self.pycode_gen.gen_store( + store_var_info[var.id], self.pycode_gen._origin_code ) - log_do(4, _log_fn) - - self.pycode_gen.gen_store_fast(index_for_load[var.id]) - - return VariableLoader(index_for_load, self.pycode_gen) + return VariableLoader(store_var_info, self.pycode_gen) def get_compiled_fn(self, *ret_vars): ret_items = [ diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py index e9a985e5b728c..e0ada6a9b74fa 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@
-45,8 +43,7 @@ from ..instruction_utils import ( Instruction, Space, - analysis_inputs, - analysis_used_names_with_space, + analysis_used_names, calc_stack_effect, get_instructions, ) @@ -416,7 +413,34 @@ def transform(self): """ raise NotImplementedError() - def get_var(self, name: str): + def find_space_of_var_name(self, name): + code = self._graph.pycode_gen._origin_code + if name in (code.co_freevars + code.co_cellvars): + return Space.cells + elif name in code.co_varnames: + return Space.locals + elif name in code.co_names: + return Space.globals + else: + return Space.not_found + + def has_var(self, name: str): + space = self.find_space_of_var_name(name) + + if space == Space.locals: + return name in self._locals + elif space == Space.cells: + return name in self._cells + elif space == Space.globals: + return name in set( + chain( + self._globals.keys(), + self._builtins.keys(), + ) + ) + return False + + def get_var(self, name: str, allow_undefined=False): """ Gets the variable with the given name. @@ -438,31 +462,27 @@ def get_var(self, name: str): return self._globals.get(name) elif name in self._builtins.keys(): return self._builtins[name] + elif allow_undefined: + return SotUndefinedVar() else: raise InnerError(f'Can not get var: {name}') - def has_var(self, name: str, space: str = "any"): - if space == "any": - return name in set( - chain( - self._locals.keys(), - self._cells.keys(), - self._globals.keys(), - self._builtins.keys(), - ) - ) - elif space == Space.locals: - return name in self._locals + def set_var(self, name: str, value: VariableBase): + space = self.find_space_of_var_name(name) + + # if the name is newly created, we always place it in locals + if space in (Space.locals, Space.not_found): + self._locals[name] = value elif space == Space.cells: - return name in self._cells + self._cells[name].set_value(value) elif space == Space.globals: - return name in set( - chain( - self._globals.keys(), - self._builtins.keys(), - ) - ) - return False + self._globals[name] = value + + def _find_names_in_space(self, names, space): + target_names = [ + name for name in names if self.find_space_of_var_name(name) in space + ] + return target_names def pop_call_stack_until_self(self): """ @@ -1511,6 +1531,31 @@ def __init__(self, frame: types.FrameType, **kwargs): super().__init__(frame.f_code, graph) Dispatcher.graph = graph + def transform(self): + static_function = get_static_function(self._frame, "eval_frame") + if static_function is not None: + code = self._frame.f_code + inputs = [] + for i in range(code.co_argcount): + arg_name = code.co_varnames[i] + value = self._locals[arg_name] + inputs.append(value) + output = self._graph.call_ast(static_function, *inputs) + if output is not None: + self.stack.push(output) + self.RETURN_VALUE(None) + return ( + CustomCode(self.new_code, self.new_code is None), + self.guard_fn, + ) + self.run() + if self.new_code is self.empty_code: + raise InnerError("OpExecutor return a empty new_code.") + return ( + CustomCode(self.new_code, self.new_code is None), + self.guard_fn, + ) + def cleanup(self): self._graph.pycode_gen = None Dispatcher.graph = None @@ -1560,56 +1605,99 @@ def _prepare_virtual_env(self): ) ) - def gen_compute_in_break_with_name_store(self, restore_names, instr_idx): + def FOR_ITER(self, instr): + iterator = self.stack.pop() + backup_iter_idx = None + + start = self.indexof(instr) + end = self.indexof(instr.jump_to) + for i in range(start, end): + if self._instructions[i].opname == "RETURN_VALUE": + raise FallbackError("Found
RETURN_VALUE in for loop body.") + + self._graph.add_global_guarded_variable(iterator) + + try: + if not isinstance(iterator, SequenceIterVariable): + raise BreakGraphError( + f"Can not simulate iterator of {type(iterator)}." + ) + + backup_iter_idx = iterator.idx + + self._inline_call_for_loop(iterator, instr) + self._lasti = self.indexof(instr.jump_to) + next_instr = self._instructions[self._lasti] + self._lasti += int(next_instr.opname == 'END_FOR') + except BreakGraphError as e: + log(3, f"[BreakGraph] FOR_ITER sim for loop failed for: {e}\n") + if backup_iter_idx: + iterator.idx = backup_iter_idx + self._graph.remove_global_guarded_variable(iterator) + self.stack.push(iterator) + self._break_graph_when_for_loop(iterator, instr) + return Stop(state="BreakGraph") + + def RETURN_VALUE(self, instr: Instruction): + assert ( + len(self.stack) == 1 + ), f"Stack must have one element, but get {len(self.stack)} elements." + ret_val = self.stack.pop() + return self.compile_return(ret_val) + + def RETURN_CONST(self, instr: Instruction): + ret_const = self._co_consts[instr.arg] + return self.compile_return(ret_const) + + def compile_return(self, ret_val): + compile_fn = self._graph.get_compiled_fn(ret_val) + if compile_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): + self.new_code = None + else: + self._graph.start_compile(ret_val) + self._graph.pycode_gen.gen_return() + self.new_code = self._graph.pycode_gen.gen_pycode() + self.guard_fn = self._graph.guard_fn + return Stop(state="Return") + + def get_compute_fn_and_update_changed_vars( + self, restore_names, stack, end_idx + ): """ - branch 1: if the graph size is too small, just run in dygraph - branch 2: if the graph is big enough, create compiled_fn - - This api will generator opcodes in different situation, the generated codes - will do the same thing as origin code. - - restore_names: - the names used in resume functions, branch 2 will restore these values, - branch 1 also need these names for generating opcode, but they are not - needed to be restored - instr_idx: - the index for branch 1 to find the boundary and copy origin opcode + this function will: + 1. add opcodes to self._graph.pycode_gen, which do the same thing as the origin code. + 2. update the values of variables that will be changed in the generated codes + + This api will generate opcodes for different situations, + branch 1: if the graph size is too small, just run in dygraph. + branch 2: if the graph is big enough, create compiled_fn. + + Params: + restore_names: the names used in resume functions. + end_idx: instruction index where the simulation breaks.
+ stack: current stack """ - # if we want get compiled fn, and do not do ast twice, - # we must give retval to get_compiled_fn which strictly same as start_compile - store_vars = list(self.stack) - store_var_info = {} + store_vars = list(stack) + store_var_info = {var.id: None for var in stack} for name in restore_names: - _var = self.get_var(name) - if _var not in self.stack: + _var = self.get_var(name, allow_undefined=True) + if _var is SotUndefinedVar(): + continue + if _var not in stack: store_vars.append(_var) - store_var_info[_var.id] = name + store_var_info[_var.id] = name compile_fn = self._graph.get_compiled_fn(*store_vars) if compile_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): return self._graph._restore_origin_opcode( - list(self.stack), store_var_info, instr_idx + list(stack), store_var_info, end_idx ) else: - return self._graph._build_compile_fn_with_name_store(store_vars) - - def _create_resume_fn(self, index, stack_size): - """ - Create a resume function and its inputs at the specified index. - - Args: - index: The index at which the resume function is created. - stack_size: The size of the stack. - - Returns: - The resume function and its inputs. - - """ - pycode_gen = PyCodeGen(self._frame) - fn, inputs = pycode_gen.gen_resume_fn_at(index, stack_size) - return fn, inputs + return self._graph._build_compile_fn_with_name_store( + store_vars, store_var_info + ) @fallback_when_occur_error def _break_graph_when_if(self, result: TensorVariable, instr: Instruction): @@ -1622,66 +1710,105 @@ def _break_graph_when_if(self, result: TensorVariable, instr: Instruction): """ self._graph.add_global_guarded_variable(result) - # minus the bool value - stack_size = len(self.stack) - 1 - # gen call static fn opcode - if_fn, if_inputs = self._create_resume_fn( - self.indexof(instr) + 1, stack_size + # 1. analyse info + cur_index = self.indexof(instr) + true_fn_start_index = cur_index + 1 + false_fn_start_index = self.indexof(instr.jump_to) + stack_size_after_if = len(self.stack) - 1 + + # 2. create true_fn and false_fn + def create_if_branch_fn(start_idx, input_var_names): + if self._instructions[start_idx].opname == "RETURN_VALUE": + return None + pycode_gen = PyCodeGen(self._frame) + origin_instrs = get_instructions(pycode_gen._origin_code) + pycode_gen.set_function_inputs( + input_var_names, stack_size=stack_size_after_if + ) + pycode_gen.extend_instrs(origin_instrs[start_idx:]) + # the resume_fn contains return code, so we don't need to set outputs here + # global vars are updated correctly, and needed local vars will be returned + resume_fn = pycode_gen.create_function() + return resume_fn + + true_fn_read_names, _ = analysis_used_names( + self._instructions, self.indexof(instr) + 1 + ) + true_fn_input_var_names = self._find_names_in_space( + true_fn_read_names, (Space.locals, Space.cells) + ) + + true_fn = create_if_branch_fn( + start_idx=true_fn_start_index, + input_var_names=true_fn_input_var_names, + ) + + false_fn_read_names, _ = analysis_used_names( + self._instructions, self.indexof(instr.jump_to) + ) + false_fn_input_var_names = self._find_names_in_space( + false_fn_read_names, (Space.locals, Space.cells) ) - else_fn, else_inputs = self._create_resume_fn( - self.indexof(instr.jump_to), stack_size + + false_fn = create_if_branch_fn( + start_idx=false_fn_start_index, + input_var_names=false_fn_input_var_names, ) - inputs_names = if_inputs | else_inputs
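On a plain Python function, the `true_fn`/`false_fn` pair built above amounts to splitting everything after the conditional jump into two resume functions over the same inputs; the generated bytecode then dispatches between them. A hand-written equivalent of that split (names illustrative, not actual SOT output):

    # What the transform is morally equivalent to, for:
    #     def f(x):
    #         if x > 0: y = x + 1
    #         else:     y = x - 1
    #         return y
    def true_fn(x):
        y = x + 1
        return y

    def false_fn(x):
        y = x - 1
        return y

    def f(x):
        # the compiled sub-graph runs first, then the POP_JUMP-style dispatch
        return true_fn(x) if x > 0 else false_fn(x)

    assert f(2) == 3 and f(-2) == -3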
+ # 3. set up vars which are created in the branches as Undefined + for name in true_fn_input_var_names[:-1]: + if not self.has_var(name): + self._graph.pycode_gen.gen_load_const(SotUndefinedVar()) + self._graph.pycode_gen.gen_store(name, self._code) + for name in false_fn_input_var_names: + if not self.has_var(name): + self._graph.pycode_gen.gen_load_const(SotUndefinedVar()) + self._graph.pycode_gen.gen_store(name, self._code) - var_loader = self.gen_compute_in_break_with_name_store( - inputs_names, self.indexof(instr) + # 4. compile codes before if + update_var_names = list(true_fn_read_names | false_fn_read_names) + var_loader = self.get_compute_fn_and_update_changed_vars( + update_var_names, self.stack, cur_index ) + # 5. create the if structure and call true_fn and false_fn var_loader.load(result) - # the result is used by if opcode, and should not be input of resume_fn - self.stack.pop() + if_code = self._graph.pycode_gen.add_instr(instr.opname) - # gen call if/else resume fn opcode - if if_fn is not None: - self._graph.pycode_gen.gen_load_object( - if_fn, if_fn.__code__.co_name - ) - insert_index = len(self._graph.pycode_gen._instructions) - 1 - for i, stack_arg in enumerate(self.stack): - var_loader.load(stack_arg) - for name in if_inputs: - var_loader.load(self.get_var(name)) - self._graph.pycode_gen.gen_call_function( - argc=if_fn.__code__.co_argcount, - ) - self._graph.pycode_gen.gen_return() - else: - insert_index = len(self._graph.pycode_gen._instructions) - 1 - self._graph.pycode_gen.gen_return() + assert true_fn is not None - if else_fn is not None: - self._graph.pycode_gen.gen_load_object( - else_fn, else_fn.__code__.co_name + self._graph.pycode_gen.gen_load_object( + true_fn, true_fn.__code__.co_name + ) + for stack_arg in list(self.stack)[:-1]: + var_loader.load(stack_arg) + + for name in true_fn_input_var_names: + var_loader.load(self.get_var(name, allow_undefined=True)) + + self._graph.pycode_gen.gen_call_function( + argc=true_fn.__code__.co_argcount, + ) + self._graph.pycode_gen.gen_return() + + if false_fn is not None: + false_start_code = self._graph.pycode_gen.gen_load_object( + false_fn, false_fn.__code__.co_name ) - jump_to = self._graph.pycode_gen._instructions[-1] - for i, stack_arg in enumerate(self.stack): + for stack_arg in list(self.stack)[:-1]: var_loader.load(stack_arg) - for name in else_inputs: - var_loader.load(self.get_var(name)) + for name in false_fn_input_var_names: + var_loader.load(self.get_var(name, allow_undefined=True)) + self._graph.pycode_gen.gen_call_function( - argc=else_fn.__code__.co_argcount, + argc=false_fn.__code__.co_argcount, ) self._graph.pycode_gen.gen_return() else: - self._graph.pycode_gen.gen_return() - jump_to = self._graph.pycode_gen._instructions[-1] + false_start_code = self._graph.pycode_gen.gen_return() - # gen jump opcode - self._graph.pycode_gen._insert_instr( - insert_index, instr.opname, jump_to=jump_to - ) + if_code.jump_to = false_start_code self.new_code = self._graph.pycode_gen.gen_pycode() self.guard_fn = self._graph.guard_fn @@ -1702,41 +1829,60 @@ def _break_graph_when_call( push_n: The number of elements to be pushed onto the stack. """ + self.stack = origin_stack +
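Conceptually, the bytecode emitted for a call-site break composes three pieces: replay the compiled sub-graph, execute the breaking call with real Python semantics, then hand the result to a resume function covering the rest of the frame. A toy composition under those assumptions (every name is illustrative):

    # Toy composition of sub-graph + breaking call + resume function.
    def compiled_subgraph(x):
        return abs, x * 2            # callable and its argument, as on the stack

    def resume_fn(call_result):
        return call_result + 1       # the rest of the original function

    def composed(x):
        fn, arg = compiled_subgraph(x)   # 1. recover the stack
        result = fn(arg)                 # 2. run the breaking CALL eagerly
        return resume_fn(result)         # 3. resume the remaining bytecode

    assert composed(-3) == 7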
+ # 1. collect information push_n = push_n(instr.arg) if callable(push_n) else push_n is_precall = instr.opname == "PRECALL" - index = self.indexof(instr) + cur_index = self.indexof(instr) # Use CALL instead of PRECALL to calculate the real stack effect - call_instr = self._instructions[index + int(is_precall)] + call_instr = self._instructions[cur_index + int(is_precall)] # skip CALL if current instr is PRECALL - next_index = index + 1 + int(is_precall) - self.stack = origin_stack - - # gen call static fn opcode + next_index = cur_index + 1 + int(is_precall) + stack_effect = calc_stack_effect(call_instr) + pop_n = push_n - stack_effect + stack_size_after_call = len(self.stack) - pop_n + push_n - resume_input_name = analysis_inputs(self._instructions, next_index) + # 2. create resume function + read_names, _ = analysis_used_names(self._instructions, next_index) - var_loader = self.gen_compute_in_break_with_name_store( - resume_input_name, index + input_var_names = self._find_names_in_space( + read_names, (Space.locals, Space.cells) ) - # gen graph break call fn opcode - stack_effect = calc_stack_effect(call_instr) - pop_n = push_n - stack_effect + def create_resume_fn(): + if self._instructions[next_index].opname == "RETURN_VALUE": + return None + pycode_gen = PyCodeGen(self._frame) + origin_instrs = get_instructions(pycode_gen._origin_code) + pycode_gen.set_function_inputs( + input_var_names, stack_size=stack_size_after_call + ) + pycode_gen.extend_instrs(origin_instrs[next_index:]) + # the resume_fn contains return code, so we don't need to set outputs here + # global vars are updated correctly, and needed local vars will be returned + resume_fn = pycode_gen.create_function() + return resume_fn - for i, stack_arg in enumerate(self.stack): + resume_fn = create_resume_fn() + + # 3. compile sub graph before call + var_loader = self.get_compute_fn_and_update_changed_vars( + read_names, self.stack, cur_index + ) + + # 4. recover stack + for stack_arg in self.stack: var_loader.load(stack_arg) - # gen call resume fn opcode + # 5. run the breaking CALL with the original Python instructions # NOTE(SigureMo): In Python 3.11,we need generate KW_NAMES if the call shape is not None. self._graph.pycode_gen.gen_kw_names(self._call_shape) self._graph.pycode_gen.extend_instrs( - self._instructions[index:next_index] + self._instructions[cur_index:next_index] ) - self.stack.pop_n(pop_n) - stack_size = len(self.stack) + push_n - - resume_fn, _ = self._create_resume_fn(next_index, stack_size) + # 6. run resume fn if resume_fn: self._graph.pycode_gen.gen_load_object( resume_fn, resume_fn.__code__.co_name ) # NOTE(zrr1999): We need to shift the resume_fn under its arguments. # In Python 3.11+, NULL + resume_fn should be shifted together.
shift_n = 2 if sys.version_info >= (3, 11) else 1 - self._graph.pycode_gen.gen_shift_n(shift_n, stack_size + shift_n) - for name in resume_input_name: - var_loader.load(self.get_var(name)) + self._graph.pycode_gen.gen_shift_n( + shift_n, stack_size_after_call + shift_n + ) + for name in input_var_names: + var_loader.load(self.get_var(name, allow_undefined=True)) self._graph.pycode_gen.gen_call_function( argc=resume_fn.__code__.co_argcount, ) @@ -1757,112 +1905,14 @@ def _break_graph_when_call( self.new_code = self._graph.pycode_gen.gen_pycode() self.guard_fn = self._graph.guard_fn - def transform(self): - static_function = get_static_function(self._frame, "eval_frame") - if static_function is not None: - code = self._frame.f_code - inputs = [] - for i in range(code.co_argcount): - arg_name = code.co_varnames[i] - value = self._locals[arg_name] - inputs.append(value) - output = self._graph.call_ast(static_function, *inputs) - if output is not None: - self.stack.push(output) - self.RETURN_VALUE(None) - return ( - CustomCode(self.new_code, self.new_code is None), - self.guard_fn, - ) - self.run() - if self.new_code is self.empty_code: - raise InnerError("OpExecutor return a empty new_code.") - return ( - CustomCode(self.new_code, self.new_code is None), - self.guard_fn, - ) - - def _gen_loop_body_between( - self, inputs: list, for_iter_idx: int, start: int, end: int - ) -> types.FunctionType: - """ - Generates the loop body between the specified indices in the instruction list. - - Args: - inputs: function inputs infos - for_iter_idx (int): For find the for_iter opcode - start (int): The start index of the loop body. - end (int): The end index of the loop body. - - Returns: - tuple: The generated loop body function object and its inputs. - - """ - pycode_gen = PyCodeGen(self._frame) - origin_instrs = get_instructions(pycode_gen._origin_code) - - for_iter = origin_instrs[for_iter_idx] - - # for balance the stack (the loop body will pop iter first before break or return) - # this None is used for replace the iterator obj in stack top - pycode_gen.gen_load_const(None) - - # extend loop body main logic - pycode_gen.extend_instrs(origin_instrs[start:end]) - - # break should jump to this nop - nop_for_break = pycode_gen._add_instr("NOP") - - # need do additional operates when break - pycode_gen.gen_load_const(False) - pycode_gen.gen_store_fast(inputs[-1]) - pycode_gen.gen_load_const(None) # keep stack balance - - # continue should jump to this nop - nop_for_continue = pycode_gen._add_instr("NOP") - pycode_gen.gen_pop_top() - - # relocate jump - out_loop = for_iter.jump_to - for instr in pycode_gen._instructions: - if instr.jump_to == for_iter: - instr.jump_to = nop_for_continue - if instr.jump_to == out_loop: - instr.jump_to = nop_for_break - - # outputs is the same as inputs - pycode_gen.gen_outputs_and_return(inputs) - return pycode_gen.create_fn_with_inputs(inputs) - @fallback_when_occur_error def _break_graph_when_for_loop( self, iterator: VariableBase, for_iter: Instruction ): - ''' - for_iter: the FOR_ITER opcode - - need find out opcodes which unpack value from FOR_ITER, by analysing stack - - case 1: - for i in iter: - - FOR_ITER - STORE_FAST i - - case 2: - for i,j in iter: - - FOR_ITER - UNPACK_SEQUENCE 2 - STORE_FAST i - STORE_FAST j - - TODO: check var is in globals or builtins, only locals considered now - ''' - # 0. prepare sub functions - # 0.1 find the range of loop body + # 1. 
find the range of loop body assert for_iter.jump_to is not None - loop_body_start_idx = self.indexof(for_iter) + 1 + for_iter_idx = self.indexof(for_iter) + loop_body_start_idx = for_iter_idx + 1 loop_body_end_idx = self.indexof(for_iter.jump_to) curent_stack = 1 @@ -1877,122 +1927,170 @@ def _break_graph_when_for_loop( if curent_stack == 0: break - # 0.2 create loop body function - all_used_vars = analysis_used_names_with_space( + # 2. create loop body function + loop_body_read_names, loop_body_write_names = analysis_used_names( self._instructions, loop_body_start_idx, loop_body_end_idx ) - loop_body_inputs = [ - k - for k, v in all_used_vars.items() - if v in (Space.locals, Space.cells) - ] + ["_break_flag"] - - loop_body_fn = self._gen_loop_body_between( - loop_body_inputs, - self.indexof(for_iter), - loop_body_start_idx, - loop_body_end_idx, - ) + loop_body_inputs = self._find_names_in_space( + loop_body_read_names | loop_body_write_names, + (Space.locals, Space.cells), + ) + ["_break_flag"] + loop_body_outputs = list(loop_body_write_names) + ["_break_flag"] - log(3, "[Resumed Function]: break graph in loop create loop body as\n") - log_do(3, lambda: dis.dis(loop_body_fn)) + def create_loop_body(): + pycode_gen = PyCodeGen(self._frame) - # 0.3 create after loop part function, minus 1 for iterator - after_loop_fn, fn_inputs = self._create_resume_fn( - loop_body_end_idx, len(self.stack) - 1 - ) + pycode_gen.set_function_inputs(loop_body_inputs, stack_size=0) - total_inputs = OrderedSet(list(fn_inputs) + list(loop_body_inputs[:-1])) + origin_instrs = get_instructions(pycode_gen._origin_code) + for_iter = origin_instrs[for_iter_idx] - # 1. part before for-loop, start compile - ret_names = [ - name - for name in total_inputs - if name in chain(self._locals, self._cells) - ] + # for balance the stack (the loop body will pop iter first before break or return) + # this None is used for replace the iterator obj in stack top + pycode_gen.gen_load_const(None) + + # extend loop body main logic + pycode_gen.extend_instrs( + origin_instrs[loop_body_start_idx:loop_body_end_idx] + ) + + # break should jump to this nop + nop_for_break = pycode_gen.add_instr("NOP") + + # need do additional operates when break + pycode_gen.gen_load_const(False) + pycode_gen.gen_store_fast(loop_body_inputs[-1]) + pycode_gen.gen_load_const(None) # keep stack balance + + # continue should jump to this nop + nop_for_continue = pycode_gen.add_instr("NOP") + pycode_gen.gen_pop_top() + + # relocate jump + out_loop = for_iter.jump_to + for instr in pycode_gen._instructions: + if instr.jump_to == for_iter: + instr.jump_to = nop_for_continue + if instr.jump_to == out_loop: + instr.jump_to = nop_for_break + + # outputs is the same as inputs + pycode_gen.set_function_outputs(loop_body_outputs) + loop_body_fn = pycode_gen.create_function() + + log( + 3, + "[Resumed Function]: break graph in loop create loop body as\n", + ) + log_do(3, lambda: dis.dis(loop_body_fn)) - var_loader = self.gen_compute_in_break_with_name_store( - ret_names, self.indexof(for_iter) + return loop_body_fn + + loop_body_fn = create_loop_body() + + # 3. create after loop part function, stack size minus 1 for iterator + after_loop_read_names, _ = analysis_used_names( + self._instructions, loop_body_end_idx, len(self._instructions) + ) + after_loop_fn_inputs = self._find_names_in_space( + after_loop_read_names, (Space.locals, Space.cells) ) - # 2. 
restore vars with origin name - for name in ret_names: - var_loader.load(self.get_var(name)) - self._graph.pycode_gen.gen_store(name, self._code) + def create_after_loop_fn(): + if self._instructions[loop_body_end_idx].opname == "RETURN_VALUE": + return None + pycode_gen = PyCodeGen(self._frame) + origin_instrs = get_instructions(pycode_gen._origin_code) + pycode_gen.set_function_inputs( + after_loop_fn_inputs, stack_size=len(self.stack) - 1 + ) + pycode_gen.extend_instrs(origin_instrs[loop_body_end_idx:]) + # the resume_fn contains return code, so we don't need to set outputs here + # global vars are updated correctly, and needed local vars will be returned + after_loop_fn = pycode_gen.create_function() + return after_loop_fn - # 3. setup vars which is created in loop as Undefind - undefined_names = set() + after_loop_fn = create_after_loop_fn() + + # 4. set up vars which are created in the loop as Undefined for name in loop_body_inputs[:-1]: - if not self.has_var(name, all_used_vars[name]): - undefined_names.add(name) + if not self.has_var(name): + self._graph.pycode_gen.gen_load_const(SotUndefinedVar()) + self._graph.pycode_gen.gen_store(name, self._code) + + for name in after_loop_fn_inputs: + if not self.has_var(name): self._graph.pycode_gen.gen_load_const(SotUndefinedVar()) self._graph.pycode_gen.gen_store(name, self._code) - # 4.1 load iterator + # 5. compile sub graph before for-loop + update_names = list(loop_body_read_names | after_loop_read_names) + var_loader = self.get_compute_fn_and_update_changed_vars( + update_names, self.stack, self.indexof(for_iter) + ) + + # 6. prepare a new loop and call loop body + # 6.1. load iterator, it is on the stack, so we can load it with var_loader var_loader.load(iterator) self.stack.pop() - # 4.2 gen FOR_ITER and unpack data + # 6.2. copy FOR_ITER and unpack logic self._graph.pycode_gen.extend_instrs( - self._instructions[self.indexof(for_iter) : loop_body_start_idx] + self._instructions[for_iter_idx:loop_body_start_idx] ) - # 5. call loop body - # 5.1 load loop body + # 6.3 load loop body, prepare inputs and call self._graph.pycode_gen.gen_load_object( loop_body_fn, loop_body_fn.__code__.co_name ) - # 5.2 load loop body inputs for name in loop_body_inputs[:-1]: self._graph.pycode_gen.gen_load(name) - # 5.3 load break flag + # this is the _break_flag self._graph.pycode_gen.gen_load_const(True) - # 5.4 call loop body self._graph.pycode_gen.gen_call_function( argc=loop_body_fn.__code__.co_argcount ) - # 5.5 unpack and store retval, keep break_flag in stack - self._graph.pycode_gen.gen_unpack_sequence(len(loop_body_inputs)) + # 7. unpack and update changed vars, keep break_flag in stack + self._graph.pycode_gen.gen_unpack_sequence(len(loop_body_outputs)) - for name in loop_body_inputs[:-1]: + for name in loop_body_outputs[:-1]: self._graph.pycode_gen.gen_store(name, self._code) - # 6. add jump if break + # 8. create the tail of the for loop: jump back to FOR_ITER + # and handle the break case jump_if_break = self._graph.pycode_gen.gen_pop_jump( direction=JumpDirection.FORWARD, suffix=PopJumpCond.FALSE ) - # 7. jump back to FOR_ITER self._graph.pycode_gen.gen_jump( for_iter, direction=JumpDirection.BACKWARD ) - nop = self._graph.pycode_gen._add_instr("NOP") + nop = self._graph.pycode_gen.add_instr("NOP") for_iter.jump_to = nop jump_if_break.jump_to = nop - # 8. call after_loop_fn - self._graph.pycode_gen.gen_load_object( - after_loop_fn, after_loop_fn.__code__.co_name - ) + # 9.
prepare inputs and call after_loop_fn + if after_loop_fn is not None: + self._graph.pycode_gen.gen_load_object( + after_loop_fn, after_loop_fn.__code__.co_name + ) - for stack_arg in self.stack: - var_loader.load(stack_arg) - for name in fn_inputs: - if not self.has_var(name) and name not in undefined_names: - undefined_names.add(name) - self._graph.pycode_gen.gen_load_const(SotUndefinedVar()) - self._graph.pycode_gen.gen_store(name, self._code) - self._graph.pycode_gen.gen_load(name) + for stack_arg in self.stack: + var_loader.load(stack_arg) - self._graph.pycode_gen.gen_call_function( - argc=after_loop_fn.__code__.co_argcount - ) + for name in after_loop_fn_inputs: + self._graph.pycode_gen.gen_load(name) + + self._graph.pycode_gen.gen_call_function( + argc=after_loop_fn.__code__.co_argcount + ) + # return what after_loop_fn returns self._graph.pycode_gen.gen_return() + self.new_code = self._graph.pycode_gen.gen_pycode() self.guard_fn = self._graph.guard_fn @@ -2000,135 +2098,95 @@ def _inline_call_for_loop( self, iterator: VariableBase, for_iter: Instruction ): assert for_iter.jump_to is not None - pycode_gen = PyCodeGen(self._frame) - origin_instrs = get_instructions(pycode_gen._origin_code) + # 1. analyse input and output start_idx = self.indexof(for_iter) end_idx = self.indexof(for_iter.jump_to) - all_used_vars = analysis_used_names_with_space( - origin_instrs, start_idx, end_idx + read_names, write_names = analysis_used_names( + self._instructions, start_idx, end_idx ) - inputs = [ - k - for k, v in all_used_vars.items() - if v in (Space.locals, Space.cells) - ] + [iterator.id] + # why add write_names as input? check case in test/sot/test_12_for_loop.py + # test_for_without_zero_iter + input_var_names = self._find_names_in_space( + read_names | write_names, (Space.locals, Space.cells) + ) + [iterator.id] + output_var_names = list(write_names) + [iterator.id] - # 1. load iter - pycode_gen.gen_load_fast(iterator.id) + # 2. create inline call loop fn + def create_inline_call_fn(): + pycode_gen = PyCodeGen(self._frame) + origin_instrs = get_instructions(pycode_gen._origin_code) - # 2. copy main logic - pycode_gen.extend_instrs(origin_instrs[start_idx:end_idx]) + pycode_gen.set_function_inputs(input_var_names, stack_size=0) - # 3. add break, continue marker and relocate jump - for_iter_instr = origin_instrs[start_idx] - assert for_iter_instr.jump_to is not None - out_loop_instr = for_iter_instr.jump_to + # 2.1. load iter, it is an input of the loop fn + pycode_gen.gen_load_fast(iterator.id) - pycode_gen.gen_jump(out_loop_instr, direction=JumpDirection.FORWARD) - nop_for_continue = pycode_gen._add_instr("NOP") + # 2.2. copy main logic + pycode_gen.extend_instrs(origin_instrs[start_idx:end_idx]) - jump = pycode_gen.gen_jump( - for_iter_instr, direction=JumpDirection.BACKWARD - ) + # 2.3.
add break, continue marker and relocate jump + for_iter_instr = origin_instrs[start_idx] + assert for_iter_instr.jump_to is not None + out_loop_instr = for_iter_instr.jump_to - nop_for_break = pycode_gen._add_instr("NOP") + pycode_gen.gen_jump(out_loop_instr, direction=JumpDirection.FORWARD) + nop_for_continue = pycode_gen.add_instr("NOP") - for instr in pycode_gen._instructions: - if instr.jump_to == for_iter_instr: - instr.jump_to = nop_for_continue + jump = pycode_gen.gen_jump( + for_iter_instr, direction=JumpDirection.BACKWARD + ) - if ( - instr.jump_to in origin_instrs - and origin_instrs.index(instr.jump_to) >= end_idx - ): - instr.jump_to = nop_for_break + nop_for_break = pycode_gen.add_instr("NOP") - jump.jump_to = for_iter_instr - pycode_gen.gen_outputs_and_return(inputs) - inline_call_fn = pycode_gen.create_fn_with_inputs(inputs) + # 2.4. relocate jumps + for instr in pycode_gen._instructions: + if instr.jump_to == for_iter_instr: + instr.jump_to = nop_for_continue - log( - 3, - f"[Resumed Function]: Inline call for loop function {inline_call_fn.__code__.co_name}\n", - ) - log_do(3, lambda: dis.dis(inline_call_fn)) + if ( + instr.jump_to in origin_instrs + and origin_instrs.index(instr.jump_to) >= end_idx + ): + instr.jump_to = nop_for_break + + jump.jump_to = for_iter_instr + + pycode_gen.set_function_outputs(output_var_names) + inline_call_fn = pycode_gen.create_function() - # TODO: update globals builtins + log( + 3, + f"[Resumed Function]: Inline call for loop function {inline_call_fn.__code__.co_name}\n", + ) + log_do(3, lambda: dis.dis(inline_call_fn)) + + return inline_call_fn + + inline_call_fn = create_inline_call_fn() + + # 3. create function variable fn = UserDefinedFunctionVariable( inline_call_fn, self._graph, DanglingTracker(), ) + # 4. prepare input data and call input_vars = [ - self.get_var(name) - if self.has_var(name, all_used_vars[name]) - else SotUndefinedVar() - for name in inputs[:-1] + self.get_var(name, allow_undefined=True) + for name in input_var_names[:-1] ] + [iterator] + ret = fn(*input_vars) - # slice_variable is [:-1] + + # 5. update changed vars slice_const = slice(None, -1, None) slice_variable = SliceVariable( slice_const, self._graph, ConstTracker(slice_const) ) - for name, val in zip(inputs[:-1], ret[slice_variable]): - self._locals[name] = val - - def FOR_ITER(self, instr): - iterator = self.stack.pop() - backup_iter_idx = None - - start = self.indexof(instr) - end = self.indexof(instr.jump_to) - for i in range(start, end): - if self._instructions[i].opname == "RETURN_VALUE": - raise FallbackError("Found RETURN_VALUE in for loop body.") - - self._graph.add_global_guarded_variable(iterator) - - try: - if not isinstance(iterator, SequenceIterVariable): - raise BreakGraphError( - f"Can not simulate iterator of {type(iterator)}." - ) - - backup_iter_idx = iterator.idx - - self._inline_call_for_loop(iterator, instr) - self._lasti = self.indexof(instr.jump_to) - next_instr = self._instructions[self._lasti] - self._lasti += int(next_instr.opname == 'END_FOR') - except BreakGraphError as e: - log(3, f"[BreakGraph] FOR_ITER sim for loop failed for: {e}\n") - if backup_iter_idx: - iterator.idx = backup_iter_idx - self._graph.remove_global_guarded_variable(iterator) - self.stack.push(iterator) - self._break_graph_when_for_loop(iterator, instr) - return Stop(state="BreakGraph") - - def RETURN_VALUE(self, instr: Instruction): - assert ( - len(self.stack) == 1 - ), f"Stack must have one element, but get {len(self.stack)} elements."
- ret_val = self.stack.pop() - return self.compile_return(ret_val) - - def RETURN_CONST(self, instr: Instruction): - ret_const = self._co_consts[instr.arg] - return self.compile_return(ret_const) - def compile_return(self, ret_val): - compile_fn = self._graph.get_compiled_fn(ret_val) - if compile_fn.graph_size() < ENV_MIN_GRAPH_SIZE.get(): - self.new_code = None - else: - self._graph.start_compile(ret_val) - self._graph.pycode_gen.gen_return() - self.new_code = self._graph.pycode_gen.gen_pycode() - self.guard_fn = self._graph.guard_fn - return Stop(state="Return") + for name, var in zip(output_var_names[:-1], ret[slice_variable]): + self.set_var(name, var) diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py index 3832d05f04448..306166aa7d872 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_inline_executor.py @@ -299,16 +299,6 @@ def _break_graph_when_if(self, result, instr: Instruction): "OpcodeInlineExecutor want break graph when simulate `if`." ) - def _create_resume_fn(self, index: int, stack_size: int = 0): - """ - Helper method to create a resume function for the executor. - - Args: - index (int): The index of the instruction to resume execution from. - stack_size (int, optional): The size of the stack. Defaults to 0. - """ - raise BreakGraphError("_create_resume_fn.") - def FOR_ITER(self, instr: Instruction): iterator = self.stack.top assert isinstance(iterator, IterVariable) diff --git a/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py b/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py index 69e174818d662..2ada3f7228f11 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py +++ b/python/paddle/jit/sot/opcode_translator/executor/pycode_generator.py @@ -39,11 +39,9 @@ no_eval_frame, ) from ..instruction_utils import ( - analysis_inputs, apply_instr_pass, calc_stack_effect, gen_instr, - get_instructions, instrs_info, modify_instrs, modify_vars, @@ -437,6 +435,42 @@ def __init__( self.hooks = [] if self.disable_eval_frame: self.gen_disable_eval_frame() + self.fn_name = ResumeFnNameFactory().next() + + def set_function_inputs(self, inputs: list[str], stack_size: int): + stack_arg_str = self.fn_name + '_stack_{}' + + self._code_options['co_argcount'] = len(inputs) + stack_size + self._code_options['co_varnames'] = list( + [stack_arg_str.format(i) for i in range(stack_size)] + + inputs + + [ + var_name + for var_name in self._origin_code.co_varnames + if var_name not in inputs + ] + ) + + self._instructions.extend( + [ + gen_instr('LOAD_FAST', argval=stack_arg_str.format(i)) + for i in range(stack_size) + ] + ) + + def set_function_outputs(self, outputs: list[str]): + for name in outputs: + self.gen_load(name) + self.gen_build_tuple(len(outputs)) + self.gen_return() + + def create_function(self) -> types.FunctionType: + self.update_code_name(self.fn_name, is_resumed_fn=True) + new_code = self.gen_pycode() + if len(new_code.co_freevars) + len(new_code.co_cellvars) > 0: + raise FallbackError("Break graph in closure is not support.") + fn = types.FunctionType(new_code, self._f_globals, new_code.co_name) + return fn def insert_prefix_instructions(self): """ @@ -509,58 +543,6 @@ def gen_pycode(self) -> types.CodeType: return new_code - def gen_resume_fn_at( - self, index: int, stack_size: int - ) -> tuple[None | 
types.FunctionType, OrderedSet[str]]: - """ - Generates a resume function at the specified index in the instruction list. - - Args: - index (int): The index in the instruction list to generate the resume function. - stack_size (int): The size of the stack. Defaults to 0. - - Returns: - tuple: The resume function object and the inputs to the function. - - """ - - self._instructions = get_instructions(self._origin_code) - # TODO(dev): could give an example code here? - if self._instructions[index].opname == 'RETURN_VALUE': - return None, OrderedSet() - inputs = analysis_inputs(self._instructions, index) - fn_name = ResumeFnNameFactory().next() - stack_arg_str = fn_name + '_stack_{}' - - self._instructions = ( - [ - gen_instr('LOAD_FAST', argval=stack_arg_str.format(i)) - for i in range(stack_size) - ] - + [gen_instr('JUMP_FORWARD', jump_to=self._instructions[index])] - + self._instructions - ) - - self._code_options['co_argcount'] = len(inputs) + stack_size - # inputs should be at the front of the co_varnames - self._code_options['co_varnames'] = list( - [stack_arg_str.format(i) for i in range(stack_size)] - + list(inputs) - + [ - var_name - for var_name in self._code_options['co_varnames'] - if var_name not in inputs - ] - ) - - self.update_code_name(fn_name, is_resumed_fn=True) - new_code = self.gen_pycode() - if len(new_code.co_freevars) + len(new_code.co_cellvars) > 0: - raise FallbackError("Break graph in closure is not support.") - fn = types.FunctionType(new_code, self._f_globals, new_code.co_name) - - return fn, inputs - @cached_property def global_null_variable(self): from .variables.basic import NullVariable @@ -593,39 +575,6 @@ def gen_enable_eval_frame(self): self.gen_call_function(1) self.gen_pop_top() - def gen_outputs_and_return(self, outputs): - for name in outputs: - self.gen_load(name) - self.gen_build_tuple(len(outputs)) - self.gen_return() - - def create_fn_with_inputs(self, inputs: list) -> types.FunctionType: - """ - Creates a function with specific input and output variables. - - Args: - inputs (list): The input variables. - - Returns: - function: The created function object. - """ - self._code_options['co_argcount'] = len(inputs) - self._code_options['co_varnames'] = list( - list(inputs) - + [ - var_name - for var_name in self._origin_code.co_varnames - if var_name not in inputs - ] - ) - fn_name = ResumeFnNameFactory().next() - self.update_code_name(fn_name, is_resumed_fn=True) - new_code = self.gen_pycode() - if len(new_code.co_freevars) + len(new_code.co_cellvars) > 0: - raise FallbackError("Break graph in closure is not support.") - fn = types.FunctionType(new_code, self._f_globals, new_code.co_name) - return fn - def gen_load_const(self, value: Any): """ Generates instructions to load a constant value. 
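The `set_function_inputs` method introduced in this patch fixes a concrete `co_varnames` layout: generated stack slots come first, then the explicit inputs, then whatever original varnames remain. A standalone toy of that ordering rule (names illustrative; the real logic is the method shown above):

    # Mirror of the co_varnames layout that set_function_inputs produces.
    def layout_varnames(fn_name, inputs, stack_size, origin_varnames):
        stack_args = [f"{fn_name}_stack_{i}" for i in range(stack_size)]
        rest = [v for v in origin_varnames if v not in inputs]
        return stack_args + list(inputs) + rest

    assert layout_varnames("resume_0", ["x", "y"], 2, ["y", "x", "tmp"]) == [
        "resume_0_stack_0", "resume_0_stack_1", "x", "y", "tmp"
    ]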
@@ -636,7 +585,7 @@ def gen_load_const(self, value: Any): if not list_contain_by_id(self._code_options["co_consts"], value): self._code_options["co_consts"].append(value) idx = list_find_index_by_id(self._code_options["co_consts"], value) - self._add_instr("LOAD_CONST", arg=idx, argval=value) + return self.add_instr("LOAD_CONST", arg=idx, argval=value) def gen_print_log(self, message): """print a log""" @@ -745,7 +694,7 @@ def gen_load_global(self, name, push_null=False): idx <<= 1 if push_null: idx |= 1 - self._add_instr("LOAD_GLOBAL", arg=idx, argval=name) + return self.add_instr("LOAD_GLOBAL", arg=idx, argval=name) def gen_load_object(self, obj, obj_name: str, push_null: bool = True): """ @@ -758,14 +707,14 @@ def gen_load_object(self, obj, obj_name: str, push_null: bool = True): if obj_name not in self._f_globals: self._f_globals[obj_name] = obj - self.gen_load_global(obj_name, push_null=push_null) + return self.gen_load_global(obj_name, push_null=push_null) def gen_load_null_variable(self): """ Generate the bytecode for loading a null variable. """ null_var = self.global_null_variable - self.gen_load_object(null_var, "___null_var", push_null=False) + return self.gen_load_object(null_var, "___null_var", push_null=False) def gen_load_fast(self, name): """ @@ -777,7 +726,7 @@ def gen_load_fast(self, name): if name not in self._code_options["co_varnames"]: self._code_options["co_varnames"].append(name) idx = self._code_options["co_varnames"].index(name) - self._add_instr("LOAD_FAST", arg=idx, argval=name) + return self.add_instr("LOAD_FAST", arg=idx, argval=name) def gen_load_deref(self, name): if name not in self.cell_free_storage: @@ -791,7 +740,7 @@ def gen_load_deref(self, name): ).index(name) else: idx = self.cell_free_storage.index(name) - self._add_instr("LOAD_DEREF", arg=idx, argval=name) + return self.add_instr("LOAD_DEREF", arg=idx, argval=name) def gen_load_attr(self, name: str): if name not in self._code_options["co_names"]: @@ -799,49 +748,49 @@ def gen_load_attr(self, name: str): idx = self._code_options["co_names"].index(name) if sys.version_info >= (3, 12): idx <<= 1 - self._add_instr("LOAD_ATTR", arg=idx, argval=name) + return self.add_instr("LOAD_ATTR", arg=idx, argval=name) def gen_store_attr(self, name: str): if name not in self._code_options["co_names"]: self._code_options["co_names"].append(name) idx = self._code_options["co_names"].index(name) - self._add_instr("STORE_ATTR", arg=idx, argval=name) + return self.add_instr("STORE_ATTR", arg=idx, argval=name) def gen_delete_attr(self, name: str): if name not in self._code_options["co_names"]: self._code_options["co_names"].append(name) idx = self._code_options["co_names"].index(name) - self._add_instr("DELETE_ATTR", arg=idx, argval=name) + return self.add_instr("DELETE_ATTR", arg=idx, argval=name) def gen_load_method(self, name: str): if name not in self._code_options["co_names"]: self._code_options["co_names"].append(name) idx = self._code_options["co_names"].index(name) - self._add_instr("LOAD_METHOD", arg=idx, argval=name) + return self.add_instr("LOAD_METHOD", arg=idx, argval=name) def gen_delete_global(self, name: str): if name not in self._code_options["co_names"]: self._code_options["co_names"].append(name) idx = self._code_options["co_names"].index(name) - self._add_instr("DELETE_GLOBAL", arg=idx, argval=name) + return self.add_instr("DELETE_GLOBAL", arg=idx, argval=name) def gen_import_name(self, name: str): if name not in self._code_options["co_names"]: self._code_options["co_names"].append(name) idx = 
self._code_options["co_names"].index(name) - self._add_instr("IMPORT_NAME", arg=idx, argval=name) + return self.add_instr("IMPORT_NAME", arg=idx, argval=name) def gen_store_fast(self, name): if name not in self._code_options["co_varnames"]: self._code_options["co_varnames"].append(name) idx = self._code_options["co_varnames"].index(name) - self._add_instr("STORE_FAST", arg=idx, argval=name) + return self.add_instr("STORE_FAST", arg=idx, argval=name) def gen_store_global(self, name): if name not in self._code_options["co_names"]: self._code_options["co_names"].append(name) idx = self._code_options["co_names"].index(name) - self._add_instr("STORE_GLOBAL", arg=idx, argval=name) + return self.add_instr("STORE_GLOBAL", arg=idx, argval=name) def gen_store_deref(self, name): if name not in self.cell_free_storage: @@ -855,50 +804,50 @@ def gen_store_deref(self, name): ).index(name) else: idx = self.cell_free_storage.index(name) - self._add_instr("STORE_DEREF", arg=idx, argval=name) + return self.add_instr("STORE_DEREF", arg=idx, argval=name) def gen_store_subscr(self): - self._add_instr("STORE_SUBSCR") + return self.add_instr("STORE_SUBSCR") def gen_subscribe(self): - self._add_instr("BINARY_SUBSCR") + return self.add_instr("BINARY_SUBSCR") def gen_build_tuple(self, count): - self._add_instr("BUILD_TUPLE", arg=count, argval=count) + return self.add_instr("BUILD_TUPLE", arg=count, argval=count) def gen_build_list(self, count): - self._add_instr("BUILD_LIST", arg=count, argval=count) + return self.add_instr("BUILD_LIST", arg=count, argval=count) def gen_build_map(self, count): - self._add_instr("BUILD_MAP", arg=count, argval=count) + return self.add_instr("BUILD_MAP", arg=count, argval=count) def gen_build_slice(self, argc): - self._add_instr("BUILD_SLICE", arg=argc, argval=argc) + return self.add_instr("BUILD_SLICE", arg=argc, argval=argc) def gen_unpack_sequence(self, count): - self._add_instr("UNPACK_SEQUENCE", arg=count, argval=count) + return self.add_instr("UNPACK_SEQUENCE", arg=count, argval=count) def gen_call_function(self, argc=0): if sys.version_info >= (3, 11): if sys.version_info < (3, 12): - self._add_instr("PRECALL", arg=argc, argval=argc) - self._add_instr("CALL", arg=argc, argval=argc) + self.add_instr("PRECALL", arg=argc, argval=argc) + self.add_instr("CALL", arg=argc, argval=argc) else: - self._add_instr("CALL_FUNCTION", arg=argc, argval=argc) + self.add_instr("CALL_FUNCTION", arg=argc, argval=argc) def gen_call_function_ex(self, has_kwargs): flag = 0 if has_kwargs: flag |= CALL_FUNCTION_EX_FLAG.CFE_HAS_KWARGS - self._add_instr("CALL_FUNCTION_EX", arg=flag, argval=flag) + self.add_instr("CALL_FUNCTION_EX", arg=flag, argval=flag) def gen_call_method(self, argc=0): if sys.version_info >= (3, 11): if sys.version_info < (3, 12): - self._add_instr("PRECALL", arg=argc, argval=argc) - self._add_instr("CALL", arg=argc, argval=argc) + self.add_instr("PRECALL", arg=argc, argval=argc) + self.add_instr("CALL", arg=argc, argval=argc) else: - self._add_instr("CALL_METHOD", arg=argc, argval=argc) + self.add_instr("CALL_METHOD", arg=argc, argval=argc) def gen_kw_names(self, kw_names: tuple[str, ...] | None): if kw_names is None: @@ -908,22 +857,22 @@ def gen_kw_names(self, kw_names: tuple[str, ...] 
| None): if kw_names not in self._code_options["co_consts"]: self._code_options["co_consts"].append(kw_names) idx = self._code_options["co_consts"].index(kw_names) - self._add_instr("KW_NAMES", arg=idx, argval=kw_names) + self.add_instr("KW_NAMES", arg=idx, argval=kw_names) def gen_pop_top(self): - self._add_instr("POP_TOP") + return self.add_instr("POP_TOP") def gen_rot_n(self, n): if n <= 1: return if sys.version_info >= (3, 11): for i in range(n, 1, -1): - self._add_instr("SWAP", arg=i) + self.add_instr("SWAP", arg=i) elif sys.version_info >= (3, 10): - self._add_instr("ROT_N", arg=n) + self.add_instr("ROT_N", arg=n) else: if n <= 4: - self._add_instr("ROT_" + ["TWO", "THREE", "FOUR"][n - 2]) + self.add_instr("ROT_" + ["TWO", "THREE", "FOUR"][n - 2]) else: def rot_n_fn(n): @@ -937,7 +886,7 @@ def rot_n_fn(n): self.gen_build_tuple(n) self.gen_load_const(rot_n_fn(n)) self.gen_rot_n(2) - self._add_instr("CALL_FUNCTION_EX", arg=0) + self.add_instr("CALL_FUNCTION_EX", arg=0) self.gen_unpack_sequence(n) def gen_shift_n(self, s: int, n: int): @@ -970,7 +919,7 @@ def gen_shift_n(self, s: int, n: int): # NOTE: s=-1, n=3 [1,2,3,4,5] -> [1,2,4,5,3] if s == -1: for i in range(2, n + 1): - self._add_instr("SWAP", arg=i) + self.add_instr("SWAP", arg=i) else: self.gen_shift_n(-1, n) self.gen_shift_n(s + 1, n) @@ -981,7 +930,7 @@ def gen_shift_n(self, s: int, n: int): def gen_swap(self, n): if sys.version_info >= (3, 11): - self._add_instr("SWAP", arg=n) + self.add_instr("SWAP", arg=n) else: raise NotImplementedError("swap is not supported before python3.11") @@ -992,9 +941,9 @@ def gen_jump( direction: JumpDirection = JumpDirection.FORWARD, ) -> Instruction: if sys.version_info >= (3, 11): - return self._add_instr(f"JUMP_{direction.value}", jump_to=jump_to) + return self.add_instr(f"JUMP_{direction.value}", jump_to=jump_to) else: - return self._add_instr("JUMP_ABSOLUTE", jump_to=jump_to) + return self.add_instr("JUMP_ABSOLUTE", jump_to=jump_to) def gen_pop_jump( self, @@ -1004,33 +953,33 @@ def gen_pop_jump( suffix: PopJumpCond = PopJumpCond.NONE, ) -> Instruction: if sys.version_info >= (3, 11): - return self._add_instr( + return self.add_instr( f"POP_JUMP_{direction.value}_IF_{suffix.value}", jump_to=jump_to ) else: - return self._add_instr( + return self.add_instr( f"POP_JUMP_IF_{suffix.value}", jump_to=jump_to ) def gen_return(self): - self._add_instr("RETURN_VALUE") + return self.add_instr("RETURN_VALUE") def gen_get_iter(self): - self._add_instr("GET_ITER") + return self.add_instr("GET_ITER") def gen_operator_only(self, op_name): """ only generator operator instruction, do nothing for operands. """ - self._add_instr(op_name) + return self.add_instr(op_name) def gen_operator(self, op_name): """ only generator operator instruction, do nothing for operands. 
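+        e.g. ``gen_operator("UNARY_NEGATIVE")`` simply emits the named
+        opcode; its operand is assumed to already be on the stack.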
""" - self._add_instr(op_name) + return self.add_instr(op_name) def gen_compare(self, cmp_op): """ @@ -1039,9 +988,9 @@ def gen_compare(self, cmp_op): """ if sys.version_info >= (3, 12): cmp_op <<= 4 - self._add_instr("COMPARE_OP", cmp_op) + return self.add_instr("COMPARE_OP", cmp_op) - def _add_instr(self, *args, **kwargs): + def add_instr(self, *args, **kwargs): instr = gen_instr(*args, **kwargs) self._instructions.append(instr) return instr diff --git a/python/paddle/jit/sot/opcode_translator/executor/tracker.py b/python/paddle/jit/sot/opcode_translator/executor/tracker.py index fd7168f4e5957..51d21a5572129 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/tracker.py +++ b/python/paddle/jit/sot/opcode_translator/executor/tracker.py @@ -393,7 +393,7 @@ def __init__(self, iter_source: VariableBase): def gen_instructions(self, codegen: PyCodeGen): self.iter_source.tracker.gen_instructions(codegen) - codegen._add_instr("GET_ITER") + codegen.add_instr("GET_ITER") def trace_value_from_frame(self): iter_source_tracer = self.iter_source.tracker.trace_value_from_frame() diff --git a/python/paddle/jit/sot/opcode_translator/instruction_utils/__init__.py b/python/paddle/jit/sot/opcode_translator/instruction_utils/__init__.py index 0b9429e078ec7..833fd3c207e88 100644 --- a/python/paddle/jit/sot/opcode_translator/instruction_utils/__init__.py +++ b/python/paddle/jit/sot/opcode_translator/instruction_utils/__init__.py @@ -15,6 +15,7 @@ from .instruction_pass import apply_instr_pass # noqa: F401 from .instruction_utils import ( # noqa: F401 Instruction, + Space, calc_offset_from_bytecode_offset, calc_stack_effect, convert_instruction, @@ -29,7 +30,5 @@ reset_offset, ) from .opcode_analysis import ( # noqa: F401 - Space, - analysis_inputs, - analysis_used_names_with_space, + analysis_used_names, ) diff --git a/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_pass.py b/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_pass.py index 8725aa55c3213..5b0cc17fc808f 100644 --- a/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_pass.py +++ b/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_pass.py @@ -90,6 +90,8 @@ def find_related_local_opcodes(instrs, code_options): if len(stack) > 0 and stack[-1] is not None: opcode_pairs.append((stack[-1], instr)) stack.pop() + elif "ROT" in instr.opname: + return [] else: try: pop_n, push_n = StackAnalyser().stack_effect(instr) diff --git a/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_utils.py b/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_utils.py index 05e6dcfc91e7d..2965c8e6bc056 100644 --- a/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_utils.py +++ b/python/paddle/jit/sot/opcode_translator/instruction_utils/instruction_utils.py @@ -17,6 +17,7 @@ import dataclasses import dis import sys +from enum import Enum from typing import TYPE_CHECKING, Any from ...utils import InnerError @@ -410,3 +411,10 @@ def calc_stack_effect(instr: Instruction, *, jump: bool | None = None) -> int: assert instr.arg is not None return -instr.arg - 1 return dis.stack_effect(instr.opcode, instr.arg, jump=jump) + + +class Space(Enum): + locals = 1 + globals = 2 + cells = 3 + not_found = 4 diff --git a/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py b/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py index f0211167f4449..2e8ded5d2ac5e 100644 --- 
a/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py +++ b/python/paddle/jit/sot/opcode_translator/instruction_utils/opcode_analysis.py @@ -15,11 +15,9 @@ from __future__ import annotations import dataclasses -from enum import Enum from paddle.jit.utils import OrderedSet -from ...utils import InnerError from .instruction_utils import Instruction from .opcode_info import ALL_JUMP, HAS_FREE, HAS_LOCAL, UNCONDITIONAL_JUMP @@ -30,6 +28,11 @@ class State: writes: OrderedSet[str] visited: OrderedSet[int] + def __or__(self, other): + reads = self.reads | other.reads + writes = self.writes | other.writes + return State(reads, writes, OrderedSet()) + def is_read_opcode(opname): if opname in [ @@ -63,7 +66,7 @@ def is_write_opcode(opname): return False -def analysis_inputs( +def analysis_used_names( instructions: list[Instruction], current_instr_idx: int, stop_instr_idx: int | None = None, @@ -97,7 +100,7 @@ def walk(state: State, start: int) -> OrderedSet[str]: end = len(instructions) if stop_instr_idx is None else stop_instr_idx for i in range(start, end): if i in state.visited: - return state.reads + return state state.visited.add(i) instr = instructions[i] @@ -116,104 +119,12 @@ def walk(state: State, start: int) -> OrderedSet[str]: not_jump_branch = ( fork(state, i, False, target_idx) if instr.opname not in UNCONDITIONAL_JUMP - else OrderedSet() - ) - return jump_branch | not_jump_branch - elif instr.opname == "RETURN_VALUE": - return state.reads - return state.reads - - return walk(root_state, current_instr_idx) - - -@dataclasses.dataclass -class SpaceState: - reads: dict[str, Space] - writes: dict[str, Space] - visited: OrderedSet[int] - - def __or__(self, other): - reads = {} - reads.update(other.reads) - reads.update(self.reads) - writes = {} - writes.update(other.writes) - writes.update(self.writes) - return SpaceState(reads, writes, OrderedSet()) - - -class Space(Enum): - locals = 1 - globals = 2 - cells = 3 - all = 4 - - -def get_space(opname: str): - if "FAST" in opname: - return Space.locals - elif "GLOBAL" in opname: - return Space.globals - elif "DEREF" in opname or "CLOSURE" in opname: - return Space.cells - elif "NAME" in opname: - return Space.all - else: - raise InnerError(f"Unknown space for {opname}") - - -def analysis_used_names_with_space( - instructions: list[Instruction], - start_instr_idx: int, - stop_instr_idx: int | None = None, -): - root_state = SpaceState({}, {}, OrderedSet()) - - def fork( - state: SpaceState, start: int, jump: bool, jump_target: int - ) -> SpaceState: - new_start = start + 1 if not jump else jump_target - new_state = SpaceState( - dict(state.reads), - dict(state.writes), - OrderedSet(state.visited), - ) - return walk(new_state, new_start) - - def walk(state: SpaceState, start: int) -> SpaceState: - end = len(instructions) if stop_instr_idx is None else stop_instr_idx - for i in range(start, end): - if i in state.visited: - return state - state.visited.add(i) - - instr = instructions[i] - if instr.opname in HAS_LOCAL | HAS_FREE: - if is_read_opcode(instr.opname) and instr.argval not in ( - state.writes - ): - space = get_space(instr.opname) - state.reads[instr.argval] = space - elif is_write_opcode(instr.opname): - space = get_space(instr.opname) - state.writes[instr.argval] = space - elif instr.opname in ALL_JUMP: - assert instr.jump_to is not None - target_idx = instructions.index(instr.jump_to) - # Fork to two branches, jump or not - jump_branch = fork(state, i, True, target_idx) - not_jump_branch = ( - fork(state, i, 
False, target_idx) - if instr.opname not in UNCONDITIONAL_JUMP - else SpaceState({}, {}, OrderedSet()) + else State(OrderedSet(), OrderedSet(), OrderedSet()) ) return jump_branch | not_jump_branch elif instr.opname == "RETURN_VALUE": return state return state - state = walk(root_state, start_instr_idx) - all_used_vars = {} - all_used_vars.update(state.writes) - all_used_vars.update(state.reads) - return all_used_vars + state = walk(root_state, current_instr_idx) + return state.reads, state.writes diff --git a/test/sot/test_11_jumps.py b/test/sot/test_11_jumps.py index 80fa1f4a4eb02..6073766e8b60f 100644 --- a/test/sot/test_11_jumps.py +++ b/test/sot/test_11_jumps.py @@ -114,5 +114,17 @@ def test_breakgraph(self): self.assert_results(pop_jump_if_not_none, true_tensor, a) +def new_var_in_if(): + x = paddle.to_tensor(1) + if x > 0: + y = 1 + return y + + +class TestCreateVarInIf(TestCaseBase): + def test_case(self): + self.assert_results(new_var_in_if) + + if __name__ == "__main__": unittest.main() diff --git a/test/sot/test_analysis_inputs.py b/test/sot/test_analysis_inputs.py index 20b32c2225324..880de6060d400 100644 --- a/test/sot/test_analysis_inputs.py +++ b/test/sot/test_analysis_inputs.py @@ -20,7 +20,7 @@ import paddle from paddle.jit.sot.opcode_translator.instruction_utils import ( - analysis_inputs, + analysis_used_names, calc_offset_from_bytecode_offset, get_instructions, ) @@ -36,12 +36,12 @@ def assert_inputs_equals(instruction_offset: int, expected_inputs: set[str]): current_instr_idx = calc_offset_from_bytecode_offset( test_frame.f_lasti + 2, instructions ) - actual_inputs = analysis_inputs( + reads, writes = analysis_used_names( instructions, current_instr_idx + instruction_offset ) assert ( - set(actual_inputs) == expected_inputs - ), f"actual_inputs: {actual_inputs}, expected_inputs: {expected_inputs}" + set(reads) == expected_inputs + ), f"actual_inputs: {reads}, expected_inputs: {expected_inputs}" def case1(x): From f4abfbec2489c269ec0082f87f5ba53e90eb5f6e Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Mon, 26 Feb 2024 11:33:48 +0800 Subject: [PATCH 71/82] support decomp swiglu (#62026) * support swiglu decomp * support decomp swiglu * add swiglu test case * update prim op list * fix test case * fix bn test case --- .../decomp_interface_gen_op_list.py | 2 + paddle/fluid/primitive/base/primitive_ops.h | 2 + paddle/fluid/primitive/composite/composite.h | 13 +++++ .../paddle/jit/sot/utils/paddle_api_config.py | 1 + .../test_batch_norm_op_prim_nchw.py | 5 +- .../test_batch_norm_op_prim_nhwc.py | 1 + .../test_prim_sub_graph_dynamic_shape.py | 50 ++++++++++++++++--- 7 files changed, 65 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py index b40e8b4d3dea2..9af8dfa12d702 100644 --- a/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py +++ b/paddle/fluid/pir/dialect/op_generator/decomp_interface_gen_op_list.py @@ -42,6 +42,7 @@ "rsqrt", "sigmoid", "silu", + "swiglu", "softmax", "sqrt", "square", @@ -74,6 +75,7 @@ "rsqrt", "sigmoid", "silu", + "swiglu", "softmax", "sqrt", "square", diff --git a/paddle/fluid/primitive/base/primitive_ops.h b/paddle/fluid/primitive/base/primitive_ops.h index 947970ed92790..d477c32a62258 100644 --- a/paddle/fluid/primitive/base/primitive_ops.h +++ b/paddle/fluid/primitive/base/primitive_ops.h @@ -69,6 +69,7 @@ const std::set& GetPrimitiveOpNames() { 
"pd_op.cos", "pd_op.where", "pd_op.split", + "pd_op.split_with_num", "pd_op.reshape", "pd_op.erf", "pd_op.tanh", @@ -79,6 +80,7 @@ const std::set& GetPrimitiveOpNames() { "pd_op.shape", "pd_op.full", "pd_op.full_int_array", + "pd_op.full_with_tensor", "pd_op.if", "pd_op.while", /* basic ops by PIR*/ diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index 28983fa3cfd63..4fe8ec04a6031 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -335,6 +335,19 @@ Tensor silu_decomp(const Tensor& x) { } } +template +Tensor swiglu_decomp(const Tensor& x, const paddle::optional& y) { + auto y_ptr = y.get_ptr(); + if (y_ptr) { + return silu_decomp(x) * y.get(); + } else { + int axis = x.shape().size() - 1; + int num = 2; + std::vector xs = backend::split_with_num(x, num, axis); + return silu_decomp(xs[0]) * xs[1]; + } +} + template Tensor relu_decomp(const Tensor& x) { return maximum(x, full(empty_shape, 0.0, x.dtype())); diff --git a/python/paddle/jit/sot/utils/paddle_api_config.py b/python/paddle/jit/sot/utils/paddle_api_config.py index e21648f1a6ce6..8a5cde9e65716 100644 --- a/python/paddle/jit/sot/utils/paddle_api_config.py +++ b/python/paddle/jit/sot/utils/paddle_api_config.py @@ -34,6 +34,7 @@ def get_paddle_api(): modules = [ paddle, paddle.nn.functional, + paddle.incubate.nn.functional, paddle.linalg, paddle.signal, paddle.fft, diff --git a/test/legacy_test/test_batch_norm_op_prim_nchw.py b/test/legacy_test/test_batch_norm_op_prim_nchw.py index 3520e1e25aa77..06c42f221dfa2 100644 --- a/test/legacy_test/test_batch_norm_op_prim_nchw.py +++ b/test/legacy_test/test_batch_norm_op_prim_nchw.py @@ -67,7 +67,7 @@ def setUp(self): self.python_out_sig = ["Y"] # (Todo: CZ) random error self.check_prim_pir = False - self.check_prim_pir_grad = True + self.check_prim_pir_grad = False self.check_cpu_prim_pir_grad = False self.initConfig() @@ -284,6 +284,7 @@ def initConfig(self): self.use_global_stats = None self.check_prim_pir = True self.check_cpu_prim_pir_grad = True + self.check_prim_pir_grad = True class TestBatchNormOpNCHWTestModeFp64(TestBatchNormOp): @@ -357,7 +358,7 @@ def initConfig(self): self.use_global_stats = None # Todo(CZ): open this self.check_prim_pir = False - self.check_cpu_prim_pir_grad = True + self.check_cpu_prim_pir_grad = False @unittest.skipIf( diff --git a/test/legacy_test/test_batch_norm_op_prim_nhwc.py b/test/legacy_test/test_batch_norm_op_prim_nhwc.py index 01ad65e41b657..cefacd02b2a91 100644 --- a/test/legacy_test/test_batch_norm_op_prim_nhwc.py +++ b/test/legacy_test/test_batch_norm_op_prim_nhwc.py @@ -131,6 +131,7 @@ def initConfig(self): self.data_format = "NHWC" self.use_global_stats = None self.check_prim_pir = True + self.check_prim_pir_grad = True self.check_cpu_prim_pir_grad = True diff --git a/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py b/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py index 6be76dd54af38..155cfbdeeb268 100644 --- a/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py +++ b/test/prim/pir_prim/test_prim_sub_graph_dynamic_shape.py @@ -84,7 +84,15 @@ def index_sample_net(x, index): return paddle.index_sample(x, index) -class TestPrimOne(unittest.TestCase): +def swiglu_net1(x, y): + return paddle.incubate.nn.functional.swiglu(x, y) + + +def swiglu_net2(x): + return paddle.incubate.nn.functional.swiglu(x) + + +class TestPrimBase(unittest.TestCase): def setUp(self): np.random.seed(2023) self.dtype = "float32" @@ -130,7 +138,7 
@@ def test_prim_all_dynamic(self): np.testing.assert_allclose(ref, actual, rtol=1e-6) -class TestPrimOne2(TestPrimOne): +class TestPrimAny(TestPrimBase): def setUp(self): np.random.seed(2023) self.dtype = "bool" @@ -142,7 +150,7 @@ def setUp(self): self.enable_cinn = False -class TestEmbeddingPrimOne3(TestPrimOne): +class TestEmbedding(TestPrimBase): def setUp(self): np.random.seed(2023) self.dtype = "int" @@ -154,7 +162,7 @@ def setUp(self): self.enable_cinn = False -class TestPrimOne3(TestPrimOne): +class TestPrimFullLike(TestPrimBase): def setUp(self): np.random.seed(2023) self.dtype = "float32" @@ -166,7 +174,7 @@ def setUp(self): self.enable_cinn = False -class TestPrimOne4(TestPrimOne): +class TestPrimStack(TestPrimBase): def setUp(self): np.random.seed(2023) self.dtype = "float32" @@ -178,7 +186,7 @@ def setUp(self): self.enable_cinn = False -class TestPrimOne5(TestPrimOne): +class TestPrimTile(TestPrimBase): def setUp(self): np.random.seed(2023) self.dtype = "float32" @@ -190,7 +198,7 @@ def setUp(self): self.enable_cinn = False -class TestPrimOne6(TestPrimOne): +class TestPrimTile2(TestPrimBase): def setUp(self): np.random.seed(2023) self.dtype = "float32" @@ -269,5 +277,33 @@ def setUp(self): self.enable_cinn = False +class TestPrimSwiglu1(TestPrimTwo): + def setUp(self): + np.random.seed(2023) + self.shape_x = [300, 4096] + self.shape_y = [300, 4096] + self.dtype_x = "float32" + self.dtype_y = "float32" + self.init_x_shape = [None, None] + self.init_y_shape = [None, None] + self.x = np.random.random(self.shape_x).astype(self.dtype_x) + self.y = np.random.random(self.shape_y).astype(self.dtype_y) + self.net = swiglu_net1 + self.necessary_ops = "pd_op.swiglu" + self.enable_cinn = False + + +class TestPrimSwiglu2(TestPrimBase): + def setUp(self): + np.random.seed(2023) + self.shape_x = [300, 4096] + self.dtype_x = "float32" + self.init_x_shape = [None, 4096] + self.x = np.random.random(self.shape_x).astype(self.dtype_x) + self.net = swiglu_net2 + self.necessary_ops = "pd_op.swiglu" + self.enable_cinn = False + + if __name__ == "__main__": unittest.main() From 467c94bcbb607193e477fac10db53957ec4cdf0d Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Mon, 26 Feb 2024 11:39:03 +0800 Subject: [PATCH 72/82] [clang-tidy] NO.38-40 enable `trivially-destructible`, `modernize-make-unique`,`modernize-avoid-bind` (#61556) * fix * fix * fix * fix * fix * fix --- .clang-tidy | 2 +- paddle/fluid/framework/new_executor/new_executor_defs.cc | 2 -- paddle/fluid/framework/new_executor/new_executor_defs.h | 2 +- paddle/fluid/imperative/amp_auto_cast.cc | 2 -- paddle/fluid/imperative/amp_auto_cast.h | 2 +- paddle/pir/src/core/parser/ir_parser.cc | 4 ++-- 6 files changed, 5 insertions(+), 9 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 2ed67098e2a02..1653cef5fa1aa 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -198,7 +198,7 @@ performance-move-const-arg, -performance-move-constructor-init, -performance-no-automatic-move, performance-noexcept-move-constructor, --performance-trivially-destructible, +performance-trivially-destructible, -performance-type-promotion-in-math-fn, -performance-unnecessary-copy-initialization, readability-container-size-empty, diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index bfa7542b65b75..b3ec52029bb5b 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -39,8 +39,6 @@ 
VariableScope::VariableScope(Scope* scope) { "You have passed a nullptr to construct VariableScope.")); } -VariableScope::~VariableScope() = default; - Scope* VariableScope::GetMutableScope() const { return scope_; } Scope* VariableScope::GetMutableLocalScope() const { return local_scope_; } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index df82aedfcec5f..c416b151aef03 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -82,7 +82,7 @@ class VariableScope { void SetLocalScope(Scope* local_scope); - ~VariableScope(); + ~VariableScope() = default; // Get variable id by name, return -1 if not found int GetIdByName(const std::string& name) const; diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 0dd5bc5779d21..50df994014004 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -162,8 +162,6 @@ AmpOperators::AmpOperators() << unsupported_bf16_ops_->size(); } -AmpOperators::~AmpOperators() = default; - AmpOperators& AmpOperators::Instance() { static AmpOperators instance; return instance; diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index 1864f990576b1..eda10499f90d6 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -45,7 +45,7 @@ class Tracer; // Singleton implementation with C++ 11 class AmpOperators { public: - ~AmpOperators(); + ~AmpOperators() = default; AmpOperators(const AmpOperators& o) = delete; const AmpOperators& operator=(const AmpOperators& o) = delete; diff --git a/paddle/pir/src/core/parser/ir_parser.cc b/paddle/pir/src/core/parser/ir_parser.cc index 5d52da81e8582..3f45573509305 100644 --- a/paddle/pir/src/core/parser/ir_parser.cc +++ b/paddle/pir/src/core/parser/ir_parser.cc @@ -18,9 +18,9 @@ namespace pir { IrParser::IrParser(IrContext* ctx, std::istream& is) { - lexer.reset(new Lexer{is}); + lexer = std::make_unique(is); this->ctx = ctx; - builder.reset(new Builder{ctx}); + builder = std::make_unique(ctx); } Token IrParser::ConsumeToken() { return lexer->ConsumeToken(); } From d7285b15ea6feb9cf350ec8838ce1867bb1f899c Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Mon, 26 Feb 2024 11:40:10 +0800 Subject: [PATCH 73/82] [clang-tidy] NO.25 enable modernize-use-transparent-functors (#61689) * clangtidy 25 * codestyle * codestyle * fix * fix --- paddle/common/ddim.cc | 2 +- paddle/phi/infermeta/spmd_rules/reshape.cc | 6 +++--- paddle/pir/src/core/ir_printer.cc | 16 ++++++++-------- .../fluid/memory/thread_local_allocator_test.cc | 2 +- .../api/analyzer_capi_exp_pd_tensor_tester.cc | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/paddle/common/ddim.cc b/paddle/common/ddim.cc index 1f83a1c93b88d..58ccbe17d8df0 100644 --- a/paddle/common/ddim.cc +++ b/paddle/common/ddim.cc @@ -248,7 +248,7 @@ DDim DDim::reshape(std::vector& shape) const { if (it != shape.end()) { int index = static_cast(std::distance(shape.begin(), it)); int reshape_out_product = - std::accumulate(shape.begin(), shape.end(), -1, std::multiplies()); + std::accumulate(shape.begin(), shape.end(), -1, std::multiplies<>()); shape[index] = static_cast(product(in_dims)) / reshape_out_product; } diff --git a/paddle/phi/infermeta/spmd_rules/reshape.cc b/paddle/phi/infermeta/spmd_rules/reshape.cc index 41a263a1ae35b..3d3bfb38d22fa 100644 --- 
a/paddle/phi/infermeta/spmd_rules/reshape.cc +++ b/paddle/phi/infermeta/spmd_rules/reshape.cc @@ -44,8 +44,8 @@ std::vector InferTargetShape(const std::vector& shape, } } - int64_t product = std::accumulate( - shape.begin(), shape.end(), 1, std::multiplies()); + int64_t product = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<>()); if (product > 0) { PADDLE_ENFORCE_EQ( product, @@ -72,7 +72,7 @@ std::vector> MakeReshapeDimTrans( const std::vector& tgt_shape) { std::vector> ret; int64_t total_elem_num_src = std::accumulate( - src_shape.begin(), src_shape.end(), 1, std::multiplies()); + src_shape.begin(), src_shape.end(), 1, std::multiplies<>()); std::vector inferred_tgt_shape = InferTargetShape(tgt_shape, total_elem_num_src); diff --git a/paddle/pir/src/core/ir_printer.cc b/paddle/pir/src/core/ir_printer.cc index b7cf6404818b5..c1a0fcd905ac8 100644 --- a/paddle/pir/src/core/ir_printer.cc +++ b/paddle/pir/src/core/ir_printer.cc @@ -69,7 +69,7 @@ void BasicIrPrinter::PrintType(Type type) { } else if (type.isa()) { os << "vec["; auto inner_types = type.dyn_cast().data(); - detail::PrintInterleave( + pir::detail::PrintInterleave( inner_types.begin(), inner_types.end(), [this](Type v) { this->PrintType(v); }, @@ -132,7 +132,7 @@ void BasicIrPrinter::PrintAttribute(Attribute attr) { } else if (auto arr = attr.dyn_cast()) { const auto& vec = arr.AsVector(); os << "["; - detail::PrintInterleave( + pir::detail::PrintInterleave( vec.begin(), vec.end(), [this](Attribute v) { this->PrintAttribute(v); }, @@ -256,7 +256,7 @@ void IrPrinter::PrintOpResult(Operation* op) { for (size_t idx = 0; idx < num_op_result; idx++) { op_results.push_back(op->result(idx)); } - detail::PrintInterleave( + pir::detail::PrintInterleave( op_results.begin(), op_results.end(), [this](Value v) { this->PrintValue(v); }, @@ -266,11 +266,11 @@ void IrPrinter::PrintOpResult(Operation* op) { void IrPrinter::PrintAttributeMap(Operation* op) { AttributeMap attributes = op->attributes(); - std::map> order_attributes( + std::map> order_attributes( attributes.begin(), attributes.end()); os << " {"; - detail::PrintInterleave( + pir::detail::PrintInterleave( order_attributes.begin(), order_attributes.end(), [this](std::pair it) { @@ -291,7 +291,7 @@ void IrPrinter::PrintOpOperands(Operation* op) { for (size_t idx = 0; idx < num_op_operands; idx++) { op_operands.push_back(op->operand_source(idx)); } - detail::PrintInterleave( + pir::detail::PrintInterleave( op_operands.begin(), op_operands.end(), [this](Value v) { this->PrintValue(v); }, @@ -312,7 +312,7 @@ void IrPrinter::PrintOperandsType(Operation* op) { } } os << " ("; - detail::PrintInterleave( + pir::detail::PrintInterleave( op_operand_types.begin(), op_operand_types.end(), [this](Type t) { this->PrintType(t); }, @@ -332,7 +332,7 @@ void IrPrinter::PrintOpReturnType(Operation* op) { op_result_types.emplace_back(nullptr); } } - detail::PrintInterleave( + pir::detail::PrintInterleave( op_result_types.begin(), op_result_types.end(), [this](Type t) { this->PrintType(t); }, diff --git a/test/cpp/fluid/memory/thread_local_allocator_test.cc b/test/cpp/fluid/memory/thread_local_allocator_test.cc index c322295892da3..c6cb4cf0acf20 100644 --- a/test/cpp/fluid/memory/thread_local_allocator_test.cc +++ b/test/cpp/fluid/memory/thread_local_allocator_test.cc @@ -78,7 +78,7 @@ TEST(ThreadLocalAllocator, cross_scope_release) { for (auto &addresses : allocator_addresses) { std::sort(addresses.begin(), addresses.end()); ASSERT_EQ(std::adjacent_find( - addresses.begin(), 
addresses.end(), std::equal_to()), + addresses.begin(), addresses.end(), std::equal_to<>()), addresses.end()); } diff --git a/test/cpp/inference/api/analyzer_capi_exp_pd_tensor_tester.cc b/test/cpp/inference/api/analyzer_capi_exp_pd_tensor_tester.cc index 0b69c235a03fc..7a32aefb16d30 100644 --- a/test/cpp/inference/api/analyzer_capi_exp_pd_tensor_tester.cc +++ b/test/cpp/inference/api/analyzer_capi_exp_pd_tensor_tester.cc @@ -69,7 +69,7 @@ void PD_run() { int32_t out_num = std::accumulate(output_shape->data, output_shape->data + output_shape->size, 1, - std::multiplies()); + std::multiplies<>()); out_data.resize(out_num); PD_TensorCopyToCpuFloat(output_tensor, out_data.data()); LOG(INFO) << "Output tensor name is: " << PD_TensorGetName(output_tensor); From 2823a59f63af9190d7df93d36c394a0b886c333e Mon Sep 17 00:00:00 2001 From: Sonder <55493212+AndSonder@users.noreply.github.com> Date: Mon, 26 Feb 2024 12:30:30 +0800 Subject: [PATCH 74/82] [AutoParallel] Fit allreduce_matmul_grad_overlapping when using master grad (#61865) * remove sync_with_cpp * fix allreduce matmul grad overlaping when open master_grad * add annotation * update universal codes --- .../allreduce_matmul_grad_overlapping.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py b/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py index 48e837fa3c46f..89e6c20ad03c9 100644 --- a/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py +++ b/python/paddle/distributed/passes/allreduce_matmul_grad_overlapping.py @@ -133,6 +133,53 @@ def _split_matmul_grad_and_multi_streaming_allreduce( matmul_grad_op = ops[matmul_grad_id] allreduce_op = ops[allreduce_id] + # NOTE(Sonder): Why move those operations to the back of matmul_v2? + # When using amp_master_grad, the cast operation is inserted after matmul_grad. + # However, when employing allreduce_matmul_grad_overlapping, the matmul_grad is + # split into two matmul operations. In this case, some operations would access + # uninitialized tensors. Therefore, we move the cast operation to the back of the + # second matmul operation to avoid this problem. 
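+        # Roughly (op names illustrative): matmul_grad is later split into
+        #   matmul(dX) -> allreduce(dX)  overlapped with  matmul(dY)
+        # so every consumer of Y@GRAD (e.g. the master-grad cast) is first
+        # relocated behind the allreduce, where dY will actually exist. If a
+        # relocated op's output is read by an op that stays inside the
+        # window, the move is unsafe and overlapping is skipped.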
+ skip_overlapping = False + moved_ops_idx = [] + moved_ops_output = [] + matmul_grad_output = matmul_grad_op.output('Y@GRAD')[0] + + for idx in range(matmul_grad_id + 1, allreduce_id): + if matmul_grad_output in ops[idx].desc.input_arg_names(): + moved_ops_idx.append(idx) + moved_ops_output.extend(ops[idx].desc.output_arg_names()) + else: + for input_name in ops[idx].desc.input_arg_names(): + if input_name in moved_ops_output: + skip_overlapping = True + + if skip_overlapping: + continue + + for i, idx in enumerate(moved_ops_idx): + op = ops[idx] + dist_attr = self.dist_context.get_op_dist_attr_for_program(op) + + op_inputs = op.desc.input_names() + op_outputs = op.desc.output_names() + + op_inputs = {name: op.input(name) for name in op_inputs} + op_outputs = {name: op.output(name) for name in op_outputs} + + op = block._insert_op_without_sync( + index=allreduce_id + 1 + i, + type=op.type, + inputs=op_inputs, + outputs=op_outputs, + attrs=op.all_attrs(), + ) + + self.dist_context.set_op_dist_attr_for_program(op, dist_attr) + + for i, idx in enumerate(moved_ops_idx): + block._remove_op(idx - i, sync=False) + allreduce_id -= 1 + tran_x = matmul_grad_op.attr("trans_x") assert ( not tran_x From 488f2d536f0f794fdbb787785af3e14f95d767c5 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Mon, 26 Feb 2024 13:15:06 +0800 Subject: [PATCH 75/82] set default in p2p_overlap (#62051) --- paddle/fluid/framework/distributed_strategy.proto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 58460fcf9064b..27c7a7a7af276 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -82,7 +82,7 @@ message PpConfig { optional bool sharding_comm_overlap = 4 [ default = false ]; optional bool profiling = 5 [ default = false ]; optional bool release_gradients = 6 [ default = false ]; - optional bool overlap_p2p_comm = 7 [default = false]; + optional bool overlap_p2p_comm = 7 [default = true]; } message DygraphShardingConfig { From 082f95490d5c347a69d2566a62126771755245ea Mon Sep 17 00:00:00 2001 From: Ghost Screaming Date: Mon, 26 Feb 2024 14:20:32 +0800 Subject: [PATCH 76/82] [AutoParallel] Fix problems of pp. (#61840) * [AutoParallel] Fix inplace full_ in pp. * [AutoParallel] Fix problem of PHI::DatatYPE::UNDEFINED. * Polish code. * Fix problem of split_with_num FillZeroForEmptyGradInput. 
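
For reference, the updated CREATE_PLAIN_OPTIONAL_TENSOR_TEMPLATE should now
expand to roughly the following generated C++ for an optional grad input
named `x` (the input name is illustrative):

    paddle::optional<paddle::Tensor> x_optional;
    if (x.initialized() ||
        (x.defined() && x.is_dist_tensor() &&
         phi::distributed::NeedComputationClipForPP(x.impl())))
      x_optional = paddle::make_optional<paddle::Tensor>(x);

so a pipeline-parallel DistTensor that only needs computation clip is still
wrapped and handed to the grad kernel, instead of being treated as a missing
optional input.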
--- .../auto_code_generator/generator/eager_gen.py | 11 +++++------ paddle/fluid/eager/grad_node_info.cc | 2 +- paddle/phi/api/yaml/generator/dist_api_gen.py | 7 ++++++- paddle/phi/infermeta/multiary.cc | 14 ++++++++++---- paddle/phi/infermeta/ternary.cc | 5 +++++ 5 files changed, 27 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 62327c5aa8785..13ddbca4c9ef5 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -554,8 +554,10 @@ class {} : public egr::GradNodeBase {{ }} """ CREATE_PLAIN_OPTIONAL_TENSOR_TEMPLATE = """ - paddle::optional {}_optional; - if({}.initialized()) {}_optional = paddle::make_optional({}); + paddle::optional {name}_optional; + if({name}.initialized() || + ({name}.defined() && {name}.is_dist_tensor() && + phi::distributed::NeedComputationClipForPP({name}.impl()))) {name}_optional = paddle::make_optional({name}); """ CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE = """ @@ -2434,10 +2436,7 @@ def GenerateNodeDefinition( get_tensor_str += ( "\n" + CREATE_PLAIN_OPTIONAL_TENSOR_TEMPLATE.format( - transformed_tensor_name, - transformed_tensor_name, - transformed_tensor_name, - transformed_tensor_name, + name=transformed_tensor_name ) ) grad_api_args[ diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 78e3dd32fd40e..2a97f5bf35e90 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -196,7 +196,7 @@ void GradNodeBase::SetGradInMeta(const std::vector& fwd_out, if (!fwd_out_tensor.initialized()) { if (fwd_out_tensor.defined() && fwd_out_tensor.is_dist_tensor() && - !phi::distributed::NeedComputationClipForPP(fwd_out_tensor.impl())) { + phi::distributed::NeedComputationClipForPP(fwd_out_tensor.impl())) { VLOG(3) << "Tensor " << fwd_out_tensor.name() << " is DistTensor," << " and needs computation clip for pipeline parallel." 
<< " Still SetGradInMeta for it."; diff --git a/paddle/phi/api/yaml/generator/dist_api_gen.py b/paddle/phi/api/yaml/generator/dist_api_gen.py index 03d65a920b9d2..e199c5c1a520d 100644 --- a/paddle/phi/api/yaml/generator/dist_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_api_gen.py @@ -596,6 +596,7 @@ def parse_infer_meta(self, infer_meta_config): def need_to_generate_code_for_inplace_impl(self, i): return ( self.inplace_flag + and self.kernel['func'][0] != 'full' and self.inplace_map is not None and self.outputs['names'][i] in self.inplace_map ) @@ -1023,7 +1024,11 @@ def generate_output_creation_code(self) -> str: output_creation_code += "\n phi::DeviceContext* dev_ctx = nullptr;" if output_num == 1: # api output generate - if self.need_to_generate_code_for_inplace_impl(0): + if ( + self.inplace_flag + and self.inplace_map is not None + and self.outputs['names'][0] in self.inplace_map + ): inplace_assign_code = ( " = " + self.inplace_map[self.outputs['names'][0]] ) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 978a80674272f..b7a5dd51de901 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -4512,17 +4512,23 @@ void FusedRopeInferMeta(const MetaTensor& q, "Input should be a 4-D tensor of format [N, C, H, W] " "or [N, H, W, C], but got %u.", input_dims.size())); - if (q) { - out_q->set_dims(q.dims()); - out_q->set_dtype(q.dtype()); - } + out_q->set_dims(q.dims()); + out_q->set_dtype(q.dtype()); if (k) { out_k->set_dims(k.dims()); out_k->set_dtype(k.dtype()); + } else { + if (out_k) { + out_k->set_dtype(q.dtype()); + } } if (v) { out_v->set_dims(v.dims()); out_v->set_dtype(v.dtype()); + } else { + if (out_v) { + out_v->set_dtype(q.dtype()); + } } } diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 5701ffd4da5d2..b728c33abf2e2 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -293,6 +293,11 @@ void FlashAttnInferMeta(const MetaTensor& q, out->set_dims(out_dims); out->set_dtype(q.dtype()); out->set_layout(q.layout()); + softmax->set_dtype(q.dtype()); + softmax_lse->set_dtype(q.dtype()); + if (seed_offset) { + seed_offset->set_dtype(phi::DataType::INT64); + } } void ArangeTensorInferMeta(const MetaTensor& start, From 194ef8baa4a6e94d9ef70e05554e5d3e2ac909f9 Mon Sep 17 00:00:00 2001 From: RichardWooSJTU <37864677+RichardWooSJTU@users.noreply.github.com> Date: Mon, 26 Feb 2024 14:28:35 +0800 Subject: [PATCH 77/82] Fix llm.int8 unit test (#61591) * fix llm.int8 unit test * fix llm.int8 unnittest when cpu * fix numerical mismatch * code clean --- .../phi/kernels/cpu/weight_quantize_kernel.cc | 17 ++- .../phi/kernels/gpu/weight_quantize_kernel.cu | 5 +- .../impl/weight_quantize_kernel_gpu_impl.h | 12 +- .../impl/weight_quantize_kernel_impl.h | 20 +-- test/quantization/test_llm_int8_linear.py | 129 +++++++++--------- 5 files changed, 100 insertions(+), 83 deletions(-) diff --git a/paddle/phi/kernels/cpu/weight_quantize_kernel.cc b/paddle/phi/kernels/cpu/weight_quantize_kernel.cc index 313c59e2e6676..61304e43d4e85 100644 --- a/paddle/phi/kernels/cpu/weight_quantize_kernel.cc +++ b/paddle/phi/kernels/cpu/weight_quantize_kernel.cc @@ -22,7 +22,11 @@ limitations under the License. 
*/ namespace phi { -template +template void quant_compute(const DeviceContext& dev_ctx, const DenseTensor& x, DenseTensor* out, @@ -48,7 +52,7 @@ void quant_compute(const DeviceContext& dev_ctx, DDim dims = {num}; const T* x_data = x.data(); D* out_data = out->data(); - T* scale_data = scale->data(); + ScaleT* scale_data = scale->data(); DenseTensor x_int(out->type()); @@ -121,11 +125,16 @@ void WeightQuantizeKernel(const Context& dev_ctx, DenseTensor* out, DenseTensor* scale) { dev_ctx.template Alloc(out); - dev_ctx.template Alloc(scale); - if (algo == "weight_only_int8" || algo == "llm.int8") { + if (algo == "weight_only_int8") { + dev_ctx.template Alloc(scale); quant_compute( dev_ctx, x, out, scale, algo, arch, group_size); + } else if (algo == "llm.int8") { + dev_ctx.template Alloc(scale); + quant_compute( + dev_ctx, x, out, scale, algo, arch, group_size); } else if (algo == "weight_only_int4") { + dev_ctx.template Alloc(scale); quant_compute( dev_ctx, x, out, scale, algo, arch, group_size); } else { diff --git a/paddle/phi/kernels/gpu/weight_quantize_kernel.cu b/paddle/phi/kernels/gpu/weight_quantize_kernel.cu index 8cd5598e2e92a..103691f9cd8a4 100644 --- a/paddle/phi/kernels/gpu/weight_quantize_kernel.cu +++ b/paddle/phi/kernels/gpu/weight_quantize_kernel.cu @@ -37,7 +37,6 @@ void WeightQuantizeKernel(const Context& dev_ctx, DenseTensor quanted_x; dev_ctx.template Alloc(out); - dev_ctx.template Alloc(scale); size_t m = x.dims()[0]; size_t n = x.dims()[1]; quanted_x.Resize({static_cast(m), static_cast(n)}); @@ -51,15 +50,17 @@ void WeightQuantizeKernel(const Context& dev_ctx, "Currently, arch only support 70, 75, 80, 86.")); if (algo == "llm.int8") { + dev_ctx.template Alloc(scale); std::vector axis = {1, 0}; funcs::Transpose trans; weight_quant_gpu(dev_ctx, x.data(), quanted_x.data(), - scale->data(), + scale->data(), weight_shape); trans(dev_ctx, quanted_x, out, axis); } else if (algo == "weight_only_int8") { + dev_ctx.template Alloc(scale); weight_quant_gpu(dev_ctx, x.data(), quanted_x.data(), diff --git a/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h b/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h index 201dd403270f3..05d0e47b31455 100644 --- a/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h +++ b/paddle/phi/kernels/impl/weight_quantize_kernel_gpu_impl.h @@ -106,10 +106,10 @@ void weight_permute_gpu(const GPUContext& dev_ctx, } } -template +template __global__ void per_channel_quant_gpu(const T* weight_data, int8_t* quanted_weight_data, - T* scale_data, + ScaleT* scale_data, int total_k, int total_vec_n) { int n = blockIdx.x * blockDim.x + threadIdx.x; @@ -133,10 +133,10 @@ __global__ void per_channel_quant_gpu(const T* weight_data, abs_max[i] = fmaxf((abs_max[i]), fabsf((weight[i]))); } } - phi::AlignedVector scale; + phi::AlignedVector scale; #pragma unroll for (int i = 0; i < VectorSize; ++i) { - scale[i] = static_cast(abs_max[i] / static_cast(127.0f)); + scale[i] = static_cast(abs_max[i] / static_cast(127.0f)); } *reinterpret_cast(scale_data + VectorSize * n) = *reinterpret_cast(&scale); @@ -161,11 +161,11 @@ __global__ void per_channel_quant_gpu(const T* weight_data, } } } -template +template void weight_quant_gpu(const GPUContext& dev_ctx, const T* weight_data, int8_t* quanted_weight_data, - T* scale_data, + ScaleT* scale_data, const std::vector& shape) { int total_k = shape[0]; int total_n = shape[1]; diff --git a/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h b/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h index 
2905fd14e6b33..6f7fc1e9c0680 100644 --- a/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h +++ b/paddle/phi/kernels/impl/weight_quantize_kernel_impl.h @@ -42,9 +42,9 @@ inline T xabs(const T x) { return x < static_cast(0.0) ? -x : x; } -template +template void per_channel_scale( - T* scale, const T* input, size_t m, size_t n, float bound) { + ScaleT* scale, const T* input, size_t m, size_t n, float bound) { for (size_t i = 0; i < n; ++i) { float max = static_cast(input[i]); for (size_t j = 0; j < m; ++j) { @@ -52,12 +52,12 @@ void per_channel_scale( ? static_cast(xabs(input[j * n + i])) : max; } - scale[i] = static_cast(max / bound); + scale[i] = static_cast(max / bound); } } -template -void group_wise_scale(T* scale, +template +void group_wise_scale(ScaleT* scale, const T* input, size_t m, size_t n, @@ -72,15 +72,15 @@ void group_wise_scale(T* scale, : max; } scale[static_cast(j / group_size) * n + i] = - static_cast(max / bound); + static_cast(max / bound); } } } -template +template void per_channel_quant(int8_t* output, const T* input, - const T* scale, + const ScaleT* scale, size_t num_rows, size_t num_cols) { size_t bytes_per_out_col = num_cols * quant_bit / 8; @@ -123,10 +123,10 @@ void per_channel_quant(int8_t* output, } } -template +template void group_wise_quant(int8_t* output, const T* input, - const T* scale, + const ScaleT* scale, size_t num_rows, size_t num_cols, const int group_size) { diff --git a/test/quantization/test_llm_int8_linear.py b/test/quantization/test_llm_int8_linear.py index 972c41bd31f52..909f44c0ca404 100644 --- a/test/quantization/test_llm_int8_linear.py +++ b/test/quantization/test_llm_int8_linear.py @@ -24,9 +24,6 @@ from paddle.framework import set_default_dtype from paddle.pir_utils import test_with_pir_api -np.random.seed(123) -paddle.seed(42) - @unittest.skipIf( not core.is_compiled_with_cuda() @@ -43,11 +40,13 @@ def config(self): self.batch = 1 self.token = 32 self.in_features = 64 - self.out_features = 256 + self.out_features = 128 self.threshold = 6.0 self.static = False def setUp(self): + np.random.seed(123) + paddle.seed(42) self.config() x = np.random.random((self.batch, self.token, self.in_features)) self.x = paddle.to_tensor(x, dtype=self.dtype) @@ -64,49 +63,89 @@ def setUp(self): self.in_features, self.out_features, bias_attr=bias_attr ) - self.bias = self.linear.bias self.weight = self.linear.weight self.weight_scale = None self.weight, self.weight_scale = Q.weight_quantize( self.weight, algo="llm.int8" ) + def dynamic_quant(self, x): + row_ranges = paddle.max(x, axis=[-1]).astype('float32') + row_ranges = row_ranges.unsqueeze(-1) + quant_x = paddle.round( + paddle.clip( + x.astype('float32') * 127.0 * (1 / row_ranges), + min=-127.0, + max=127.0, + ) + ).astype('int8') + return quant_x, row_ranges + def get_linear_out(self): - out = self.linear(self.x) + outlier_cols = ( + paddle.nonzero(paddle.max(self.x, axis=[0, 1]) > self.threshold) + .reshape([-1]) + .numpy() + .tolist() + ) + + x_int8 = self.x + if len(outlier_cols) > 0: + x_fp = self.x[:, :, outlier_cols] + w_fp = self.linear.weight[outlier_cols] + res_fp = paddle.matmul(x_fp, w_fp) + + x_int8[:, :, outlier_cols] = 0 + x_int8, row_ranges = self.dynamic_quant(x_int8) + + res_int8 = paddle.matmul(x_int8, self.weight.transpose((1, 0))) + dequant_scale = row_ranges * self.weight_scale / 127.0 + res_dequant = (res_int8.astype('float32') * dequant_scale).astype( + self.dtype + ) + + if len(outlier_cols) > 0: + out = res_dequant + res_fp + else: + out = res_dequant + + if self.bias: + 
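+            # reference semantics: the float bias is applied last, after the
+            # int8 matmul result has been dequantized back to self.dtype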
out += self.bias + return out.numpy() def get_llm_int8_linear_out(self): out = Q.llm_int8_linear( self.x, self.weight, - bias=self.bias, + bias=self.linear.bias, weight_scale=self.weight_scale, threshold=self.threshold, ) return out.numpy() @test_with_pir_api - def get_llm_int8_linear_out_static(self): + def llm_int8_linear_out_static(self, out_expect): paddle.enable_static() - main = base.static.Program() - start = base.static.Program() - with base.static.program_guard(main, start): - x = paddle.static.data("x", self.x.shape, dtype=self.x.dtype) + main = paddle.static.Program() + start = paddle.static.Program() + with paddle.static.program_guard(main, start): + x = paddle.static.data("x", self.x.shape, dtype=self.dtype) weight = paddle.static.data( - "weight", self.weight.shape, dtype=self.weight.dtype + "weight", self.weight.shape, dtype='int8' ) bias = paddle.static.data( - "bias", self.bias.shape, dtype=self.bias.dtype + "bias", self.linear.bias.shape, dtype=self.dtype ) x_np = self.x.numpy() weight_np = self.weight.numpy() - bias_np = self.bias.numpy() + bias_np = self.linear.bias.numpy() if self.weight_scale is not None: weight_scale = paddle.static.data( "weight_scale", self.weight_scale.shape, - dtype=self.weight_scale.dtype, + dtype='float32', ) weight_scale_np = self.weight_scale.numpy() else: @@ -128,20 +167,30 @@ def get_llm_int8_linear_out_static(self): } exe = base.Executor(paddle.CUDAPlace(0)) exe.run(start) - (out,) = exe.run(main, feed=feed_dict, fetch_list=[out]) + (out_real,) = exe.run(main, feed=feed_dict, fetch_list=[out]) + paddle.disable_static() - return out + + if self.dtype == "bfloat16": + out_real = convert_uint16_to_float(out_real) + out_expect = convert_uint16_to_float(out_expect) + + np.testing.assert_allclose( + out_real, out_expect, rtol=self.rtol, atol=self.atol + ) def test_llm_int8_linear(self): out_expect = self.get_linear_out() if self.static: - out_real = self.get_llm_int8_linear_out_static() + self.llm_int8_linear_out_static(out_expect) + return else: out_real = self.get_llm_int8_linear_out() if self.dtype == "bfloat16": out_real = convert_uint16_to_float(out_real) out_expect = convert_uint16_to_float(out_expect) + np.testing.assert_allclose( out_real, out_expect, rtol=self.rtol, atol=self.atol ) @@ -174,19 +223,6 @@ def config(self): self.weight_dtype = "int8" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) -class LLMInt8LinearTestCase3(LLMInt8LinearTestCase): - def config(self): - super().config() - self.dtype = 'bfloat16' - self.weight_dtype = "int8" - - @unittest.skipIf( not core.is_compiled_with_cuda() or get_cuda_version() < 11020 @@ -215,20 +251,6 @@ def config(self): self.weight_dtype = "int4" -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or paddle.device.cuda.get_device_capability()[0] < 8 - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8 or core is not support bfloat16", -) -class LLMInt8LinearTestCase6(LLMInt8LinearTestCase): - def config(self): - super().config() - self.dtype = 'bfloat16' - self.weight_dtype = "int4" - - @unittest.skipIf( not core.is_compiled_with_cuda() or get_cuda_version() < 11020 @@ -260,21 +282,6 @@ def config(self): self.token = 1 -@unittest.skipIf( - not core.is_compiled_with_cuda() - or get_cuda_version() < 11020 - or 
paddle.device.cuda.get_device_capability()[0] < 8, - "quantized_matmul requires CUDA >= 11.2 and CUDA_ARCH >= 8", -) -class LLMInt8LinearTestCase9(LLMInt8LinearTestCase): - def config(self): - super().config() - self.dtype = 'bfloat16' - self.weight_dtype = "int8" - self.batch = 1 - self.token = 1 - - @unittest.skipIf( not core.is_compiled_with_cuda() or get_cuda_version() < 11020 From 9247adeb0fb62cb91d27ee0acb5bd9c30ce854ce Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 26 Feb 2024 14:30:40 +0800 Subject: [PATCH 78/82] fix eb4 (#62032) --- .../distributed/fleet/layers/mpu/mp_layers.py | 8 ++---- .../fleet/utils/sequence_parallel_utils.py | 3 +- .../nn/functional/fused_matmul_bias.py | 28 ++++++++++++++++--- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py index a24bbd3321439..fd66927ced6db 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py @@ -16,6 +16,7 @@ import paddle from paddle.autograd import PyLayer +from paddle.base import core from paddle.distributed import fleet from paddle.nn import functional as F @@ -33,7 +34,7 @@ def is_fused_matmul_bias_supported(): - return hasattr(paddle._C_ops, 'fused_gemm_epilogue') + return hasattr(core.eager.ops.legacy, 'fused_gemm_epilogue') def is_fused_linear_param_grad_add_supported(): @@ -213,10 +214,7 @@ def forward( if not fuse_matmul_bias: return paddle._C_ops.linear(x, weight, bias) else: - result, _ = paddle._C_ops.fused_gemm_epilogue( - x, weight, bias, False, False, "none" - ) - return result + return paddle._legacy_C_ops.fused_gemm_epilogue(x, weight, bias) @staticmethod def backward(ctx, dy): diff --git a/python/paddle/distributed/fleet/utils/sequence_parallel_utils.py b/python/paddle/distributed/fleet/utils/sequence_parallel_utils.py index f499054bc8496..940d7408ff5be 100644 --- a/python/paddle/distributed/fleet/utils/sequence_parallel_utils.py +++ b/python/paddle/distributed/fleet/utils/sequence_parallel_utils.py @@ -17,6 +17,7 @@ import paddle from paddle import distributed as dist from paddle.autograd import PyLayer +from paddle.base import core from paddle.distributed import fleet from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker from paddle.distributed.fleet.utils.hybrid_parallel_util import ( @@ -221,7 +222,7 @@ def is_fused_matmul_bias_supported(): and not paddle.is_compiled_with_rocm() or paddle.is_compiled_with_xpu() ): - return hasattr(paddle._C_ops, "fused_gemm_epilogue") + return hasattr(core.eager.ops.legacy, "fused_gemm_epilogue") else: return False diff --git a/python/paddle/incubate/nn/functional/fused_matmul_bias.py b/python/paddle/incubate/nn/functional/fused_matmul_bias.py index 1b894ce297a1c..56d5e30a506ab 100644 --- a/python/paddle/incubate/nn/functional/fused_matmul_bias.py +++ b/python/paddle/incubate/nn/functional/fused_matmul_bias.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
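+# NOTE: after this change the dygraph path dispatches to the legacy op, e.g.
+#   _legacy_C_ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False, 'trans_y', False)
+# while PIR mode keeps the new-IR op, which also returns a second
+# (reserve-space) output that is discarded here:
+#   out, _ = _C_ops.fused_gemm_epilogue(x, y, bias, False, False, "none")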
-from paddle import _C_ops +from paddle import _C_ops, _legacy_C_ops from paddle.base.layer_helper import LayerHelper -from paddle.framework import in_dynamic_or_pir_mode +from paddle.framework import ( + in_dynamic_mode, + in_pir_mode, +) from paddle.tensor.linalg import matmul @@ -56,7 +59,11 @@ def fused_matmul_bias( """ if bias is None: return matmul(x, y, transpose_x, transpose_y, name) - if in_dynamic_or_pir_mode(): + if in_dynamic_mode(): + return _legacy_C_ops.fused_gemm_epilogue( + x, y, bias, 'trans_x', transpose_x, 'trans_y', transpose_y + ) + if in_pir_mode(): out, _ = _C_ops.fused_gemm_epilogue( x, y, bias, transpose_x, transpose_y, "none" ) @@ -146,7 +153,20 @@ def fused_linear_activation( if activation is None: activation = "none" - if in_dynamic_or_pir_mode(): + if in_dynamic_mode(): + return _legacy_C_ops.fused_gemm_epilogue( + x, + y, + bias, + 'trans_x', + trans_x, + 'trans_y', + trans_y, + 'activation', + activation, + ) + + if in_pir_mode(): out, _ = _C_ops.fused_gemm_epilogue( x, y, From 1c5bbe428b7cf62a8f74e8f50386d515a9e10838 Mon Sep 17 00:00:00 2001 From: diadestiny <44188454+diadestiny@users.noreply.github.com> Date: Mon, 26 Feb 2024 14:46:03 +0800 Subject: [PATCH 79/82] [SOT][3.12] Support `BINARY_SLICE` and `STORE_SLICE` opcode in Python 3.12 (#62028) --- .../executor/opcode_executor.py | 36 +++++++++++++++++-- test/sot/skip_files_py312 | 1 - 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py index e0ada6a9b74fa..3b40633a73e25 100644 --- a/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py +++ b/python/paddle/jit/sot/opcode_translator/executor/opcode_executor.py @@ -696,6 +696,21 @@ def BINARY_OP(self, instr: Instruction): def BINARY_SUBSCR(self, instr: Instruction): key = self.stack.pop() container = self.stack.pop() + self.binary_subscr_operation(key, container, instr.opname) + + @call_break_graph_decorator(push_n=1) + def BINARY_SLICE(self, instr: Instruction): + end = self.stack.pop() + start = self.stack.pop() + container = self.stack.pop() + key = SliceVariable( + slice(start, end), + graph=self._graph, + tracker=DummyTracker([start, end]), + ) + self.binary_subscr_operation(key, container, instr.opname) + + def binary_subscr_operation(self, key, container, opname): assert isinstance(key, VariableBase) # TODO(xiongkun): getitem / getattr support key and attr as variable. 
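+        # On Python 3.12, `container[a:b]` compiles to BINARY_SLICE instead
+        # of BUILD_SLICE + BINARY_SUBSCR; the BINARY_SLICE handler above
+        # synthesizes a SliceVariable from (a, b), so both opcodes funnel
+        # through this single getitem path.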
if isinstance(key, TensorVariable) and isinstance( @@ -710,7 +725,7 @@ def BINARY_SUBSCR(self, instr: Instruction): if isinstance(key, TensorVariable): raise BreakGraphError( - f"Key is a TensorVariable in BINARY_SUBSCR, {container}[{key}]" + f"Key is a TensorVariable in {opname}, {container}[{key}]" ) result = BuiltinVariable( @@ -884,11 +899,28 @@ def STORE_SUBSCR(self, instr: Instruction): key = self.stack.pop() container = self.stack.pop() value = self.stack.pop() + self.store_subscr_operation(key, container, value, instr.opname) + + @call_break_graph_decorator(push_n=0) + def STORE_SLICE(self, instr: Instruction): + end = self.stack.pop() + start = self.stack.pop() + container = self.stack.pop() + value = self.stack.pop() + + key = SliceVariable( + slice(start, end), + graph=self._graph, + tracker=DummyTracker([start, end]), + ) + self.store_subscr_operation(key, container, value, instr.opname) + + def store_subscr_operation(self, key, container, value, opname): assert isinstance(key, VariableBase) self._graph.add_global_guarded_variable(key) if isinstance(key, TensorVariable): raise BreakGraphError( - f"Key is a TensorVariable in STORE_SUBSCR, {container}[{key}] = {value}" + f"Key is a TensorVariable in {opname}, {container}[{key}] = {value}" ) # TODO(xiongkun): support tensor[tensor] = tensor, dy2static is not the same with dygraph. container[key.get_py_value()] = value diff --git a/test/sot/skip_files_py312 b/test/sot/skip_files_py312 index 815f3a9e68b49..59cd1a37055f4 100644 --- a/test/sot/skip_files_py312 +++ b/test/sot/skip_files_py312 @@ -2,7 +2,6 @@ ./test_11_jumps.py ./test_12_for_loop.py ./test_14_operators.py -./test_15_slice.py ./test_21_global.py ./test_analysis_inputs.py ./test_break_graph.py From 2598a16e30a56b2c430b01a32ca1bb40c46bd488 Mon Sep 17 00:00:00 2001 From: ming1753 <61511741+ming1753@users.noreply.github.com> Date: Mon, 26 Feb 2024 14:49:59 +0800 Subject: [PATCH 80/82] Inference with optimized model (#61598) * Inference with optimized model * clear gpu mem * delete scale and zero_point in qdq ops * add test * modify API * fix bug --- .../ir/delete_quant_dequant_linear_op_pass.cc | 14 +++ ...rt_delete_weight_dequant_linear_op_pass.cc | 14 +++ paddle/fluid/inference/analysis/argument.h | 3 + .../passes/save_optimized_model_pass.cc | 29 ++--- paddle/fluid/inference/api/analysis_config.cc | 6 + .../fluid/inference/api/analysis_predictor.cc | 80 +++++++++++- .../fluid/inference/api/analysis_predictor.h | 2 + .../inference/api/paddle_analysis_config.h | 10 +- .../fluid/inference/api/paddle_pass_builder.h | 15 +-- paddle/fluid/pybind/inference_api.cc | 3 + .../inference/test_use_optimized_model_api.py | 116 ++++++++++++++++++ 11 files changed, 265 insertions(+), 27 deletions(-) create mode 100644 test/ir/inference/test_use_optimized_model_api.py diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 317beb100acb1..7358a82c6ca3c 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -126,6 +126,13 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { */ std::unordered_set nodes2rm = {}; + // delete Scale and ZeroPoint tensor in scope + std::vector vars2rm = {}; + vars2rm.emplace_back(quantize_linear_op->Op()->Input("Scale")[0]); + vars2rm.emplace_back(quantize_linear_op->Op()->Input("ZeroPoint")[0]); + 
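+    // The dequantize op's scale/zero-point are equally dead once the q/dq
+    // pair is folded; all four tensors are erased from the scope after
+    // GraphSafeRemoveNodes so a saved optimized model does not carry these
+    // unused persistable vars.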
vars2rm.emplace_back(dequantize_linear_op->Op()->Input("Scale")[0]); + vars2rm.emplace_back(dequantize_linear_op->Op()->Input("ZeroPoint")[0]); + // Get input scale from tensor const phi::DenseTensor& input_scale_tensor = scope->GetVar(quantize_linear_op_scale->Name()) @@ -175,6 +182,13 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { nodes2rm.insert(dequantize_linear_op); nodes2rm.insert(dequantize_linear_op_out); GraphSafeRemoveNodes(graph, nodes2rm); + + for (auto& var_name : vars2rm) { + if (scope->FindVar(var_name)) { + scope->EraseVars({var_name}); + } + } + found_count++; }; gpd(graph, handler); diff --git a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc index 99aa8a5002e85..6e12933f0f4d5 100644 --- a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc @@ -232,6 +232,13 @@ void TrtDeleteWeightQuantDequantLinearOpPass::ApplyImpl( } */ std::unordered_set nodes2rm = {}; + + // delete Scale and ZeroPoint tensor in scope + std::vector vars2rm = {}; + vars2rm.emplace_back(weight_dequantize_linear_op->Op()->Input("Scale")[0]); + vars2rm.emplace_back( + weight_dequantize_linear_op->Op()->Input("ZeroPoint")[0]); + int bit_length = PADDLE_GET_CONST( int, weight_dequantize_linear_op->Op()->GetAttr("bit_length")); int range = ((1 << (bit_length - 1)) - 1); @@ -356,6 +363,13 @@ void TrtDeleteWeightQuantDequantLinearOpPass::ApplyImpl( } GraphSafeRemoveNodes(graph, nodes2rm); + + for (auto& var_name : vars2rm) { + if (scope->FindVar(var_name)) { + scope->EraseVars({var_name}); + } + } + found_count++; }; gpd(graph, handler); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 22a420f2de347..69b78b1ef33f3 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -165,6 +165,9 @@ struct Argument { DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); DECL_ARGUMENT_FIELD(save_optimized_model, SaveOptimizedModel, bool); + DECL_ARGUMENT_FIELD(optimized_model_save_path, + OptimizedModelSavePath, + std::string); DECL_ARGUMENT_FIELD(optim_cache_dir, OptimCacheDir, std::string); DECL_ARGUMENT_FIELD(enable_ir_optim, EnableIrOptim, bool); diff --git a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc index cad0296369479..cc463ce45f105 100644 --- a/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc +++ b/paddle/fluid/inference/analysis/passes/save_optimized_model_pass.cc @@ -24,22 +24,7 @@ namespace inference { namespace analysis { void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { - std::string model_opt_cache_dir = argument->optim_cache_dir(); - if (!model_opt_cache_dir.empty()) { - if (!PathExists(model_opt_cache_dir)) { - PADDLE_ENFORCE_NE( - MKDIR(model_opt_cache_dir.c_str()), - -1, - platform::errors::PreconditionNotMet( - "Can not create optimize cache directory: %s, Make sure you " - "have permission to write", - model_opt_cache_dir)); - } - } else { - model_opt_cache_dir = argument->Has("model_dir") - ? 
argument->model_dir() - : GetDirRoot(argument->model_program_path()); - } + std::string model_opt_cache_dir = argument->optimized_model_save_path(); auto& scope = argument->scope(); auto* graph = argument->main_graph_ptr(); @@ -52,6 +37,14 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { framework::ir::GraphToProgram(*graph, &optimized_program_desc); + // Some vars may be deleted by pass, so we need to remove them in block + framework::BlockDesc* block = optimized_program_desc.MutableBlock(0); + for (auto& var_desc : block->AllVars()) { + if (var_desc->Persistable() && !scope.FindVar(var_desc->Name())) { + block->RemoveVar(var_desc->Name()); + } + } + auto IsPersistable = [](const framework::VarDesc* var) { if (var->Persistable() && var->GetType() != framework::proto::VarType::FEED_MINIBATCH && @@ -81,7 +74,7 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { } } - std::string save_params_path = path + "/" + "_optimized.pdiparams"; + std::string save_params_path = path + ".pdiparams"; std::vector save_var_list(save_var_set.begin(), save_var_set.end()); std::sort(save_var_list.begin(), save_var_list.end()); @@ -112,7 +105,7 @@ void SaveOptimizedModelPass::SaveOptimizedModel(Argument* argument) { } } } - std::string save_model_path = path + "/" + "_optimized.pdmodel"; + std::string save_model_path = path + ".pdmodel"; auto str = optimized_program_desc.Proto()->SerializeAsString(); std::ofstream file(save_model_path.c_str(), std::ios::binary); file.write(str.c_str(), str.size()); // NOLINT diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 17841b8be5bad..7c1dad8a0d2b3 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -544,6 +544,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(ir_debug_); CP_MEMBER(specify_input_name_); + CP_MEMBER(use_optimized_model_); + CP_MEMBER(cpu_math_library_num_threads_); CP_MEMBER(serialized_info_cache_); @@ -1152,6 +1154,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << enable_ir_optim_; ss << ir_debug_; + ss << use_optimized_model_; + ss << specify_input_name_; ss << cpu_math_library_num_threads_; @@ -1471,6 +1475,8 @@ std::string AnalysisConfig::Summary() { {"save_optimized_model", save_optimized_model_ ? "true" : "false"}); os.InsertRow({"ir_optim", enable_ir_optim_ ? "true" : "false"}); os.InsertRow({"ir_debug", ir_debug_ ? "true" : "false"}); + os.InsertRow( + {"use_optimized_model", use_optimized_model_ ? "true" : "false"}); os.InsertRow({"memory_optim", enable_memory_optim_ ? "true" : "false"}); os.InsertRow({"enable_profile", with_profile_ ? "true" : "false"}); os.InsertRow({"enable_log", with_glog_info_ ? 
"true" : "false"}); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index dd922cd45dd34..4ff85d08ff1d1 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -415,6 +415,24 @@ bool AnalysisPredictor::Init( // no matter with or without MKLDNN paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); + // Use Optimized model to inference + if (config_.use_optimized_model_) { + std::string optimized_model_path = GetOptimizedModelPath(); + std::string optimized_model = optimized_model_path + ".pdmodel"; + std::string optimized_params = optimized_model_path + ".pdiparams"; + if (FileExists(optimized_model) && FileExists(optimized_params)) { + config_.SetModel(optimized_model, optimized_params); + LOG(INFO) << "Load Optimized model from " << optimized_model_path; + } else { + LOG(WARNING) + << "The optimized model is not found, fallback to original model. " + "EnableSaveOptimModel will be turned on and the optimized model " + "can be available next time."; + config_.EnableSaveOptimModel(true); + config_.UseOptimizedModel(false); + } + } + if (!PrepareScope(parent_scope)) { return false; } @@ -554,6 +572,55 @@ void AnalysisPredictor::InitPlace() { } } +std::string AnalysisPredictor::GetOptimizedModelPath() { + std::string model_opt_cache_dir = config_.opt_cache_dir_; + if (!model_opt_cache_dir.empty()) { + if (!PathExists(model_opt_cache_dir)) { + PADDLE_ENFORCE_NE( + MKDIR(model_opt_cache_dir.c_str()), + -1, + platform::errors::PreconditionNotMet( + "Can not create optimize cache directory: %s, Make sure you " + "have permission to write", + model_opt_cache_dir)); + } + } else { + model_opt_cache_dir = + !config_.model_dir().empty() + ? config_.model_dir() + : inference::analysis::GetDirRoot(config_.prog_file()); + } + return model_opt_cache_dir + "/" + "_optimized"; +} + +void AnalysisPredictor::ClearExtraParams() { + auto var_names = scope_->LocalVarNames(); + std::vector trt_repetitive_params; + for (auto &op_desc : inference_program_->Block(0).AllOps()) { + if (op_desc->Type() == "tensorrt_engine") { + auto trt_params = PADDLE_GET_CONST(std::vector, + op_desc->GetAttr("parameters")); + trt_repetitive_params.insert( + trt_repetitive_params.end(), trt_params.begin(), trt_params.end()); + } + } + + std::vector extra_params; + for (auto &var_desc : inference_program_->Block(0).AllVars()) { + if (var_desc->Persistable()) { + // Clear repetitive parameters in tensorrt + if (scope_->FindVar(var_desc->Name()) && + std::count(trt_repetitive_params.begin(), + trt_repetitive_params.end(), + var_desc->Name())) { + extra_params.emplace_back(var_desc->Name()); + } + } + } + scope_->EraseVars(extra_params); + VLOG(1) << "Clear " << extra_params.size() << " extra params."; +} + void AnalysisPredictor::InitResourceManager(void *stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) predictor_stream_ = @@ -701,7 +768,17 @@ bool AnalysisPredictor::PrepareProgram( // not be executed. model_precision_ = paddle::inference::GetModelPrecision(*inference_program_); - OptimizeInferenceProgram(); + if (config_.use_optimized_model_) { + LoadParameters(); + ClearExtraParams(); +#ifdef PADDLE_WITH_CUDA + if (config_.use_gpu()) { + paddle::platform::EmptyCache(); + } +#endif + } else { + OptimizeInferenceProgram(); + } } else { // If the program is passed from external, no need to optimize it, this // logic is used in the clone scenario. 
@@ -1600,6 +1677,7 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetModelProgramPath(config_.prog_file());
     argument_->SetModelParamsPath(config_.params_file());
   }
+  argument_->SetOptimizedModelSavePath(GetOptimizedModelPath());
 
   // For JITLayer
   argument_->SetSkipLoadParams(config_.skip_load_params_);
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 24e8cc1cbe850..1c107e936d69a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -494,6 +494,8 @@ class AnalysisPredictor : public PaddlePredictor {
   void InitPlace();
   void InitDeviceContexts();
   void InitResourceManager(void *stream);
+  std::string GetOptimizedModelPath();
+  void ClearExtraParams();
 
 #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
   // fleet exe related
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index e69710e93c8f5..5f187e3cb7a22 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -606,7 +606,6 @@ struct PD_INFER_DECL AnalysisConfig {
   /// \return bool Whether to use ir graph optimization.
   ///
   bool ir_optim() const { return enable_ir_optim_; }
-
   ///
   /// \brief INTERNAL Determine whether to use the feed and fetch operators.
   /// Just for internal development, not stable yet.
@@ -881,6 +880,13 @@ struct PD_INFER_DECL AnalysisConfig {
 
   bool new_executor_enabled() const { return use_new_executor_; }
 
+  ///
+  /// \brief Control whether to run inference with the optimized model.
+  ///
+  /// \param x whether to use the optimized model.
+  ///
+  void UseOptimizedModel(bool x = true) { use_optimized_model_ = x; }
+
   void EnableDlnne(
       int min_subgraph_size = 3,
       int max_batch_size = 1,
@@ -1316,6 +1322,8 @@ struct PD_INFER_DECL AnalysisConfig {
   bool enable_ir_optim_{true};
   bool ir_debug_{false};
 
+  bool use_optimized_model_{false};
+
   bool use_new_executor_{false};
 
   bool specify_input_name_{false};
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index ece8506159921..2318c88741f28 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -113,13 +113,14 @@ class PD_INFER_DECL PaddlePassBuilder {
 
  protected:
   /// \cond Protected
-  std::vector<std::string> analysis_passes_{
-      {"ir_graph_build_pass",
-       "ir_analysis_pass",
-       "save_optimized_model_pass",
-       "ir_params_sync_among_devices_pass",
-       "adjust_cudnn_workspace_size_pass",
-       "inference_op_replace_pass"}};
+  std::vector<std::string> analysis_passes_{{
+      "ir_graph_build_pass",
+      "ir_analysis_pass",
+      "ir_params_sync_among_devices_pass",
+      "adjust_cudnn_workspace_size_pass",
+      "inference_op_replace_pass",
+      "save_optimized_model_pass",
+  }};
   std::vector<std::string> passes_;
   std::unordered_set<std::string> deleted_passes_;
   /// \endcond
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 524cb46f21a60..2072bb3802cdd 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -861,6 +861,9 @@ void BindAnalysisConfig(py::module *m) {
            &AnalysisConfig::SwitchIrOptim,
            py::arg("x") = true)
       .def("ir_optim", &AnalysisConfig::ir_optim)
+      .def("use_optimized_model",
+           &AnalysisConfig::UseOptimizedModel,
+           py::arg("x") = true)
       .def("enable_memory_optim",
            &AnalysisConfig::EnableMemoryOptim,
            py::arg("x") = true)
diff --git 
a/test/ir/inference/test_use_optimized_model_api.py b/test/ir/inference/test_use_optimized_model_api.py
new file mode 100644
index 0000000000000..cdfcb705e8a9c
--- /dev/null
+++ b/test/ir/inference/test_use_optimized_model_api.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from inference_pass_test import InferencePassTest
+
+import paddle
+from paddle.inference import Config, create_predictor
+
+# -------------------------- TestNet --------------------------
+#          x
+#        /   \
+#   conv2d    \                       x
+#      |       \       IR/Pass      /   \
+# batch_norm  conv2d  ——————>  tensorrt_engine  conv2d
+#      |       /                      \          /
+#    relu     /                     elementwise_add
+#       \    /                            |
+#   elementwise_add                       y
+#         |
+#         y
+# -------------------------------------------------------------


+class TestNet(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+        self.conv1 = paddle.nn.Conv2D(3, 6, kernel_size=3, bias_attr=False)
+        self.bn1 = paddle.nn.BatchNorm2D(6)
+        self.relu = paddle.nn.ReLU()
+        self.conv2 = paddle.nn.Conv2D(3, 6, kernel_size=3, bias_attr=False)
+
+    def forward(self, x):
+        x1 = self.conv1(x)
+        x1 = self.bn1(x1)
+        x1 = self.relu(x1)
+        x2 = self.conv2(x)
+        y = paddle.add(x1, x2)
+        return y
+
+
+class UseOptimizedModel(InferencePassTest):
+    def setUp(self):
+        paddle.disable_static()
+        self.test_model = TestNet()
+        self.input_data = (np.ones([1, 3, 32, 32])).astype('float32')
+        self.path_prefix = "inference_test_models/use_optimized_model_test"
+        self.cache_dir = "inference_test_models/cache"
+        paddle.jit.save(
+            self.test_model,
+            self.path_prefix,
+            input_spec=[
+                paddle.static.InputSpec(shape=[1, 3, 32, 32], dtype='float32')
+            ],
+        )
+
+    def test_check_output(self):
+        out_origin_model = self.inference()
+        out_optimized_model = self.inference()
+        np.testing.assert_allclose(
+            out_origin_model, out_optimized_model, rtol=1e-5, atol=1e-2
+        )
+
+    def inference(self):
+        # Config
+        config = Config(
+            self.path_prefix + ".pdmodel", self.path_prefix + ".pdiparams"
+        )
+        # if core.is_compiled_with_cuda():
+        config.enable_use_gpu(100, 0)
+        config.enable_tensorrt_engine(
+            workspace_size=1 << 30,
+            max_batch_size=1,
+            min_subgraph_size=1,
+            precision_mode=paddle.inference.PrecisionType.Float32,
+            use_static=True,
+            use_calib_mode=False,
+        )
+        config.enable_tuned_tensorrt_dynamic_shape()
+        config.exp_disable_tensorrt_ops(["elementwise_add"])
+        config.set_optim_cache_dir(self.cache_dir)
+        config.use_optimized_model(True)
+
+        # predictor
+        predictor = create_predictor(config)
+
+        # inference
+        input_tensor = predictor.get_input_handle(
+            predictor.get_input_names()[0]
+        )
+        input_tensor.reshape(self.input_data.shape)
+        input_tensor.copy_from_cpu(self.input_data.copy())
+        predictor.run()
+        output_tensor = predictor.get_output_handle(
+            predictor.get_output_names()[0]
+        )
+        out = output_tensor.copy_to_cpu()
+        out = np.array(out).flatten()
+        return out
+
+
+if __name__ == "__main__":
+    unittest.main()
From 4300e32c27ebbf6cb18964ce3ddc29fc4ffa9626 Mon Sep 17
00:00:00 2001
From: lanxianghit <47554610+lanxianghit@users.noreply.github.com>
Date: Mon, 26 Feb 2024 15:07:32 +0800
Subject: [PATCH 81/82] [PIR][DynamicShape] Add InferSymbolicShape for top_p_sampling & feed op (#62011)

* Add InferSymbolicShape for top_p_sampling, feed, select_input, where
---
 .../paddle_op_infer_sym.cc                    | 42 +++++++++++++++++--
 .../dialect/operator/ir/control_flow_op.cc    | 42 +++++++++++++++++++
 .../pir/dialect/operator/ir/control_flow_op.h |  4 +-
 3 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc
index 0b1dff55f4c41..86580325ba12a 100644
--- a/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/paddle_op_infer_sym.h"
+#include "paddle/common/ddim.h"
 #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_sym_utils.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
 
@@ -1005,8 +1006,9 @@ bool MaxOpInferSymbolicShape(pir::Operation *op,
 
 bool WhereOpInferSymbolicShape(pir::Operation *op,
                                pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
+  shape_analysis->SetShapeOrDataForValue(
+      op->result(0),
+      shape_analysis->GetShapeOrDataForValue(op->operand_source(0)));
   return true;
 }
 
@@ -1017,7 +1019,21 @@ bool Where_OpInferSymbolicShape(
 
 bool FeedOpInferSymbolicShape(pir::Operation *op,
                               pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  // This Op has NO InferMeta in yaml, just return true
+  const common::DDim &result_dims =
+      op->result(0).type().dyn_cast<pir::DenseTensorType>().dims();
+  std::vector<symbol::DimExpr> out_dims;
+  for (int i = 0; i < result_dims.size(); i++) {
+    if (result_dims[i] == -1) {
+      out_dims.emplace_back(shape_analysis->GetNextSymName());
+    } else {
+      out_dims.emplace_back(result_dims[i]);
+    }
+  }
+
+  shape_analysis->SetShapeOrDataForValue(
+      op->result(0),
+      symbol::ShapeOrDataDimExprs{symbol::TensorShapeOrDataDimExprs(out_dims)});
+
   return true;
 }
 
@@ -1025,6 +1041,26 @@ bool TopPSamplingOpInferSymbolicShape(
     pir::Operation *op, pir::ShapeConstraintIRAnalysis *shape_analysis) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      op->name() + " 's InferSymbolicShape interface is NOT implemented now."));
+
+  const auto &x_dims = [op, shape_analysis] {
+    const auto &shape_or_data =
+        shape_analysis->GetShapeOrDataForValue(op->operand_source(0));
+    if (shape_or_data.data().has_value()) {
+      return shape_or_data.data().value();
+    } else {
+      return shape_or_data.shape();
+    }
+  }();
+
+  // all the results have the same shape
+  for (uint32_t rst_idx = 0; rst_idx < op->num_results(); rst_idx++) {
+    const std::vector<symbol::DimExpr> out_dims{x_dims[0], 1};
+    shape_analysis->SetShapeOrDataForValue(
+        op->result(rst_idx),
+        symbol::ShapeOrDataDimExprs{
+            symbol::TensorShapeOrDataDimExprs(out_dims)});
+  }
+
   return true;
 }
 
diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc
index 7f3929d0b9967..92ec95b6b65f6 100644
--- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc
+++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.cc
@@ -989,6 +989,48 @@ void
SelectInputOp::VerifySig() {
   VLOG(4) << "End Verifying for: AssignArray_Op.";
 }
 
+bool SelectInputOp::InferSymbolicShape(
+    pir::ShapeConstraintIRAnalysis *shape_analysis) {
+  auto GetSymExprForValue =
+      [shape_analysis](pir::Value val) -> const std::vector<symbol::DimExpr> & {
+    const auto &shape_or_data = shape_analysis->GetShapeOrDataForValue(val);
+    if (shape_or_data.data().has_value()) {
+      return shape_or_data.data().value();
+    } else {
+      return shape_or_data.shape();
+    }
+  };
+
+  const auto &input1_dims = GetSymExprForValue(operand_source(0));
+  const auto &input2_dims = GetSymExprForValue(operand_source(1));
+
+  std::vector<symbol::DimExpr> out_dims = input1_dims;
+  // Merge the shapes of input1 and input2: since we don't know which one
+  // will be selected at compile time, the strategy is the same as IfOp's;
+  // see IfOp's comments for details and examples.
+  if (input2_dims.size() != 0) {
+    // For now, only input1 and input2 with the same rank are supported.
+    PADDLE_ENFORCE_EQ(input1_dims.size(),
+                      input2_dims.size(),
+                      phi::errors::PreconditionNotMet(
+                          "The true and false blocks should have the same "
+                          "rank, but got true_rank(%d) and false_rank(%d)",
+                          input1_dims.size(),
+                          input2_dims.size()));
+    for (size_t i = 0; i < input1_dims.size(); i++) {
+      if (input1_dims[i] != input2_dims[i]) {
+        out_dims[i] = symbol::DimExpr{shape_analysis->GetNextSymName()};
+      }
+    }
+  }
+
+  shape_analysis->SetShapeOrDataForValue(
+      result(0),
+      symbol::ShapeOrDataDimExprs{symbol::TensorShapeOrDataDimExprs(out_dims)});
+
+  return true;
+}
+
 void SelectOutputOp::VerifySig() {
   VLOG(4) << "Verifying inputs, outputs and attributes for: SelectOutputOp.";
   VLOG(4) << "Verifying inputs:";
diff --git a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h
index f8a6bbb9f3b0f..8b5af449d4820 100644
--- a/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h
+++ b/paddle/fluid/pir/dialect/operator/ir/control_flow_op.h
@@ -193,7 +193,8 @@ class AssertOp : public pir::Op {
   pir::Value data() { return operand_source(1); }
 };
 
-class SelectInputOp : public pir::Op<SelectInputOp> {
+class SelectInputOp
+    : public pir::Op<SelectInputOp, paddle::dialect::InferSymbolicShapeInterface> {
  public:
   using Op::Op;
   static const char *name() { return "pd_op.select_input"; }
@@ -202,6 +203,7 @@ class SelectInputOp : public pir::Op<SelectInputOp> {
   void VerifySig();
   pir::Value mask() { return operand_source(0); }
   pir::Value out() { return result(0); }
+  bool InferSymbolicShape(pir::ShapeConstraintIRAnalysis *shape_analysis);
 };
 
 class SelectOutputOp : public pir::Op<SelectOutputOp> {

From ae2d4b96cde2160d4abf606b1701486f70df5868 Mon Sep 17 00:00:00 2001
From: Yuanle Liu
Date: Mon, 26 Feb 2024 15:08:29 +0800
Subject: [PATCH 82/82] [PIR] Fix conv2d_add_(act)_fuse_pass (#61979)

* fix conv2d_add_fuse_pass

* fix

* add comment
---
 .../fluid/inference/api/analysis_predictor.cc |  4 +-
 .../fluid/pir/dialect/op_generator/op_gen.py  | 58 ++++---
 .../pir/dialect/op_generator/ops_api_gen.py   |  4 +-
 .../fusion/conv2d_add_act_fuse_pass.cc        |  4 +
 .../transforms/fusion/conv2d_add_fuse_pass.cc |  7 +
 .../fusion/fused_weight_only_linear_pass.cc   |  3 +-
 .../transforms/transform_general_functions.cc | 20 ++-
 .../transforms/transform_general_functions.h  | 10 ++
 .../test_conv2d_add_act_fuse_pass.py          | 28 ++--
 .../fused_pass/test_conv2d_add_fuse_pass.py   | 23 ++-
 .../fused_pass/test_conv2d_bias_fuse_pass.py  | 163 ------------------
 11 files changed, 109 insertions(+), 215 deletions(-)
 delete mode 100644 test/ir/pir/fused_pass/test_conv2d_bias_fuse_pass.py

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index
4ff85d08ff1d1..299e69d628745 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -904,13 +904,13 @@ bool AnalysisPredictor::PrepareExecutor() { params_sync_among_devices_pass->SetNotOwned(pir::kPlaceAttr, &place_); params_sync_among_devices_pass->SetNotOwned(pir::kParamScopeAttr, sub_scope_); + gpu_pm.AddPass(std::move(params_sync_among_devices_pass)); auto constant_folding_pass = ::pir::CreateConstantFoldingPass(); constant_folding_pass->SetNotOwned(pir::kPlaceAttr, &place_); constant_folding_pass->SetNotOwned(pir::kParamScopeAttr, sub_scope_); - - gpu_pm.AddPass(std::move(params_sync_among_devices_pass)); gpu_pm.AddPass(std::move(constant_folding_pass)); + gpu_pm.AddPass(::pir::CreateDeadCodeEliminationPass()); gpu_pm.AddPass(::pir::CreateReplaceFetchWithShadowOutputPass()); //----------------------------------------------------------------------------------------------// diff --git a/paddle/fluid/pir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py index 55da686d2a3b1..40dc916d4f4ad 100644 --- a/paddle/fluid/pir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -86,25 +86,27 @@ #pragma once #include -#include "paddle/pir/include/core/builder.h" -#include "paddle/pir/include/core/operation_utils.h" +#include "paddle/fluid/pir/dialect/operator/interface/decomp.h" #include "paddle/fluid/pir/dialect/operator/interface/infer_symbolic_shape/infer_symbolic_shape.h" -#include "paddle/pir/include/core/op_base.h" -#include "paddle/pir/include/core/op_trait.h" -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" -#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" -#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" -#include "paddle/fluid/pir/dialect/operator/interface/vjp.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" #include "paddle/fluid/pir/dialect/operator/interface/parse_kernel_key.h" -#include "paddle/fluid/pir/dialect/operator/interface/decomp.h" +#include "paddle/fluid/pir/dialect/operator/interface/vjp.h" #include "paddle/fluid/pir/dialect/operator/trait/inplace.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/pir/include/core/builder.h" +#include "paddle/pir/include/core/op_base.h" +#include "paddle/pir/include/core/op_trait.h" +#include "paddle/pir/include/core/operation_utils.h" +#ifdef PADDLE_WITH_DNNL #include "paddle/fluid/pir/dialect/operator/trait/onednn.h" -#include "paddle/fluid/pir/dialect/operator/trait/custom_vjp.h" +#endif #include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" +#include "paddle/fluid/pir/dialect/operator/trait/custom_vjp.h" +#include "paddle/phi/core/infermeta_utils.h" {only_pd_op_header_files} {other_info} @@ -176,27 +178,27 @@ class {TEST_API} {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{ # ===================================== CC_FILE_TEMPLATE = """// This file is generated by "paddle/fluid/pir/dialect/op_generator/op_gen.py" #include "{h_file}" -#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include 
"paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" -#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" -#include "paddle/fluid/pir/dialect/operator/ir/ir_tensor.h" -#include "paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.h" #include "paddle/fluid/pir/dialect/operator/ir/ir_meta_tensor.h" -#include "paddle/pir/include/core/builtin_attribute.h" -#include "paddle/pir/include/core/builtin_type.h" -#include "paddle/pir/include/core/builtin_op.h" -#include "paddle/pir/include/core/ir_context.h" -#include "paddle/phi/core/enforce.h" +#include "paddle/fluid/pir/dialect/operator/ir/ir_selected_rows.h" +#include "paddle/fluid/pir/dialect/operator/ir/ir_tensor.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/primitive/rule/vjp/vjp.h" +#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/fusion.h" #include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/infermeta/nullary.h" -#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/infermeta/ternary.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/fusion.h" -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/fluid/primitive/rule/vjp/vjp.h" +#include "paddle/phi/infermeta/unary.h" +#include "paddle/pir/include/core/builtin_attribute.h" +#include "paddle/pir/include/core/builtin_op.h" +#include "paddle/pir/include/core/builtin_type.h" +#include "paddle/pir/include/core/ir_context.h" #include "paddle/pir/include/core/op_base.h" using namespace paddle::dialect; @@ -237,12 +239,12 @@ class {TEST_API} {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{ VJP_CC_FILE_TEMPLATE = """// This file is generated by "paddle/fluid/pir/dialect/op_generator/op_gen.py" #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/fluid/primitive/rule/vjp/vjp.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" +#include "paddle/phi/common/int_array.h" #include "paddle/pir/include/core/builtin_op.h" #include "paddle/pir/include/core/op_base.h" -#include "paddle/phi/common/int_array.h" -#include "paddle/fluid/pir/dialect/operator/utils/utils.h" namespace paddle {{ namespace dialect {{ diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index 8328e406ae0e6..5dbbf62642e97 100644 --- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -20,11 +20,11 @@ CPP_FILE_TEMPLATE = """ #include -#include "paddle/fluid/pybind/static_op_function.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/pybind/eager_op_function.h" #include "paddle/fluid/pybind/manual_static_op_function.h" +#include "paddle/fluid/pybind/static_op_function.h" #include "paddle/phi/core/enforce.h" -#include "paddle/fluid/eager/api/utils/global_utils.h" {body} diff --git a/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc b/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc index c7710721350eb..9e950dc2d11b9 100644 --- a/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc +++ 
b/paddle/fluid/pir/transforms/fusion/conv2d_add_act_fuse_pass.cc @@ -45,6 +45,8 @@ class Conv2dAddActFusePattern pir::Value add_input = op.x(); IR_ENFORCE(add_input == conv2d_out); + if (!pir::ValueIsPersitable(op.y())) return false; + pir::Value add_out = op.out(); if (!add_out.HasOneUse()) return false; @@ -117,6 +119,8 @@ class Conv2dAdd2ActFusePattern ->dyn_cast(); if (!add1_op) return false; + if (!pir::ValueIsPersitable(add1_op.y())) return false; + pir::Value add1_out = add1_op.out(); if (!add1_out.HasOneUse()) return false; diff --git a/paddle/fluid/pir/transforms/fusion/conv2d_add_fuse_pass.cc b/paddle/fluid/pir/transforms/fusion/conv2d_add_fuse_pass.cc index c5eb8134d05c4..9c1cec5b9b645 100644 --- a/paddle/fluid/pir/transforms/fusion/conv2d_add_fuse_pass.cc +++ b/paddle/fluid/pir/transforms/fusion/conv2d_add_fuse_pass.cc @@ -19,6 +19,9 @@ #include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/drr/include/drr_pattern_base.h" +#include "paddle/fluid/pir/transforms/transform_general_functions.h" +#include "paddle/pir/include/core/builtin_op.h" +#include "paddle/pir/include/core/value.h" #include "paddle/pir/include/pass/pass.h" #include "paddle/pir/include/pass/pass_registry.h" @@ -44,6 +47,10 @@ class Conv2dAddFusePattern : public paddle::drr::DrrPatternBase { pat.Tensor("add_out") = add(pat.Tensor("conv2d_out"), pat.Tensor("bias")); pat.RequireNativeCall( [](const paddle::drr::MatchContext &match_ctx) -> bool { + if (!pir::ValueIsPersitable(match_ctx.Tensor("bias"))) { + return false; + } + auto padding_algorithm = match_ctx.Attr("padding_algorithm"); if (padding_algorithm != "EXPLICIT" && padding_algorithm != "SAME" && diff --git a/paddle/fluid/pir/transforms/fusion/fused_weight_only_linear_pass.cc b/paddle/fluid/pir/transforms/fusion/fused_weight_only_linear_pass.cc index 6922691684d30..bf4ea92af67b2 100644 --- a/paddle/fluid/pir/transforms/fusion/fused_weight_only_linear_pass.cc +++ b/paddle/fluid/pir/transforms/fusion/fused_weight_only_linear_pass.cc @@ -50,8 +50,7 @@ class FusedWeightOnlyLinearPattern : public paddle::drr::DrrPatternBase { src.Op(paddle::dialect::MatmulOp::name(), {{"transpose_x", src.Attr("matmul_transpose_x")}, {"transpose_y", src.Attr("matmul_transpose_y")}}); - const auto ¶meter = src.Op( - pir::ParameterOp::name(), {{"parameter_name", src.Attr("param_name")}}); + const auto ¶meter = src.Op(pir::ParameterOp::name()); src.Tensor("w") = parameter(); src.Tensor("matmul_out") = matmul(src.Tensor("x"), src.Tensor("w")); const auto &add = src.Op(paddle::dialect::AddOp::name()); diff --git a/paddle/fluid/pir/transforms/transform_general_functions.cc b/paddle/fluid/pir/transforms/transform_general_functions.cc index 92fc8ba68ddb7..55a1dc463dc6d 100644 --- a/paddle/fluid/pir/transforms/transform_general_functions.cc +++ b/paddle/fluid/pir/transforms/transform_general_functions.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/pir/include/core/builtin_op.h" #include "paddle/pir/include/core/op_operand.h" #include "paddle/pir/include/core/parameter.h" @@ -116,7 +117,7 @@ std::vector> GetUseOpsForOutput( auto result = op->result(index); std::vector> use_ops; for (auto it = result.use_begin(); it != result.use_end(); ++it) { - use_ops.push_back(std::make_pair(it->owner(), it->index())); + use_ops.emplace_back(it->owner(), it->index()); } return use_ops; } @@ -138,4 +139,21 @@ 
std::vector<pir::Value> GetUsedExternalValue(const pir::Block& block) {
   return used_values;
 }
 
+bool ValueIsPersitable(pir::Value value) {
+  if (value.defining_op()->num_operands() > 0) {
+    for (const auto& source_value : value.defining_op()->operands_source()) {
+      if (!ValueIsPersitable(source_value)) {
+        return false;
+      }
+    }
+  } else {
+    if (!value.defining_op()->isa() &&
+        !value.defining_op()->isa() &&
+        !value.defining_op()->isa()) {
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace pir
diff --git a/paddle/fluid/pir/transforms/transform_general_functions.h b/paddle/fluid/pir/transforms/transform_general_functions.h
index 8b9ffdd8cf477..d34c6d6863802 100644
--- a/paddle/fluid/pir/transforms/transform_general_functions.h
+++ b/paddle/fluid/pir/transforms/transform_general_functions.h
@@ -95,4 +95,14 @@ std::vector<Value> GetUsedExternalValue(const Operation& op);
  */
 std::vector<Value> GetUsedExternalValue(const Block& block);
 
+/**
+ * @brief Determine whether a value comes from a weight or has no input op.
+ * That is to say, it is persistable.
+ *
+ * @param value the pir::Value to check.
+ *
+ * @return bool
+ */
+bool ValueIsPersitable(pir::Value value);
+
 }  // namespace pir
diff --git a/test/ir/pir/fused_pass/test_conv2d_add_act_fuse_pass.py b/test/ir/pir/fused_pass/test_conv2d_add_act_fuse_pass.py
index ca397cbebce82..aaaf7cb175497 100644
--- a/test/ir/pir/fused_pass/test_conv2d_add_act_fuse_pass.py
+++ b/test/ir/pir/fused_pass/test_conv2d_add_act_fuse_pass.py
@@ -19,21 +19,22 @@
 
 import paddle
 from paddle.base import core
+from paddle.pir.core import create_parameter paddle.enable_static() class TestConv2dAddFusePass(PassTest): r""" - x_var f_var + x_var filter(w) \ / - conv2d - | - add + conv2d bias(w) + | / + add + | + out_var """ def is_program_valid(self, program=None): @@ -43,8 +46,13 @@ def build_ir_program(self): x = paddle.static.data( name='x', shape=[3, 1, 28, 28], dtype='float32' ) - y = paddle.static.data( - name="y", shape=[3, 32, 28, 28], dtype="float32" + bias = create_parameter( + name="bias", + shape=[3, 32, 28, 28], + dtype='float32', + initializer=paddle.nn.initializer.Assign( + np.random.random((3, 32, 28, 28)).astype("float32") + ), ) conv2d = paddle.nn.Conv2D( in_channels=1, @@ -54,12 +62,11 @@ def build_ir_program(self): data_format='NCHW', bias_attr=False, ) - out = paddle.add(conv2d(x), y) + out = paddle.add(conv2d(x), bias) out = paddle.assign(out) self.pass_list = ['conv2d_add_fuse_pass'] self.feeds = { "x": np.random.random((3, 1, 28, 28)).astype("float32"), - "y": np.random.random((3, 32, 28, 28)).astype("float32"), } self.fetch_list = [out] self.valid_op_map = { diff --git a/test/ir/pir/fused_pass/test_conv2d_bias_fuse_pass.py b/test/ir/pir/fused_pass/test_conv2d_bias_fuse_pass.py deleted file mode 100644 index 1751f58818f3f..0000000000000 --- a/test/ir/pir/fused_pass/test_conv2d_bias_fuse_pass.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -from pass_test import PassTest - -import paddle - -paddle.enable_static() - - -@unittest.skipIf( - not paddle.base.core.is_compiled_with_mkldnn(), - "Test case only for OneDNN pass.", -) -class TestConv2dAddFusePass(PassTest): - def is_program_valid(self, program=None): - return True - - def build_ir_program(self): - with paddle.pir_utils.IrGuard(): - main_prog = paddle.static.Program() - start_prog = paddle.static.Program() - with paddle.pir.core.program_guard(main_prog, start_prog): - x = paddle.static.data( - name='x', shape=[5, 5, 5, 5], dtype='float32' - ) - bias_attr = paddle.ParamAttr( - learning_rate=0.0, - initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0), - ) - bias = paddle.static.create_parameter( - shape=[1], dtype='float32', attr=bias_attr, is_bias=False - ) - w_attr = paddle.ParamAttr( - learning_rate=0.0, - initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0), - ) - conv2d = paddle.nn.Conv2D( - in_channels=5, - out_channels=1, - kernel_size=[1, 1], - groups=1, - stride=[1, 1], - padding=[1, 1, 1, 1], - dilation=[1, 1], - data_format='NCHW', - bias_attr=False, - weight_attr=w_attr, - ) - - out = paddle.add(conv2d(x), bias) - out = paddle.assign(out) - self.pass_list = ['conv2d_bias_fuse_pass'] - self.feeds = { - "x": np.random.random((5, 5, 5, 5)).astype("float32"), - "bias": np.random.random(1).astype("float32"), - } - self.fetch_list = [out] - self.valid_op_map = { - "onednn_op.fused_conv2d": 1, - "pd_op.conv2d": 0, - "pd_op.add": 0, - } - return [main_prog, start_prog] - - def sample_program(self): - yield self.build_ir_program(), False - - def setUp(self): - self.places.append(paddle.CPUPlace()) - - def test_check_output(self): - self.check_pass_correct() - - -@unittest.skipIf( - not paddle.base.core.is_compiled_with_mkldnn(), - "Test case only for OneDNN pass.", -) -class TestConv2dAddFusePassWithAddParam(PassTest): - def is_program_valid(self, program=None): - return True - - def build_ir_program(self): - with paddle.pir_utils.IrGuard(): - main_prog = paddle.static.Program() - start_prog = paddle.static.Program() - with paddle.pir.core.program_guard(main_prog, start_prog): - x = paddle.static.data( - name='x', shape=[5, 5, 5, 5], dtype='float32' - ) - bias_attr = paddle.ParamAttr( - learning_rate=0.0, - initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0), - ) - bias = paddle.static.create_parameter( - shape=[1], dtype='float32', attr=bias_attr, is_bias=False - ) - w_attr = paddle.ParamAttr( - learning_rate=0.0, - initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0), - ) - conv2d = paddle.nn.Conv2D( - in_channels=5, - out_channels=1, - kernel_size=[1, 1], - groups=1, - stride=[1, 1], - padding=[1, 1, 1, 1], - dilation=[1, 1], - data_format='NCHW', - bias_attr=False, - weight_attr=w_attr, - ) - add_out = paddle.add(conv2d(x), bias) - other_param_attr = paddle.ParamAttr( - learning_rate=0.0, - initializer=paddle.nn.initializer.Normal(mean=0.0, std=2.0), - ) - other_param = paddle.static.create_parameter( - shape=[1], dtype='float32', attr=bias_attr, is_bias=False - ) - out = paddle.add(add_out, other_param) - out = paddle.assign(out) - self.pass_list = ['conv2d_bias_fuse_pass'] - self.feeds = { - "x": np.random.random((5, 5, 5, 5)).astype("float32"), - "bias": np.random.random(1).astype("float32"), - } - self.fetch_list = [out] - self.valid_op_map = { - "onednn_op.fused_conv2d": 1, - "pd_op.conv2d": 0, - "pd_op.add": 1, - } - return [main_prog, start_prog] - - def sample_program(self): - yield 
self.build_ir_program(), False - - def setUp(self): - self.places.append(paddle.CPUPlace()) - - def test_check_output(self): - self.check_pass_correct() - - -if __name__ == "__main__": - unittest.main()
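One practical consequence of this last patch: both conv2d fuse passes now refuse to fire unless the add operand traces back to a parameter (the new pir::ValueIsPersitable check), which is why the updated tests build the bias with create_parameter instead of paddle.static.data. A condensed sketch of a program the passes still fuse, adapted from the updated tests above:

import numpy as np
import paddle
from paddle.pir.core import create_parameter

paddle.enable_static()
with paddle.pir_utils.IrGuard():
    main_prog = paddle.static.Program()
    start_prog = paddle.static.Program()
    with paddle.pir.core.program_guard(main_prog, start_prog):
        x = paddle.static.data(name='x', shape=[3, 1, 28, 28], dtype='float32')
        conv2d = paddle.nn.Conv2D(
            in_channels=1, out_channels=32, kernel_size=[3, 3],
            padding=1, data_format='NCHW', bias_attr=False)
        # Persistable operand: conv2d_add_fuse_pass can fold it into the conv.
        bias = create_parameter(
            name="bias", shape=[3, 32, 28, 28], dtype='float32',
            initializer=paddle.nn.initializer.Assign(
                np.random.random((3, 32, 28, 28)).astype('float32')))
        out = paddle.add(conv2d(x), bias)
        # A bias coming from paddle.static.data would fail the persistable
        # check, and the conv2d/add ops would stay unfused.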